diff options
Diffstat (limited to 'megapixels/notebooks/datasets/msceleb/identity.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/msceleb/identity.ipynb | 378 |
1 files changed, 0 insertions, 378 deletions
diff --git a/megapixels/notebooks/datasets/msceleb/identity.ipynb b/megapixels/notebooks/datasets/msceleb/identity.ipynb deleted file mode 100644 index d330badb..00000000 --- a/megapixels/notebooks/datasets/msceleb/identity.ipynb +++ /dev/null @@ -1,378 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Knowledge Graph MS Celeb" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import os.path as osp\n", - "from os.path import join\n", - "from glob import glob\n", - "import random\n", - "import math\n", - "import time\n", - "from datetime import datetime\n", - "\n", - "import requests\n", - "\n", - "import json\n", - "import urllib\n", - "from multiprocessing.pool import ThreadPool\n", - "import threading\n", - "from urllib.request import urlopen\n", - "import urllib.request\n", - "\n", - "import pandas as pd\n", - "from scipy.io import loadmat\n", - "import numpy as np\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from tqdm import tqdm_notebook as tqdm\n", - "%reload_ext autoreload\n", - "%autoreload 2\n", - "import sys\n", - "sys.path.append('/work/megapixels_dev/megapixels/')\n", - "from app.utils import file_utils, im_utils" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n", - "kg_ids_msceleb = [x.replace('m.', '/m/') for x in os.listdir(dir_msceleb)]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", - "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - 
# %% Knowledge Graph lookup helpers
def _kg_search_url(params):
    """Return the Knowledge Graph search URL for a dict of query parameters."""
    return f'{url_kg_api}?{urllib.parse.urlencode(params)}'


def _kg_parse_item(item, result):
    """Copy the fields of one `itemListElement` entry into `result` (in place)."""
    # `or {}` keeps the chained .get() calls safe when a field is missing or
    # null; the earlier list/str defaults raised AttributeError in that case.
    item_result = item.get('result') or {}
    det_desc = item_result.get('detailedDescription') or {}
    result_img = item_result.get('image') or {}

    result['description'] = item_result.get('description', '')
    result['description_extended'] = det_desc.get('articleBody', '')
    result['description_license'] = det_desc.get('license', '')
    result['description_url'] = det_desc.get('url', '')
    if result_img:
        # 'image_url' is only present on records that actually have an image
        result['image_url'] = result_img.get('contentUrl', '')
    result['name'] = item_result.get('name', '')
    result['score'] = item.get('resultScore', 0.0)
    result['url'] = item_result.get('url', '')


def get_kg_meta(obj, url, timeout=30.0):
    """Fetch `url` and convert the first returned entity into a flat record.

    :param obj: dict with at least a 'kg_id' key; it is not modified
    :param url: fully built Knowledge Graph search URL
    :param timeout: seconds to wait for the HTTP response
    :returns: a new dict that always has 'kg_id', 'score', 'description',
        'url' and 'accessed'. 'accessed' is True once a JSON response was
        received and decoded, even if it listed no entity. 'error' holds the
        exception text when the request or the parsing failed.
    """
    result = {'kg_id': obj['kg_id'], 'score': 0.0, 'description': '', 'url': '', 'accessed': False}  # default
    try:
        # the context manager closes the connection; the timeout keeps one
        # stalled request from blocking a worker thread indefinitely
        with urllib.request.urlopen(url, timeout=timeout) as http_response:
            json_response = http_response.read()
    except Exception as e:
        # network failures and HTTP error statuses raised by urlopen end up
        # here; 'accessed' stays False so the record is retried on a later pass
        result['error'] = str(e)
        return result

    try:
        response = json.loads(json_response)
        items = response.get('itemListElement', [])
        result['accessed'] = True
        if items:
            _kg_parse_item(items[0], result)
    except Exception as e:
        result['error'] = str(e)
    return result


def get_kg_from_name(obj):
    """Look up an entity by name -- not implemented.

    The notebook declared this function without a body. The only trace of the
    intended request is a draft parameter set
    ``{'query': q, 'limit': 5, 'indent': True, 'key': api_key}`` in which ``q``
    was never defined, so the lookup is left unimplemented rather than guessed.
    """
    raise NotImplementedError('name based Knowledge Graph lookup is not implemented')


def get_kg_from_kg_id(obj):
    """Look up one Knowledge Graph entity by its id.

    :param obj: dict with a 'kg_id' key (e.g. '/m/0dlnwb0') and optionally an
        'accessed' flag
    :returns: `obj` itself when it is already flagged as accessed, otherwise
        the new record produced by `get_kg_meta`
    """
    # .get(): freshly built records may not carry an 'accessed' key yet
    if obj.get('accessed'):
        return obj
    params = {
        'ids': obj['kg_id'],
        'limit': 1,
        'indent': True,
        # the key is read from a file elsewhere; drop any trailing newline
        'key': api_key.strip(),
    }
    return get_kg_meta(obj, _kg_search_url(params))
# %% Prepare the lookup jobs
# Every record starts with accessed=False so the lookup function can tell
# which records still need a request.
unmapped_persons = [{'kg_id': x, 'accessed': False} for x in kg_ids_msceleb]
opt_threads = 10
opt_max_passes = 3  # upper bound on retry passes over records that failed


# %% define thread mapping function
def pool_map_persons(obj):
    """Thread-pool worker: look up one record and advance the progress bar.

    Reads the module-level `pbar`, which the driver loop below binds before
    any worker runs.
    """
    kg_obj = get_kg_from_kg_id(obj)
    pbar.update(1)
    return kg_obj


# %% Run the lookups on a thread pool
# The first pass starts from the unmapped records. Later passes only re-request
# records still flagged as not accessed, because the lookup returns
# already-accessed records unchanged.
mapped_persons = unmapped_persons
for _ in range(opt_max_passes):
    if all(person.get('accessed', False) for person in mapped_persons):
        break
    # both context managers clean up after themselves: the pool is shut down
    # and the progress bar is closed when the block exits
    with ThreadPool(opt_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:
        mapped_persons = pool.map(pool_map_persons, mapped_persons)

# %%
len(mapped_persons)
# %% Peek at the last few mapped records
# A negative slice works for any number of records, unlike an absolute index
# tied to one particular run's length.
mapped_persons[-3:]

# %% reduce CC attribution string
# Some records carry the license as the full Wikipedia URL; store the short
# name instead so the column is uniform.
cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'
cc_short = 'CC BY-SA 3.0'
nchanged = 0
for mapped_person in mapped_persons:
    # compared directly; a local named `license` would shadow the builtin
    if mapped_person.get('description_license', None) == cc_long:
        mapped_person['description_license'] = cc_short
        nchanged += 1
print(nchanged)
# %% find number not accessed
# Prints the id of every record that was never fetched, then the total.
n_empty = 0
for mapped_person in mapped_persons:
    if mapped_person.get('accessed', False):
        continue
    n_empty += 1
    print(mapped_person['kg_id'])
print(n_empty)

# %% create dataframe
# One row per record; the index column is labelled 'index' in the CSV.
fp_mapped_persons = '/data_store_hdd/datasets/people/msceleb/metadata/identity_kg.csv'
df_mapped_persons = pd.DataFrame(mapped_persons).rename_axis('index')
df_mapped_persons.to_csv(fp_mapped_persons, encoding='utf-16')

# %%
df_mapped_persons.head()

# %% create small version
# Same export restricted to the first `limit` records.
limit = 1000
fp_mapped_persons_sm = f'/data_store_hdd/datasets/people/msceleb/metadata/identity_kg_0_{limit}.csv'
df_mapped_persons_sm = pd.DataFrame(mapped_persons[:limit]).rename_axis('index')
df_mapped_persons_sm.to_csv(fp_mapped_persons_sm, encoding='utf-16')
He is known by his powerful and accurate free kicks, dribbling skills and passes.', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Cleiton_Xavier', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcSPzkNDBjtWX3f_oov7vOTlTxBNFrfIqEaIwJR26AsLfsBbP8H9', 'name': 'Cleiton Xavier'}\n" - ] - } - ], - "source": [ - "#a = get_kg_from_kg_obj({'kg_id': '/m/03c2nqz', 'accessed': False})\n", - "#print(a)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:megapixels]", - "language": "python", - "name": "conda-env-megapixels-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} |
