diff options
Diffstat (limited to 'megapixels/notebooks/datasets/identity/pubfig83_identity.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/identity/pubfig83_identity.ipynb | 657 |
1 files changed, 657 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/identity/pubfig83_identity.ipynb b/megapixels/notebooks/datasets/identity/pubfig83_identity.ipynb
new file mode 100644
index 00000000..9809d6d6
--- /dev/null
+++ b/megapixels/notebooks/datasets/identity/pubfig83_identity.ipynb
@@ -0,0 +1,657 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PubFig83 Knowledge Graph Identities\n",
+    "\n",
+    "- convert directory names (e.g. `julia_stiles`) to query names (`julia stiles`)\n",
+    "- fetch Google Knowledge Graph entity IDs for each name\n",
+    "- save KG IDs to CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "import os.path as osp\n",
+    "from os.path import join\n",
+    "from glob import glob\n",
+    "import random\n",
+    "import math\n",
+    "from pathlib import Path\n",
+    "from datetime import datetime\n",
+    "import requests\n",
+    "import json\n",
+    "import time\n",
+    "from pprint import pprint\n",
+    "from multiprocessing.pool import ThreadPool\n",
+    "import threading\n",
+    "import urllib.parse\n",
+    "import urllib.request\n",
+    "\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pandas as pd\n",
+    "from scipy.io import loadmat\n",
+    "import numpy as np\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get List of Names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "83\n"
+     ]
+    }
+   ],
+   "source": [
+    "dir_lfw = '/data_store_hdd/datasets/people/pubfig83/media/original/'\n",
+    "# entries of the media directory are used as identity names, e.g. 'julia_stiles'\n",
+    "names_orig = os.listdir(dir_lfw)\n",
+    "# search queries: underscores replaced by spaces, e.g. 'julia stiles'\n",
+    "names_query = [x.replace('_', ' ') for x in names_orig]\n",
+    "print(len(names_query))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "julia_stiles\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(names_orig[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Google Knowledge Graph API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read API key; strip surrounding whitespace so a trailing newline in the\n",
+    "# key file is not url-encoded into the request\n",
+    "with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp:\n",
+    "    api_key = fp.read().strip()\n",
+    "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _get_kg_meta(result_obj, params):\n",
+    "    \"\"\"Query the Knowledge Graph Search API and copy the top hit into result_obj.\n",
+    "\n",
+    "    Reads the notebook-level `api_key` and `url_kg_api`.\n",
+    "\n",
+    "    :param result_obj: (dict) person record, updated in place; its 'kg_id'\n",
+    "        entry is only overwritten when it is empty\n",
+    "    :param params: (dict) API query parameters, e.g. {'query': name} or\n",
+    "        {'ids': kg_id}; 'indent', 'key' and 'limit' are added in place\n",
+    "    :returns: (dict) result_obj. 'accessed' is set to True once a response was\n",
+    "        decoded; 'error' holds the message when the request or parsing failed\n",
+    "    \"\"\"\n",
+    "    params['indent'] = True\n",
+    "    params['key'] = api_key\n",
+    "    params['limit'] = 1\n",
+    "\n",
+    "    url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+    "    try:\n",
+    "        json_response = urllib.request.urlopen(url).read()\n",
+    "    except Exception as e:\n",
+    "        # request failed: record the error on *this* record and leave\n",
+    "        # 'accessed' unchanged so the caller can retry it later\n",
+    "        result_obj['error'] = str(e)\n",
+    "        return result_obj\n",
+    "\n",
+    "    try:\n",
+    "        response = json.loads(json_response)\n",
+    "        items = response.get('itemListElement', [])\n",
+    "        result_obj['accessed'] = True\n",
+    "        if items:\n",
+    "            item = items[0]\n",
+    "            # default to empty dicts so the chained .get() calls below also\n",
+    "            # work when a section is missing from the response\n",
+    "            item_result = item.get('result', {})\n",
+    "            det_desc = item_result.get('detailedDescription', {})\n",
+    "            result_img = item_result.get('image', {})\n",
+    "\n",
+    "            result_obj['description'] = item_result.get('description', '')\n",
+    "            if not result_obj['kg_id']:\n",
+    "                result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
+    "            result_obj['description_extended'] = det_desc.get('articleBody','')\n",
+    "            result_obj['description_license'] = det_desc.get('license','')\n",
+    "            result_obj['description_url'] = det_desc.get('url','')\n",
+    "            # always set the key so every record has the same fields\n",
+    "            result_obj['image_url'] = result_img.get('contentUrl', '')\n",
+    "            result_obj['name'] = item_result.get('name', '')\n",
+    "            result_obj['score'] = item.get('resultScore', 0.0)\n",
+    "            result_obj['url'] = item_result.get('url', '')\n",
+    "    except Exception as e:\n",
+    "        result_obj['error'] = str(e)\n",
+    "    return result_obj\n",
+    "\n",
+    "def get_kg_from_name(obj):\n",
+    "    \"\"\"Look up obj['query'] by name; records already marked 'accessed' are returned untouched.\"\"\"\n",
+    "    if obj['accessed']:\n",
+    "        return obj\n",
+    "    params = {'query': obj['query']}\n",
+    "    return _get_kg_meta(obj, params)\n",
+    "\n",
+    "def get_kg_from_kg_id(obj):\n",
+    "    \"\"\"Look up obj['kg_id'] by entity ID; records already marked 'accessed' are returned untouched.\"\"\"\n",
+    "    if obj['accessed']:\n",
+    "        return obj\n",
+    "    params = {'ids': obj['kg_id']}\n",
+    "    return _get_kg_meta(obj, params)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'accessed': True,\n",
+      " 'description': 'Indian film director',\n",
+      " 'description_extended': 'Adoor Gopalakrishnan is an Indian film director, '\n",
+      " 'script writer, and producer. Adoor Gopalakrishnan '\n",
+      " 'had a major role in revolutioning Malayalam cinema '\n",
+      " 'during the 1970s and is regarded as one of the most '\n",
+      " 'notable filmmakers of India. 
',\n",
+      " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+      " 'description_url': 'https://en.wikipedia.org/wiki/Adoor_Gopalakrishnan',\n",
+      " 'image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQA-_aEYy_goHLhGJjmn558S1VEwcALB98m83I9HwUTV_gUsded',\n",
+      " 'kg_id': '/m/07s7wk',\n",
+      " 'name': 'Adoor Gopalakrishnan',\n",
+      " 'query': 'Adoor Gopalakrishnan',\n",
+      " 'score': 501.590881,\n",
+      " 'url': 'http://www.adoorgopalakrishnan.com'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# test get from name\n",
+    "q = 'Adoor Gopalakrishnan'\n",
+    "obj = {'query': q, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
+    "result = get_kg_from_name(obj)\n",
+    "pprint(obj)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# define thread mapping function\n",
+    "def pool_map_persons(obj):\n",
+    "    \"\"\"Thread-pool worker: fetch Knowledge Graph metadata for one person record.\n",
+    "\n",
+    "    Advances the notebook-level tqdm bar `pbar`, which must exist before\n",
+    "    the pool is started.\n",
+    "\n",
+    "    :param obj: (dict) person record with 'query' and 'accessed' keys\n",
+    "    :returns: (dict) the record returned by get_kg_from_name()\n",
+    "    \"\"\"\n",
+    "    global pbar\n",
+    "    kg_obj = get_kg_from_name(obj)\n",
+    "    # tick after the lookup so the bar shows finished work, not started work\n",
+    "    pbar.update(1)\n",
+    "    return kg_obj"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build mapped_person objects: one default (not yet accessed) record per query name\n",
+    "mapped_persons = [\n",
+    "    {'query': name, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n",
+    "    for name in names_query\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "83\n",
+      "['julia stiles', 'orlando bloom', 'adam sandler', 'victoria beckham', 'martha stewart', 'george clooney', 'steve carell', 'jennifer lopez', 'harrison ford', 'jessica alba']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(mapped_persons))\n",
+    "print(names_query[0:10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+
"model_id": "0af8e1f2d849473f933f506f5c8ced2b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12/83 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "09fa539f1d62416caf7fd217e7cf4892", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9 remaining. Sleeping...\n", + "9/83 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c22e1ce3e6e441839f12e88846612825", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "6 remaining. Sleeping...\n", + "6/83 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c7c5af3d562b475ea3420eca594cee85", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "5 remaining. 
Sleeping...\n",
+      "5/83 remaining\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7fcb0916185443cbbca9e553923e232f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "2 remaining. Sleeping...\n",
+      "2/83 remaining\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7a5b35b2832d4e54bb87241f8bb29390",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "num_threads = 5\n",
+    "\n",
+    "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+    "\n",
+    "# query in passes until every record is marked 'accessed'; records that are\n",
+    "# already accessed are returned untouched by get_kg_from_name()\n",
+    "while num_non_accessed > 0:\n",
+    "    print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+    "\n",
+    "    # pool_map_persons() updates the notebook-level `pbar`; the context\n",
+    "    # managers release the worker threads and the progress bar after each pass\n",
+    "    with tqdm(total=len(mapped_persons)) as pbar, ThreadPool(num_threads) as pool:\n",
+    "        mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+    "\n",
+    "    num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+    "    if num_non_accessed > 0:\n",
+    "        print(f'{num_non_accessed} remaining. Sleeping...')\n",
+    "        time.sleep(60) # wait 60 seconds before the next pass"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Clean data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "updated CC license: 0\n",
+      "items w/o KG meta: 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# shorten the CC attribution string. the default string from Google Knowledge Graph is too verbose\n",
+    "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+    "cc_short = 'CC BY-SA 3.0'\n",
+    "nchanged = 0\n",
+    "for mapped_person in mapped_persons:\n",
+    "    if mapped_person.get('description_license') == cc_long:\n",
+    "        nchanged += 1\n",
+    "        mapped_person['description_license'] = cc_short\n",
+    "print(f'updated CC license: {nchanged}')\n",
+    "\n",
+    "# find number not accessed\n",
+    "n_empty = 0\n",
+    "for mapped_person in mapped_persons:\n",
+    "    if not mapped_person.get('accessed', False):\n",
+    "        n_empty += 1\n",
+    "        # 'kg_id' is still empty for records that were never accessed, so show the query\n",
+    "        print(mapped_person['query'])\n",
+    "print(f'items w/o KG meta: {n_empty}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create dataframe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create dataframe for mapped persons (one row per record)\n",
+    "df_mapped_persons = pd.DataFrame(mapped_persons)\n",
+    "df_mapped_persons.index.name = 'index'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       " .dataframe tbody tr th:only-of-type {\n",
+       " vertical-align: middle;\n",
+       " }\n",
+       "\n",
+       " .dataframe tbody tr th {\n",
+       " vertical-align: top;\n",
+       " }\n",
+       "\n",
+       " .dataframe thead th {\n",
+       " text-align: right;\n",
+       " }\n",
+       "</style>\n",
+       "<table border=\"1\" 
class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>accessed</th>\n", + " <th>description</th>\n", + " <th>description_extended</th>\n", + " <th>description_license</th>\n", + " <th>description_url</th>\n", + " <th>image_url</th>\n", + " <th>kg_id</th>\n", + " <th>name</th>\n", + " <th>query</th>\n", + " <th>score</th>\n", + " <th>url</th>\n", + " </tr>\n", + " <tr>\n", + " <th>index</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>True</td>\n", + " <td>American actress</td>\n", + " <td>Julia O'Hara Stiles is an American actress. Bo...</td>\n", + " <td>CC BY-SA 3.0</td>\n", + " <td>https://en.wikipedia.org/wiki/Julia_Stiles</td>\n", + " <td>http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB...</td>\n", + " <td>/m/02jtjz</td>\n", + " <td>Julia Stiles</td>\n", + " <td>julia stiles</td>\n", + " <td>637.113647</td>\n", + " <td>http://www.juliastilesblog.com</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>True</td>\n", + " <td>Actor</td>\n", + " <td>Orlando Jonathan Blanchard Bloom is an English...</td>\n", + " <td>CC BY-SA 3.0</td>\n", + " <td>https://en.wikipedia.org/wiki/Orlando_Bloom</td>\n", + " <td>http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc...</td>\n", + " <td>/m/09wj5</td>\n", + " <td>Orlando Bloom</td>\n", + " <td>orlando bloom</td>\n", + " <td>689.364319</td>\n", + " <td></td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " accessed description \\\n", + "index \n", + "0 True American actress \n", + "1 True Actor \n", + "\n", + " description_extended description_license \\\n", + "index \n", + "0 Julia O'Hara Stiles is an American actress. Bo... 
CC BY-SA 3.0 \n",
+       "1 Orlando Jonathan Blanchard Bloom is an English... CC BY-SA 3.0 \n",
+       "\n",
+       " description_url \\\n",
+       "index \n",
+       "0 https://en.wikipedia.org/wiki/Julia_Stiles \n",
+       "1 https://en.wikipedia.org/wiki/Orlando_Bloom \n",
+       "\n",
+       " image_url kg_id \\\n",
+       "index \n",
+       "0 http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB... /m/02jtjz \n",
+       "1 http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc... /m/09wj5 \n",
+       "\n",
+       " name query score \\\n",
+       "index \n",
+       "0 Julia Stiles julia stiles 637.113647 \n",
+       "1 Orlando Bloom orlando bloom 689.364319 \n",
+       "\n",
+       " url \n",
+       "index \n",
+       "0 http://www.juliastilesblog.com \n",
+       "1 "
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# check output\n",
+    "df_mapped_persons.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# write the full identity table to CSV (UTF-16 encoded)\n",
+    "fp_out = '/data_store_hdd/datasets/people/pubfig83/metadata/identity_kg.csv'\n",
+    "df_mapped_persons.to_csv(fp_out, encoding='utf-16')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# write a reduced copy holding at most the first `limit` records;\n",
+    "# it is saved next to fp_out as '<stem>_0_<limit>.csv'\n",
+    "limit = 1000\n",
+    "fpp_out = Path(fp_out)\n",
+    "fp_out_sm = join(fpp_out.parent, fpp_out.stem + f'_0_{limit}.csv')\n",
+    "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[:limit]).rename_axis('index')\n",
+    "df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "megapixels",
+   "language": "python",
+   "name": "megapixels"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+
"pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
