{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# PubFig83 Knowledge Graph Identities\n",
  "\n",
  "- convert filename-names to names\n",
  "- fetch Google Knowledge Graph entity IDs for each name\n",
  "- save KG IDs to CSV" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [
  "%reload_ext autoreload\n",
  "%autoreload 2\n",
  "\n",
  "import os\n",
  "import os.path as osp\n",
  "from os.path import join\n",
  "from glob import glob\n",
  "import random\n",
  "import math\n",
  "from pathlib import Path\n",
  "from datetime import datetime\n",
  "import requests\n",
  "import json\n",
  "import time\n",
  "from pprint import pprint\n",
  "from multiprocessing.pool import ThreadPool\n",
  "import threading\n",
  "import urllib.parse\n",
  "import urllib.request\n",
  "\n",
  "from tqdm import tqdm_notebook as tqdm\n",
  "import pandas as pd\n",
  "from scipy.io import loadmat\n",
  "import numpy as np\n",
  "%matplotlib inline\n",
  "import matplotlib.pyplot as plt" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Get List of Names" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "83\n" ] } ], "source": [
  "# NOTE: despite its name, dir_lfw points at the PubFig83 media directory.\n",
  "# Every entry of that directory is used as one name, with underscores turned into spaces.\n",
  "dir_lfw = '/data_store_hdd/datasets/people/pubfig83/media/original/'\n",
  "names = [x.replace('_', ' ') for x in os.listdir(dir_lfw)]\n",
  "print(len(names))" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "julia stiles\n" ] } ], "source": [
  "print(names[0])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Google Knowledge Graph API" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
  "# read API key; the file is closed again right away and surrounding whitespace is\n",
  "# stripped, so a trailing newline in the key file is not URL-encoded into the request\n",
  "fp_api_key = '/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key'\n",
  "with open(fp_api_key) as f:\n",
  "    api_key = f.read().strip()\n",
  "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], 
"source": [
  "def _get_kg_meta(result_obj, params, timeout=30):\n",
  "    \"\"\"Query the Knowledge Graph Search API and copy the top hit into `result_obj`.\n",
  "\n",
  "    Reads the notebook-level `api_key` and `url_kg_api`.\n",
  "\n",
  "    Args:\n",
  "        result_obj: dict for one person, updated in place.\n",
  "        params: dict with the search parameter ('query' or 'ids'). The keys\n",
  "            'indent', 'key' and 'limit' are added to it before the request.\n",
  "        timeout: seconds to wait for the HTTP response, so that a stalled\n",
  "            request cannot block a worker thread forever.\n",
  "\n",
  "    Returns:\n",
  "        `result_obj` itself. 'accessed' is set to True once a response was\n",
  "        received and decoded as JSON. If the request or the parsing fails, the\n",
  "        exception text is stored under 'error'; a failed request leaves\n",
  "        'accessed' untouched so the caller can retry.\n",
  "    \"\"\"\n",
  "    params['indent'] = True\n",
  "    params['key'] = api_key\n",
  "    params['limit'] = 1\n",
  "    url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
  "\n",
  "    # forget the error of an earlier attempt so that a successful retry stays clean\n",
  "    result_obj.pop('error', None)\n",
  "\n",
  "    try:\n",
  "        with urllib.request.urlopen(url, timeout=timeout) as http_response:\n",
  "            json_response = http_response.read()\n",
  "    except Exception as e:\n",
  "        # record the failure on the object being processed, not on an unrelated global\n",
  "        result_obj['error'] = str(e)\n",
  "        return result_obj\n",
  "\n",
  "    try:\n",
  "        response = json.loads(json_response)\n",
  "        items = response.get('itemListElement', [])\n",
  "        result_obj['accessed'] = True\n",
  "        if items:\n",
  "            item = items[0]\n",
  "            # `or {}` keeps the .get() calls below valid when a section is missing or empty\n",
  "            item_result = item.get('result') or {}\n",
  "            det_desc = item_result.get('detailedDescription') or {}\n",
  "            result_img = item_result.get('image') or {}\n",
  "\n",
  "            result_obj['description'] = item_result.get('description', '')\n",
  "            # keep a kg_id that is already known (lookup by id)\n",
  "            if not result_obj.get('kg_id'):\n",
  "                result_obj['kg_id'] = item_result.get('@id', '').replace('kg:', '')\n",
  "            result_obj['description_extended'] = det_desc.get('articleBody', '')\n",
  "            result_obj['description_license'] = det_desc.get('license', '')\n",
  "            result_obj['description_url'] = det_desc.get('url', '')\n",
  "            if result_img:\n",
  "                result_obj['image_url'] = result_img.get('contentUrl', '')\n",
  "            result_obj['name'] = item_result.get('name', '')\n",
  "            result_obj['score'] = item.get('resultScore', 0.0)\n",
  "            result_obj['url'] = item_result.get('url', '')\n",
  "    except Exception as e:\n",
  "        result_obj['error'] = str(e)\n",
  "    return result_obj\n",
  "\n",
  "\n",
  "def get_kg_from_name(obj):\n",
  "    \"\"\"Search by `obj['query']` unless `obj` is already marked as accessed; returns `obj`.\"\"\"\n",
  "    if obj.get('accessed'):\n",
  "        return obj\n",
  "    return _get_kg_meta(obj, {'query': obj['query']})\n",
  "\n",
  "\n",
  "def get_kg_from_kg_id(obj):\n",
  "    \"\"\"Look up `obj['kg_id']` unless `obj` is already marked as accessed; returns `obj`.\"\"\"\n",
  "    if obj.get('accessed'):\n",
  "        return obj\n",
  "    return _get_kg_meta(obj, {'ids': obj['kg_id']})" ] },
 { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "{'accessed': True,\n", " 'description': 'Indian film director',\n", " 'description_extended': 'Adoor Gopalakrishnan is an Indian film director, '\n", " 'script writer, and producer. Adoor Gopalakrishnan '\n", " 'had a major role in revolutioning Malayalam cinema '\n", " 'during the 1970s and is regarded as one of the most '\n", " 'notable filmmakers of India. ',\n", " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n", " 'description_url': 'https://en.wikipedia.org/wiki/Adoor_Gopalakrishnan',\n", " 'image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQA-_aEYy_goHLhGJjmn558S1VEwcALB98m83I9HwUTV_gUsded',\n", " 'kg_id': '/m/07s7wk',\n", " 'name': 'Adoor Gopalakrishnan',\n", " 'query': 'Adoor Gopalakrishnan',\n", " 'score': 501.001862,\n", " 'url': 'http://www.adoorgopalakrishnan.com'}\n" ] } ], "source": [
  "# blank record shared by the smoke test below and by the batch build\n",
  "def new_person(query):\n",
  "    \"\"\"Return the default (not yet accessed) record for one search query.\"\"\"\n",
  "    return {'query': query, 'kg_id': '', 'score': 0.0, 'description': '', 'url': '', 'accessed': False}\n",
  "\n",
  "# test get from name\n",
  "q = 'Adoor Gopalakrishnan'\n",
  "obj = new_person(q)\n",
  "result = get_kg_from_name(obj)  # the same dict as `obj`, updated in place\n",
  "pprint(result)" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
  "# define thread mapping function\n",
  "def pool_map_persons(obj):\n",
  "    \"\"\"ThreadPool worker: look up one person record, then advance the progress bar.\n",
  "\n",
  "    Uses the notebook-level `pbar`, which the cell running the pool has to create.\n",
  "    \"\"\"\n",
  "    kg_obj = get_kg_from_name(obj)\n",
  "    pbar.update(1)  # count a record only after its lookup has finished\n",
  "    return kg_obj" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
  "# build mapped_person objects: one blank record per name\n",
  "mapped_persons = [new_person(name) for name in names]" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "83\n", "['julia stiles', 'orlando bloom', 'adam sandler', 'victoria beckham', 'martha stewart', 'george clooney', 'steve 
carell', 'jennifer lopez', 'harrison ford', 'jessica alba']\n" ] } ], "source": [ "print(len(mapped_persons))\n", "print(names[0:10])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0af8e1f2d849473f933f506f5c8ced2b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "12/83 remaining\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "09fa539f1d62416caf7fd217e7cf4892", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "9 remaining. Sleeping...\n", "9/83 remaining\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c22e1ce3e6e441839f12e88846612825", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "6 remaining. Sleeping...\n", "6/83 remaining\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c7c5af3d562b475ea3420eca594cee85", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "5 remaining. 
Sleeping...\n", "5/83 remaining\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7fcb0916185443cbbca9e553923e232f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "2 remaining. Sleeping...\n", "2/83 remaining\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7a5b35b2832d4e54bb87241f8bb29390", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [
  "num_threads = 5\n",
  "retry_sleep_secs = 60  # pause between rounds before the failed lookups are retried\n",
  "max_rounds = 20  # stop retrying eventually instead of looping forever on a permanent error\n",
  "\n",
  "def count_non_accessed(persons):\n",
  "    \"\"\"Count the records that have not received a Knowledge Graph response yet.\"\"\"\n",
  "    return sum(0 if x['accessed'] else 1 for x in persons)\n",
  "\n",
  "num_non_accessed = count_non_accessed(mapped_persons)\n",
  "num_rounds = 0\n",
  "\n",
  "# run the lookups in rounds until every record was accessed (or max_rounds is reached)\n",
  "while num_non_accessed > 0 and num_rounds < max_rounds:\n",
  "    num_rounds += 1\n",
  "    print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
  "\n",
  "    # `pbar` has to be a notebook-level name because pool_map_persons() updates it.\n",
  "    # The context managers close the progress bar and shut down the worker threads\n",
  "    # at the end of every round, so nothing piles up across retries.\n",
  "    with tqdm(total=len(mapped_persons)) as pbar, ThreadPool(num_threads) as pool:\n",
  "        mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
  "\n",
  "    num_non_accessed = count_non_accessed(mapped_persons)\n",
  "    if num_non_accessed > 0 and num_rounds < max_rounds:\n",
  "        print(f'{num_non_accessed} remaining. Sleeping...')\n",
  "        time.sleep(retry_sleep_secs)\n",
  "\n",
  "if num_non_accessed > 0:\n",
  "    print(f'gave up after {num_rounds} rounds: {num_non_accessed} records still not accessed')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Clean data" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "updated CC license: 0\n", "items w/o KG meta: 0\n" ] } ], "source": [
  "# shorten the CC attribution string: the default string from Google Knowledge Graph is too verbose\n",
  "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
  "cc_short = 'CC BY-SA 3.0'\n",
  "nchanged = 0\n",
  "for mapped_person in mapped_persons:\n",
  "    if mapped_person.get('description_license') == cc_long:\n",
  "        mapped_person['description_license'] = cc_short\n",
  "        nchanged += 1\n",
  "print(f'updated CC license: {nchanged}')\n",
  "\n",
  "# find number not accessed; list them by query, because kg_id is still empty\n",
  "# for a record that never received a response\n",
  "queries_not_accessed = [p.get('query', '') for p in mapped_persons if not p.get('accessed', False)]\n",
  "for query_not_accessed in queries_not_accessed:\n",
  "    print(query_not_accessed)\n",
  "n_empty = len(queries_not_accessed)\n",
  "print(f'items w/o KG meta: {n_empty}')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Create dataframe" ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
  "# create dataframe for mapped persons (one row per record, columns taken from the dict keys)\n",
  "df_mapped_persons = pd.DataFrame(mapped_persons)\n",
  "df_mapped_persons.index.name = 'index'" ] },
 { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accesseddescriptiondescription_extendeddescription_licensedescription_urlimage_urlkg_idnamequeryscoreurl
index
0TrueAmerican actressJulia O'Hara Stiles is an American actress. Bo...CC BY-SA 3.0https://en.wikipedia.org/wiki/Julia_Stileshttp://t1.gstatic.com/images?q=tbn:ANd9GcToFqB.../m/02jtjzJulia Stilesjulia stiles637.113647http://www.juliastilesblog.com
1TrueActorOrlando Jonathan Blanchard Bloom is an English...CC BY-SA 3.0https://en.wikipedia.org/wiki/Orlando_Bloomhttp://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc.../m/09wj5Orlando Bloomorlando bloom689.364319
\n", "
" ], "text/plain": [ " accessed description \\\n", "index \n", "0 True American actress \n", "1 True Actor \n", "\n", " description_extended description_license \\\n", "index \n", "0 Julia O'Hara Stiles is an American actress. Bo... CC BY-SA 3.0 \n", "1 Orlando Jonathan Blanchard Bloom is an English... CC BY-SA 3.0 \n", "\n", " description_url \\\n", "index \n", "0 https://en.wikipedia.org/wiki/Julia_Stiles \n", "1 https://en.wikipedia.org/wiki/Orlando_Bloom \n", "\n", " image_url kg_id \\\n", "index \n", "0 http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB... /m/02jtjz \n", "1 http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc... /m/09wj5 \n", "\n", " name query score \\\n", "index \n", "0 Julia Stiles julia stiles 637.113647 \n", "1 Orlando Bloom orlando bloom 689.364319 \n", "\n", " url \n", "index \n", "0 http://www.juliastilesblog.com \n", "1 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# preview the first two rows\n",
  "df_mapped_persons.head(2)" ] },
 { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
  "# write the full table to CSV (UTF-16 encoded)\n",
  "fp_out = '/data_store_hdd/datasets/people/pubfig83/metadata/identity_kg.csv'\n",
  "df_mapped_persons.to_csv(fp_out, encoding='utf-16')" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
  "# write a second CSV next to the full one, holding at most `limit` records\n",
  "limit = 1000\n",
  "fpp_out = Path(fp_out)\n",
  "fp_out_sm = str(fpp_out.with_name(f'{fpp_out.stem}_0_{limit}.csv'))\n",
  "df_mapped_persons_sm = pd.DataFrame(mapped_persons[:limit])\n",
  "df_mapped_persons_sm.index.name = 'index'\n",
  "df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": 
".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }