summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/pubfig83/identity.ipynb
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-01-28 18:11:36 +0100
committeradamhrv <adam@ahprojects.com>2019-01-28 18:11:36 +0100
commitdd2c36288aa1e8af14588f9258f6785879b8638c (patch)
tree543564ff7cc9b83ae1ecbc5b0d89bca9a6c17742 /megapixels/notebooks/datasets/pubfig83/identity.ipynb
parentb0b06be0defe97ef19cf4d0f3328db40d299e110 (diff)
add utils for analyzing identities
Diffstat (limited to 'megapixels/notebooks/datasets/pubfig83/identity.ipynb')
-rw-r--r--megapixels/notebooks/datasets/pubfig83/identity.ipynb656
1 files changed, 656 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/pubfig83/identity.ipynb b/megapixels/notebooks/datasets/pubfig83/identity.ipynb
new file mode 100644
index 00000000..697d9cee
--- /dev/null
+++ b/megapixels/notebooks/datasets/pubfig83/identity.ipynb
@@ -0,0 +1,656 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PubFig83 Knowledge Graph Identities\n",
+ "\n",
+ "- convert dataset entry names (e.g. `julia_stiles`) to person names\n",
+ "- fetch Google Knowledge Graph entity IDs for each name\n",
+ "- save KG IDs to CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "import random\n",
+ "import math\n",
+ "from pathlib import Path\n",
+ "from datetime import datetime\n",
+ "import requests\n",
+ "import json\n",
+ "import time\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get List of Names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "83\n"
+ ]
+ }
+ ],
+ "source": [
+ "# entry names under the originals folder look like 'julia_stiles';\n",
+ "# swap the underscores for spaces to get the query names\n",
+ "dir_lfw = '/data_store_hdd/datasets/people/pubfig83/media/original/'\n",
+ "entries = os.listdir(dir_lfw)\n",
+ "names = [entry.replace('_', ' ') for entry in entries]\n",
+ "print(len(names))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "julia stiles\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(names[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Google Knowledge Graph API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read API key: close the file handle and drop surrounding whitespace\n",
+ "# (a trailing newline would otherwise be URL-encoded into the request)\n",
+ "with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_key:\n",
+ "    api_key = fp_key.read().strip()\n",
+ "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _get_kg_meta(result_obj, params):\n",
+ "    \"\"\"Query the Knowledge Graph Search API and copy the top hit into result_obj.\n",
+ "\n",
+ "    Args:\n",
+ "        result_obj: dict updated in place; it must already hold a 'kg_id' key\n",
+ "            (a falsy value means the ID is taken from the response).\n",
+ "        params: dict of request parameters such as 'query' or 'ids'; the keys\n",
+ "            'indent', 'key' and 'limit' are written into it.\n",
+ "\n",
+ "    Returns:\n",
+ "        result_obj itself. Failures are not raised: the exception text is\n",
+ "        stored under result_obj['error'] instead.\n",
+ "    \"\"\"\n",
+ "    global api_key, url_kg_api\n",
+ "\n",
+ "    params['indent'] = True\n",
+ "    params['key'] = api_key\n",
+ "    params['limit'] = 1\n",
+ "\n",
+ "    url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+ "    try:\n",
+ "        # the context manager closes the HTTP response even if read() fails\n",
+ "        with urllib.request.urlopen(url) as http_response:\n",
+ "            json_response = http_response.read()\n",
+ "    except Exception as e:\n",
+ "        # record the failure on the object being processed (not on an unrelated\n",
+ "        # global); 'accessed' is left untouched so the caller can retry\n",
+ "        result_obj['error'] = str(e)\n",
+ "    else:\n",
+ "        try:\n",
+ "            response = json.loads(json_response)\n",
+ "            items = response.get('itemListElement', [])\n",
+ "            result_obj['accessed'] = True\n",
+ "            # a successful attempt supersedes an error left by an earlier one\n",
+ "            result_obj.pop('error', None)\n",
+ "            if items:\n",
+ "                item = items[0]\n",
+ "                # default to a dict so the .get() calls below work when 'result' is absent\n",
+ "                item_result = item.get('result', {})\n",
+ "                result_obj['description'] = item_result.get('description', '')\n",
+ "                det_desc = item_result.get('detailedDescription', '')\n",
+ "                # keep a caller-supplied ID; only fill it in when it is empty\n",
+ "                if not result_obj['kg_id']:\n",
+ "                    result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
+ "                if det_desc:\n",
+ "                    result_obj['description_extended'] = det_desc.get('articleBody','')\n",
+ "                    result_obj['description_license'] = det_desc.get('license','')\n",
+ "                    result_obj['description_url'] = det_desc.get('url','')\n",
+ "                else:\n",
+ "                    result_obj['description_extended'] = ''\n",
+ "                    result_obj['description_license'] = ''\n",
+ "                    result_obj['description_url'] = ''\n",
+ "                result_img = item_result.get('image', '')\n",
+ "                if result_img:\n",
+ "                    result_obj['image_url'] = result_img.get('contentUrl', '')\n",
+ "                result_obj['name'] = item_result.get('name', '')\n",
+ "                result_obj['score'] = item.get('resultScore', 0.0)\n",
+ "                result_obj['url'] = item_result.get('url', '')\n",
+ "        except Exception as e:\n",
+ "            # unexpected response shape: keep the message instead of raising\n",
+ "            result_obj['error'] = str(e)\n",
+ "    return result_obj\n",
+ " \n",
+ "def get_kg_from_name(obj):\n",
+ "    \"\"\"Search by obj['query'] unless obj is already flagged as accessed.\"\"\"\n",
+ "    if not obj['accessed']:\n",
+ "        return _get_kg_meta(obj, {'query': obj['query']})\n",
+ "    return obj\n",
+ " \n",
+ "def get_kg_from_kg_id(obj):\n",
+ "    \"\"\"Look up obj['kg_id'] via the 'ids' parameter unless obj is already flagged as accessed.\"\"\"\n",
+ "    if not obj['accessed']:\n",
+ "        return _get_kg_meta(obj, {'ids': obj['kg_id']})\n",
+ "    return obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'accessed': True,\n",
+ " 'description': 'Indian film director',\n",
+ " 'description_extended': 'Adoor Gopalakrishnan is an Indian film director, '\n",
+ " 'script writer, and producer. Adoor Gopalakrishnan '\n",
+ " 'had a major role in revolutioning Malayalam cinema '\n",
+ " 'during the 1970s and is regarded as one of the most '\n",
+ " 'notable filmmakers of India. ',\n",
+ " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+ " 'description_url': 'https://en.wikipedia.org/wiki/Adoor_Gopalakrishnan',\n",
+ " 'image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQA-_aEYy_goHLhGJjmn558S1VEwcALB98m83I9HwUTV_gUsded',\n",
+ " 'kg_id': '/m/07s7wk',\n",
+ " 'name': 'Adoor Gopalakrishnan',\n",
+ " 'query': 'Adoor Gopalakrishnan',\n",
+ " 'score': 501.001862,\n",
+ " 'url': 'http://www.adoorgopalakrishnan.com'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# smoke test: look up a single name\n",
+ "q = 'Adoor Gopalakrishnan'\n",
+ "# default (not yet accessed) record\n",
+ "obj = dict(query=q, kg_id='', score=0.0, description='', url='', accessed=False)\n",
+ "result = get_kg_from_name(obj)\n",
+ "pprint(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# worker function handed to ThreadPool.map\n",
+ "def pool_map_persons(obj):\n",
+ "    \"\"\"Tick the shared progress bar, then run the name lookup for one record.\"\"\"\n",
+ "    global pbar\n",
+ "    pbar.update(1)\n",
+ "    return get_kg_from_name(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# one not-yet-accessed record per name\n",
+ "mapped_persons = []\n",
+ "for fn in names:\n",
+ "    obj = dict(query=fn, kg_id='', score=0.0, description='', url='', accessed=False)\n",
+ "    mapped_persons.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "83\n",
+ "['julia stiles', 'orlando bloom', 'adam sandler', 'victoria beckham', 'martha stewart', 'george clooney', 'steve carell', 'jennifer lopez', 'harrison ford', 'jessica alba']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(mapped_persons))\n",
+ "print(names[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0af8e1f2d849473f933f506f5c8ced2b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "12/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "09fa539f1d62416caf7fd217e7cf4892",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9 remaining. Sleeping...\n",
+ "9/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c22e1ce3e6e441839f12e88846612825",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "6 remaining. Sleeping...\n",
+ "6/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c7c5af3d562b475ea3420eca594cee85",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "5 remaining. Sleeping...\n",
+ "5/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7fcb0916185443cbbca9e553923e232f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "2 remaining. Sleeping...\n",
+ "2/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7a5b35b2832d4e54bb87241f8bb29390",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_threads = 5\n",
+ "\n",
+ "num_non_accessed = sum(1 for person in mapped_persons if not person['accessed'])\n",
+ "\n",
+ "# re-run the whole list until every record is accessed; records that are\n",
+ "# already accessed return immediately inside get_kg_from_name\n",
+ "while num_non_accessed > 0:\n",
+ "    print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+ "\n",
+ "    # one progress bar per pass, closed by its context manager; `pbar` has to be\n",
+ "    # a module-level name because pool_map_persons declares it global\n",
+ "    with tqdm(total=len(mapped_persons)) as pbar:\n",
+ "        # the pool context manager shuts the worker threads down after each pass\n",
+ "        with ThreadPool(num_threads) as pool:\n",
+ "            mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+ "\n",
+ "    num_non_accessed = sum(1 for person in mapped_persons if not person['accessed'])\n",
+ "    if num_non_accessed > 0:\n",
+ "        print(f'{num_non_accessed} remaining. Sleeping...')\n",
+ "        time.sleep(60)  # pause before retrying the failed requests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Clean data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "updated CC license: 0\n",
+ "items w/o KG meta: 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# shorten the CC attribution: the default string from Google Knowledge Graph is too verbose\n",
+ "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+ "cc_short = 'CC BY-SA 3.0'\n",
+ "nchanged = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ "    if mapped_person.get('description_license', None) != cc_long:\n",
+ "        continue\n",
+ "    nchanged += 1\n",
+ "    mapped_person['description_license'] = cc_short\n",
+ "print(f'updated CC license: {nchanged}')\n",
+ "\n",
+ "# count (and list the kg_id of) records that were never accessed\n",
+ "n_empty = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ "    if mapped_person.get('accessed', False):\n",
+ "        continue\n",
+ "    n_empty += 1\n",
+ "    print(mapped_person['kg_id'])\n",
+ "print(f'items w/o KG meta: {n_empty}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# one row per mapped person; the row index is labelled 'index'\n",
+ "df_mapped_persons = pd.DataFrame(mapped_persons).rename_axis('index')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>accessed</th>\n",
+ " <th>description</th>\n",
+ " <th>description_extended</th>\n",
+ " <th>description_license</th>\n",
+ " <th>description_url</th>\n",
+ " <th>image_url</th>\n",
+ " <th>kg_id</th>\n",
+ " <th>name</th>\n",
+ " <th>query</th>\n",
+ " <th>score</th>\n",
+ " <th>url</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>True</td>\n",
+ " <td>American actress</td>\n",
+ " <td>Julia O'Hara Stiles is an American actress. Bo...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Julia_Stiles</td>\n",
+ " <td>http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB...</td>\n",
+ " <td>/m/02jtjz</td>\n",
+ " <td>Julia Stiles</td>\n",
+ " <td>julia stiles</td>\n",
+ " <td>637.113647</td>\n",
+ " <td>http://www.juliastilesblog.com</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>True</td>\n",
+ " <td>Actor</td>\n",
+ " <td>Orlando Jonathan Blanchard Bloom is an English...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Orlando_Bloom</td>\n",
+ " <td>http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc...</td>\n",
+ " <td>/m/09wj5</td>\n",
+ " <td>Orlando Bloom</td>\n",
+ " <td>orlando bloom</td>\n",
+ " <td>689.364319</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " accessed description \\\n",
+ "index \n",
+ "0 True American actress \n",
+ "1 True Actor \n",
+ "\n",
+ " description_extended description_license \\\n",
+ "index \n",
+ "0 Julia O'Hara Stiles is an American actress. Bo... CC BY-SA 3.0 \n",
+ "1 Orlando Jonathan Blanchard Bloom is an English... CC BY-SA 3.0 \n",
+ "\n",
+ " description_url \\\n",
+ "index \n",
+ "0 https://en.wikipedia.org/wiki/Julia_Stiles \n",
+ "1 https://en.wikipedia.org/wiki/Orlando_Bloom \n",
+ "\n",
+ " image_url kg_id \\\n",
+ "index \n",
+ "0 http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB... /m/02jtjz \n",
+ "1 http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc... /m/09wj5 \n",
+ "\n",
+ " name query score \\\n",
+ "index \n",
+ "0 Julia Stiles julia stiles 637.113647 \n",
+ "1 Orlando Bloom orlando bloom 689.364319 \n",
+ "\n",
+ " url \n",
+ "index \n",
+ "0 http://www.juliastilesblog.com \n",
+ "1 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check output\n",
+ "df_mapped_persons.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# write the full table as a UTF-16 encoded CSV\n",
+ "fp_out = '/data_store_hdd/datasets/people/pubfig83/metadata/identity_kg.csv'\n",
+ "df_mapped_persons.to_csv(path_or_buf=fp_out, encoding='utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# small version: the first `limit` records, written next to the full CSV\n",
+ "limit = 1000\n",
+ "fpp_out = Path(fp_out)\n",
+ "fn_out_sm = f'{fpp_out.stem}_0_{limit}.csv'\n",
+ "fp_out_sm = join(fpp_out.parent, fn_out_sm)\n",
+ "mapped_persons_sm = mapped_persons[:limit]\n",
+ "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons_sm)\n",
+ "df_mapped_persons_sm.index.name = 'index'\n",
+ "df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}