diff options
Diffstat (limited to 'megapixels/notebooks/datasets/imdb_wiki/identity.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/imdb_wiki/identity.ipynb | 498 |
1 files changed, 0 insertions, 498 deletions
# %% [markdown]
# # IMDB-WIKI Knowledge Graph
#
# - convert names to Knowledge Graph entity IDs
# - The `imdb.mat` file contains only full names, need KG ids `/m/12345`

# %%
# IPython magics, written as explicit calls so the cell is also valid Python
# source. They are grouped ahead of the imports instead of being interleaved.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

# standard library
import os
import os.path as osp
from os.path import join
from glob import glob
from pathlib import Path
import random
import math
from datetime import datetime
import json
import time
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
# urllib.parse.urlencode is used by the Knowledge Graph request code; import it
# explicitly rather than relying on urllib.request importing it as a side effect.
import urllib.parse
import urllib.request

# third-party
import requests
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt

# %% [markdown]
# ## Load IMDB Metadata

# %%
# One row per image, indexed by the CSV's `index` column. The recorded preview
# showed the columns: celeb_id, dob, filepath, gender, name, x1, x2, y1, y2,
# year_photo.
fp_meta_imdb = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_mat.csv'
df_meta_imdb = pd.read_csv(fp_meta_imdb).set_index('index')
df_meta_imdb.head(2)

# %% [markdown]
# ## Google Knowledge Graph API
# %%
# Read the API key. The file is closed deterministically and the key is
# stripped so a trailing newline in the key file is not URL-encoded into
# every request.
fp_api_key = '/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key'
with open(fp_api_key) as fp_key:
    api_key = fp_key.read().strip()
url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'
# Upper bound for a single HTTP request so a stalled connection cannot block
# a worker thread forever.
kg_timeout_sec = 30


# %%
def _fetch_kg_response(params):
    """Send one Knowledge Graph search request and return the decoded JSON.

    Args:
        params: dict of search parameters (e.g. ``{'query': ...}`` or
            ``{'ids': ...}``). It is copied, never modified, so the API key
            does not leak into the caller's dict.

    Returns:
        The JSON response decoded with ``json.loads``.

    Raises:
        Any exception raised by ``urlopen`` or ``json.loads``; the caller is
        expected to record it.
    """
    # Local imports keep this cell independent of how the import cell spells
    # its urllib imports.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    query = dict(params)
    query['indent'] = True
    query['key'] = api_key
    query['limit'] = 1  # only the top-ranked entity is used

    url = f'{url_kg_api}?{urlencode(query)}'
    with urlopen(url, timeout=kg_timeout_sec) as http_response:
        return json.loads(http_response.read())


def _apply_kg_item(result_obj, item):
    """Copy the fields of one ``itemListElement`` entry into ``result_obj``."""
    # `or {}` also covers an explicit null / empty value in the response.
    item_result = item.get('result') or {}
    result_obj['description'] = item_result.get('description', '')

    # Keep a kg_id supplied by the caller (lookup by ID); only fill it in when
    # it is missing or empty (lookup by name).
    if not result_obj.get('kg_id'):
        result_obj['kg_id'] = item_result.get('@id', '').replace('kg:', '')

    det_desc = item_result.get('detailedDescription') or {}
    result_obj['description_extended'] = det_desc.get('articleBody', '')
    result_obj['description_license'] = det_desc.get('license', '')
    result_obj['description_url'] = det_desc.get('url', '')

    # image_url is only written when the entity has an image, as before.
    result_img = item_result.get('image') or {}
    if result_img:
        result_obj['image_url'] = result_img.get('contentUrl', '')

    result_obj['name'] = item_result.get('name', '')
    result_obj['score'] = item.get('resultScore', 0.0)
    result_obj['url'] = item_result.get('url', '')


def _get_kg_meta(result_obj, params):
    """Query the Knowledge Graph API and merge the top hit into ``result_obj``.

    Args:
        result_obj: dict describing one person; updated in place.
        params: dict of search parameters; not modified.

    Returns:
        ``result_obj``. On a request or decode failure ``'error'`` holds the
        exception text and ``'accessed'`` is left untouched, so the record is
        retried later. Once a response has been decoded ``'accessed'`` is set
        to True, even if it contained no items.
    """
    try:
        response = _fetch_kg_response(params)
        items = response.get('itemListElement', [])
    except Exception as e:
        # The original wrote to the unrelated global `result` here.
        result_obj['error'] = str(e)
        return result_obj

    result_obj['accessed'] = True
    # Drop an error left over from an earlier failed attempt.
    result_obj.pop('error', None)
    try:
        if items:
            _apply_kg_item(result_obj, items[0])
    except Exception as e:
        result_obj['error'] = str(e)
    return result_obj


def get_kg_from_name(obj):
    """Look up ``obj['query']`` by name; records already accessed are returned as-is."""
    if obj.get('accessed'):
        return obj
    params = {'query': obj['query']}
    return _get_kg_meta(obj, params)


def get_kg_from_kg_id(obj):
    """Look up ``obj['kg_id']`` by entity ID; records already accessed are returned as-is."""
    if obj.get('accessed'):
        return obj
    params = {'ids': obj['kg_id']}
    return _get_kg_meta(obj, params)


# %%
# make a test query to check if API works
obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default
result = get_kg_from_name(obj)
pprint(obj)

# %%
# Same check by entity ID. The original cell called the undefined
# `get_kg_from_id` and passed no 'kg_id'. '/m/0dl567' is the ID the name query
# above returned in the recorded run.
# NOTE(review): assumes the API's `ids` parameter accepts the '/m/...' form - confirm.
obj = {'query': 'Taylor Swift', 'kg_id': '/m/0dl567', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default
result = get_kg_from_kg_id(obj)
# %%
# build mapped_person objects: one default (not yet accessed) record per
# distinct name in the metadata
mapped_persons = []
df_person_groups = df_meta_imdb.groupby('name')
for group_name, _ in df_person_groups:
    obj = {'query': group_name, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}
    mapped_persons.append(obj)


# %%
# define thread mapping function
def pool_map_persons(obj):
    """Resolve one person record by name and tick the shared progress bar.

    Reads the module-level ``pbar`` created by the retry loop. The bar is
    advanced after the lookup so it reflects finished work.
    """
    kg_obj = get_kg_from_name(obj)
    pbar.update(1)
    return kg_obj


# %%
num_threads = 2
sleep_seconds = 60 * 20  # pause between rounds while records are still failing

num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)
print(f'{num_non_accessed}/{len(mapped_persons)} remaining')

# Repeat until every record has been accessed. Records that already succeeded
# return immediately from get_kg_from_name, so each round only re-queries the
# failures.
# NOTE(review): a record that fails on every attempt keeps this loop running
# indefinitely - interrupt the kernel if the remaining count stops shrinking.
while num_non_accessed > 0:
    # The pool and the progress bar are both released when the round ends.
    # Previously a new ThreadPool was created per round and never closed, and
    # an extra outer progress bar was created and left open.
    with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:
        mapped_persons = pool.map(pool_map_persons, mapped_persons)

    num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)
    if num_non_accessed > 0:
        print(f'{num_non_accessed}/{len(mapped_persons)} remaining. Sleeping...')
        time.sleep(sleep_seconds)  # wait X minutes

# %%
# test output for a person
print(mapped_persons[0])
# %%
# Shorten the CC attribution string; the default string from Google Knowledge
# Graph is too verbose.
cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'
cc_short = 'CC BY-SA 3.0'
nchanged = 0
for mapped_person in mapped_persons:
    # Named `license_url` rather than `license`, which shadows the interactive
    # builtin of the same name.
    license_url = mapped_person.get('description_license', None)
    if license_url == cc_long:
        nchanged += 1
        mapped_person['description_license'] = cc_short
# Prints 0 when the cell is re-run, because the strings were already shortened.
print(nchanged)

# %%
# find number not accessed
n_empty = 0
for mapped_person in mapped_persons:
    if not mapped_person.get('accessed', False):
        n_empty += 1
        # Show the query: records built for the name lookup start with an
        # empty kg_id that is only filled in after a successful access, so
        # printing kg_id here would not identify the record.
        print(mapped_person.get('query', ''))
print(n_empty)

# %%
# create dataframe for mapped persons
# mapped_persons is a list of dicts, so the plain constructor is the right
# entry point (DataFrame.from_dict is meant for a dict input).
df_mapped_persons = pd.DataFrame(mapped_persons)
df_mapped_persons.index.name = 'index'

# %%
# check output
df_mapped_persons.head()

# %%
# save
fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'
df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')

# %%
# create small version: the first `limit` records, saved next to the full file
limit = 1000
fpp_out = Path(fp_out)
fp_out_sm = str(fpp_out.with_name(f'{fpp_out.stem}_0_{limit}.csv'))
df_mapped_persons_sm = pd.DataFrame(mapped_persons[0:limit])
df_mapped_persons_sm.index.name = 'index'
df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')

# Notebook metadata recorded in the original file: kernel
# "Python [conda env:megapixels]", Python 3.6.6, nbformat 4.2.
