{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IMDB-WIKI Knowledge Graph\n",
    "\n",
    "- Convert names to Knowledge Graph entity IDs.\n",
    "- The `imdb.mat` file contains only full names, so each name needs to be mapped to a KG ID of the form `/m/12345`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "import os.path as osp\n",
    "from os.path import join\n",
    "from glob import glob\n",
    "from pathlib import Path\n",
    "import random\n",
    "import math\n",
    "from datetime import datetime\n",
    "import requests\n",
    "import json\n",
    "import time\n",
    "from pprint import pprint\n",
    "from multiprocessing.pool import ThreadPool\n",
    "import threading\n",
    "import urllib.request\n",
    "\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import pandas as pd\n",
    "from scipy.io import loadmat\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load IMDB Metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>celeb_id</th>\n",
       "      <th>dob</th>\n",
       "      <th>filepath</th>\n",
       "      <th>gender</th>\n",
       "      <th>name</th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>y1</th>\n",
       "      <th>y2</th>\n",
       "      <th>year_photo</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6488</td>\n",
       "      <td>1900-5-11</td>\n",
       "      <td>01/nm0000001_rm124825600_1899-5-10_1968.jpg</td>\n",
       "      <td>m</td>\n",
       "      <td>Fred Astaire</td>\n",
       "      <td>1072.926</td>\n",
       "      <td>1214.784</td>\n",
       "      <td>161.838</td>\n",
       "      <td>303.696</td>\n",
       "      <td>1968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6488</td>\n",
       "      <td>1900-5-11</td>\n",
       "      <td>01/nm0000001_rm3343756032_1899-5-10_1970.jpg</td>\n",
       "      <td>m</td>\n",
       "      <td>Fred Astaire</td>\n",
       "      <td>477.184</td>\n",
       "      <td>622.592</td>\n",
       "      <td>100.352</td>\n",
       "      <td>245.760</td>\n",
       "      <td>1970</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       celeb_id        dob                                      filepath  \\\n",
       "index                                                                      \n",
       "0          6488  1900-5-11   01/nm0000001_rm124825600_1899-5-10_1968.jpg   \n",
       "1          6488  1900-5-11  01/nm0000001_rm3343756032_1899-5-10_1970.jpg   \n",
       "\n",
       "      gender          name        x1        x2       y1       y2  year_photo  \n",
       "index                                                                         \n",
       "0          m  Fred Astaire  1072.926  1214.784  161.838  303.696        1968  \n",
       "1          m  Fred Astaire   477.184   622.592  100.352  245.760        1970  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load the IMDB metadata CSV and index the rows by its 'index' column\n",
    "fp_meta_imdb = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_mat.csv'\n",
    "df_meta_imdb = pd.read_csv(fp_meta_imdb)\n",
    "df_meta_imdb = df_meta_imdb.set_index('index')\n",
    "df_meta_imdb.head(n=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Google Knowledge Graph API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read API key; strip whitespace so a trailing newline in the key file is not sent as part of the key\n",
    "with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_api_key:\n",
    "  api_key = fp_api_key.read().strip()\n",
    "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _get_kg_meta(result_obj, params):\n",
    "  \"\"\"Query the Knowledge Graph Search API and copy the top hit into `result_obj`.\n",
    "\n",
    "  :param result_obj: (dict) person record; updated in place and returned\n",
    "  :param params: (dict) search parameters, e.g. {'query': name} or {'ids': kg_id}\n",
    "  :returns: (dict) the same `result_obj`; 'accessed' is set to True once a response\n",
    "    was received and decoded, and 'error' holds the message of any failure\n",
    "  \"\"\"\n",
    "  # work on a copy so the caller's dict is not modified and never ends up holding the API key\n",
    "  query_params = dict(params, indent=True, key=api_key, limit=1)\n",
    "  url = f'{url_kg_api}?{urllib.parse.urlencode(query_params)}'\n",
    "\n",
    "  try:\n",
    "    # context manager closes the HTTP response even if read() raises\n",
    "    with urllib.request.urlopen(url) as http_response:\n",
    "      json_response = http_response.read()\n",
    "  except Exception as e:\n",
    "    # request failed: record why and leave 'accessed' untouched so the record can be retried\n",
    "    result_obj['error'] = str(e)\n",
    "    return result_obj\n",
    "\n",
    "  try:\n",
    "    response = json.loads(json_response)\n",
    "    items = response.get('itemListElement', [])\n",
    "    result_obj['accessed'] = True\n",
    "    result_obj.pop('error', None)  # drop the message left by an earlier failed attempt\n",
    "    if items:\n",
    "      item = items[0]\n",
    "      item_result = item.get('result', {})\n",
    "      result_obj['description'] = item_result.get('description', '')\n",
    "      # keep a kg_id the caller already supplied (lookup by id); fill it in otherwise\n",
    "      if not result_obj.get('kg_id'):\n",
    "        result_obj['kg_id'] = item_result.get('@id', '').replace('kg:', '')\n",
    "      # a missing detailed description yields empty strings for all three fields\n",
    "      det_desc = item_result.get('detailedDescription', {})\n",
    "      result_obj['description_extended'] = det_desc.get('articleBody', '')\n",
    "      result_obj['description_license'] = det_desc.get('license', '')\n",
    "      result_obj['description_url'] = det_desc.get('url', '')\n",
    "      # 'image_url' is only added when the hit carries an image\n",
    "      result_img = item_result.get('image', {})\n",
    "      if result_img:\n",
    "        result_obj['image_url'] = result_img.get('contentUrl', '')\n",
    "      result_obj['name'] = item_result.get('name', '')\n",
    "      result_obj['score'] = item.get('resultScore', 0.0)\n",
    "      result_obj['url'] = item_result.get('url', '')\n",
    "  except Exception as e:\n",
    "    # malformed or unexpected response body: keep what was parsed and record the reason\n",
    "    result_obj['error'] = str(e)\n",
    "  return result_obj\n",
    "  \n",
    "def get_kg_from_name(obj):\n",
    "  \"\"\"Look up a person record by its 'query' name; records already marked 'accessed' are returned as-is.\"\"\"\n",
    "  if not obj['accessed']:\n",
    "    return _get_kg_meta(obj, {'query': obj['query']})\n",
    "  return obj\n",
    "  \n",
    "def get_kg_from_kg_id(obj):\n",
    "  \"\"\"Look up a person record by its 'kg_id'; records already marked 'accessed' are returned as-is.\"\"\"\n",
    "  if not obj['accessed']:\n",
    "    return _get_kg_meta(obj, {'ids': obj['kg_id']})\n",
    "  return obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'accessed': True,\n",
      " 'description': 'American singer',\n",
      " 'description_extended': 'Taylor Alison Swift is an American '\n",
      "                         \"singer-songwriter. As one of the world's leading \"\n",
      "                         'contemporary recording artists, she is known for '\n",
      "                         'narrative songs about her personal life, which has '\n",
      "                         'received widespread media coverage.\\n',\n",
      " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
      " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n",
      " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n",
      " 'kg_id': '/m/0dl567',\n",
      " 'name': 'Taylor Swift',\n",
      " 'query': 'Taylor Swift',\n",
      " 'score': 1241.476318,\n",
      " 'url': 'http://taylorswift.com/'}\n"
     ]
    }
   ],
   "source": [
    "# smoke test: look up one well-known name to check that the API responds\n",
    "obj = dict(query='Taylor Swift', kg_id='', score=0.0, description='', url='', accessed=False)  # default\n",
    "result = get_kg_from_name(obj)\n",
    "pprint(obj)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make a test query by KG id (the id is the one the name lookup returned for 'Taylor Swift')\n",
    "obj = {'query': 'Taylor Swift', 'kg_id': '/m/0dl567', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}  # default\n",
    "result = get_kg_from_kg_id(obj)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build mapped_person objects: one default (not yet fetched) record per distinct name\n",
    "df_person_groups = df_meta_imdb.groupby('name')\n",
    "mapped_persons = [\n",
    "  {'query': group_name, 'kg_id': '', 'score': 0.0, 'description': '', 'url': '', 'accessed': False}\n",
    "  for group_name, _ in df_person_groups  # only the group key (the name) is needed\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# worker function handed to ThreadPool.map\n",
    "def pool_map_persons(obj):\n",
    "  \"\"\"Advance the shared notebook-level progress bar `pbar`, then fetch KG metadata for one person record.\"\"\"\n",
    "  pbar.update(1)\n",
    "  return get_kg_from_name(obj)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "87f6a2be42284199b8a67458f4090497",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=20284), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0/20284 remaining\n"
     ]
    }
   ],
   "source": [
    "num_threads = 2\n",
    "retry_wait_secs = 60 * 20  # pause between passes; presumably lets the API rate limit reset - TODO confirm\n",
    "\n",
    "num_non_accessed = sum(1 for x in mapped_persons if not x['accessed'])\n",
    "print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
    "\n",
    "# keep making passes until every record is marked 'accessed'; records that are\n",
    "# already done are returned untouched by get_kg_from_name\n",
    "while num_non_accessed > 0:\n",
    "  # `pbar` has to be a notebook-level name because pool_map_persons updates it.\n",
    "  # Both context managers clean up (worker threads, progress bar) after every pass.\n",
    "  with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:\n",
    "    mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
    "\n",
    "  num_non_accessed = sum(1 for x in mapped_persons if not x['accessed'])\n",
    "  if num_non_accessed > 0:\n",
    "    print(f'{num_non_accessed}/{len(mapped_persons)} remaining. Sleeping...')\n",
    "    time.sleep(retry_wait_secs)  # wait before the next pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'query': \"'Lee' George Quinones\", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. He is one of several artists to gain fame from the New York City Subway graffiti movement.\\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee Quiñones'}\n"
     ]
    }
   ],
   "source": [
    "# spot-check the first mapped record\n",
    "sample_person = mapped_persons[0]\n",
    "print(sample_person)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose\n",
    "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
    "cc_short = 'CC BY-SA 3.0'\n",
    "nchanged = 0\n",
    "for mapped_person in mapped_persons:\n",
    "  # .get(): records without an API hit have no 'description_license' key\n",
    "  if mapped_person.get('description_license') == cc_long:\n",
    "    mapped_person['description_license'] = cc_short\n",
    "    nchanged += 1\n",
    "print(nchanged)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "# find number not accessed\n",
    "n_empty = 0\n",
    "for mapped_person in mapped_persons:\n",
    "  if not mapped_person.get('accessed', False):\n",
    "    n_empty += 1\n",
    "    # show the query name: 'kg_id' is still empty for records that were never fetched\n",
    "    print(mapped_person['query'])\n",
    "print(n_empty)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# turn the list of person records into a table, one row per person\n",
    "df_mapped_persons = pd.DataFrame(mapped_persons)\n",
    "df_mapped_persons.index.name = 'index'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# preview the first rows of the mapped table\n",
    "df_mapped_persons.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the full mapping to disk as UTF-16 encoded CSV\n",
    "fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'\n",
    "df_mapped_persons.to_csv(fp_out, encoding='utf-16')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "# also write a reduced file holding only the first `limit` persons\n",
    "limit = 1000\n",
    "fpp_out = Path(fp_out)\n",
    "fp_out_sm = str(fpp_out.with_name(f'{fpp_out.stem}_0_{limit}.csv'))\n",
    "df_mapped_persons_sm = pd.DataFrame(mapped_persons[:limit])\n",
    "df_mapped_persons_sm.index.name = 'index'\n",
    "df_mapped_persons_sm.to_csv(fp_out_sm, encoding='utf-16')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:megapixels]",
   "language": "python",
   "name": "conda-env-megapixels-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}