3 files changed, 518 insertions, 634 deletions
diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb b/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb
index 648fb9ac..1bf7b590 100644
--- a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb
+++ b/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 06: Face pose dlib/MTCNN"
+    "# IMDB WIKI: Convert .mat to CSVs"
    ]
   },
   {
@@ -42,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -52,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -107,7 +107,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -120,7 +120,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d50c6e22d1694b54815a86d85cda6241",
+       "model_id": "8a4a106e3bee4fde89492ceef50b9c05",
        "version_major": 2,
        "version_minor": 0
       },
@@ -145,7 +145,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -154,7 +154,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -276,7 +276,7 @@
        "4  Fred Astaire  1013.859002  1201.586128  233.882042  421.609168        1968  "
       ]
      },
-     "execution_count": 8,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -286,161 +286,13 @@
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Create DataFrame for metadata"
-   ]
-  },
-  {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_results = pd.DataFrame.from_dict(results_meta)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>celeb_id</th>\n",
-       "      <th>dob</th>\n",
-       "      <th>filepath</th>\n",
-       "      <th>gender</th>\n",
-       "      <th>name</th>\n",
-       "      <th>x1</th>\n",
-       "      <th>x2</th>\n",
-       "      <th>y1</th>\n",
-       "      <th>y2</th>\n",
-       "      <th>year_photo</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm124825600_1899-5-10_1968.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>1072.926000</td>\n",
-       "      <td>1214.784000</td>\n",
-       "      <td>161.838000</td>\n",
-       "      <td>303.696000</td>\n",
-       "      <td>1968</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm3343756032_1899-5-10_1970.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>477.184000</td>\n",
-       "      <td>622.592000</td>\n",
-       "      <td>100.352000</td>\n",
-       "      <td>245.760000</td>\n",
-       "      <td>1970</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm577153792_1899-5-10_1968.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>114.969643</td>\n",
-       "      <td>451.686572</td>\n",
-       "      <td>114.969643</td>\n",
-       "      <td>451.686572</td>\n",
-       "      <td>1968</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm946909184_1899-5-10_1968.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>622.885506</td>\n",
-       "      <td>844.339008</td>\n",
-       "      <td>424.217504</td>\n",
-       "      <td>645.671006</td>\n",
-       "      <td>1968</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm980463616_1899-5-10_1968.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>1013.859002</td>\n",
-       "      <td>1201.586128</td>\n",
-       "      <td>233.882042</td>\n",
-       "      <td>421.609168</td>\n",
-       "      <td>1968</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   celeb_id        dob                                      filepath gender  \\\n",
-       "0      6488  1900-5-11   01/nm0000001_rm124825600_1899-5-10_1968.jpg      m   \n",
-       "1      6488  1900-5-11  01/nm0000001_rm3343756032_1899-5-10_1970.jpg      m   \n",
-       "2      6488  1900-5-11   01/nm0000001_rm577153792_1899-5-10_1968.jpg      m   \n",
-       "3      6488  1900-5-11   01/nm0000001_rm946909184_1899-5-10_1968.jpg      m   \n",
-       "4      6488  1900-5-11   01/nm0000001_rm980463616_1899-5-10_1968.jpg      m   \n",
-       "\n",
-       "           name           x1           x2          y1          y2  year_photo  \n",
-       "0  Fred Astaire  1072.926000  1214.784000  161.838000  303.696000        1968  \n",
-       "1  Fred Astaire   477.184000   622.592000  100.352000  245.760000        1970  \n",
-       "2  Fred Astaire   114.969643   451.686572  114.969643  451.686572        1968  \n",
-       "3  Fred Astaire   622.885506   844.339008  424.217504  645.671006        1968  \n",
-       "4  Fred Astaire  1013.859002  1201.586128  233.882042  421.609168        1968  "
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_results.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_results.to_csv(join(dir_out,'imdb_wiki.csv'), index=False)"
+    "df_meta.index.name = 'index'\n",
+    "df_meta.to_csv(join(dir_out,'imdb_mat.csv'))"
    ]
   },
   {
@@ -452,16 +304,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_name_groups = df_results.groupby('name')\n",
+    "# count images per person and save to CSV\n",
+    "df_name_groups = df_meta.groupby('name')\n",
     "images_per_person = []\n",
     "for name, df_name in df_name_groups:\n",
     "  images_per_person.append({'name': name, 'num_images': len(df_name)})\n",
     "df_images_per_person = pd.DataFrame.from_dict(images_per_person)\n",
-    "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'), index=False)"
+    "df_images_per_person.index.name = 'index'\n",
+    "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'))"
    ]
   },
   {
@@ -473,7 +327,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -482,7 +336,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -491,7 +345,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
diff --git a/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb b/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb
new file mode 100644
index 00000000..40d7bd86
--- /dev/null
+++ b/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb
@@ -0,0 +1,498 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# IMDB-WIKI Knowledge Graph\n",
+    "\n",
+    "- convert names to Knowledge Graph entity IDs\n",
+    "- The `imdb.mat` file contains only full names, need KG ids `/m/12345`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import os\n",
+    "import os.path as osp\n",
+    "from os.path import join\n",
+    "from glob import glob\n",
+    "from pathlib import Path\n",
+    "import random\n",
+    "import math\n",
+    "from datetime import datetime\n",
+    "import requests\n",
+    "import json\n",
+    "import time\n",
+    "from pprint import pprint\n",
+    "from multiprocessing.pool import ThreadPool\n",
+    "import threading\n",
+    "import urllib.request\n",
+    "\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pandas as pd\n",
+    "from scipy.io import loadmat\n",
+    "import numpy as np\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load IMDB Metadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>celeb_id</th>\n",
+       "      <th>dob</th>\n",
+       "      <th>filepath</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>name</th>\n",
+       "      <th>x1</th>\n",
+       "      <th>x2</th>\n",
+       "      <th>y1</th>\n",
+       "      <th>y2</th>\n",
+       "      <th>year_photo</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>index</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>6488</td>\n",
+       "      <td>1900-5-11</td>\n",
+       "      <td>01/nm0000001_rm124825600_1899-5-10_1968.jpg</td>\n",
+       "      <td>m</td>\n",
+       "      <td>Fred Astaire</td>\n",
+       "      <td>1072.926</td>\n",
+       "      <td>1214.784</td>\n",
+       "      <td>161.838</td>\n",
+       "      <td>303.696</td>\n",
+       "      <td>1968</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>6488</td>\n",
+       "      <td>1900-5-11</td>\n",
+       "      <td>01/nm0000001_rm3343756032_1899-5-10_1970.jpg</td>\n",
+       "      <td>m</td>\n",
+       "      <td>Fred Astaire</td>\n",
+       "      <td>477.184</td>\n",
+       "      <td>622.592</td>\n",
+       "      <td>100.352</td>\n",
+       "      <td>245.760</td>\n",
+       "      <td>1970</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       celeb_id        dob                                      filepath  \\\n",
+       "index                                                                      \n",
+       "0          6488  1900-5-11   01/nm0000001_rm124825600_1899-5-10_1968.jpg   \n",
+       "1          6488  1900-5-11  01/nm0000001_rm3343756032_1899-5-10_1970.jpg   \n",
+       "\n",
+       "      gender          name        x1        x2       y1       y2  year_photo  \n",
+       "index                                                                         \n",
+       "0          m  Fred Astaire  1072.926  1214.784  161.838  303.696        1968  \n",
+       "1          m  Fred Astaire   477.184   622.592  100.352  245.760        1970  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fp_meta_imdb = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_mat.csv'\n",
+    "df_meta_imdb = pd.read_csv(fp_meta_imdb, index_col='index')  # use the CSV's 'index' column as row index\n",
+    "df_meta_imdb.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Google Knowledge Graph API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read API key; strip surrounding whitespace (e.g. a trailing newline) so it is not URL-encoded into `key`\n",
+    "api_key = Path('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read_text().strip()\n",
+    "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _get_kg_meta(result_obj, params):\n",
+    "  \"\"\"Query the Knowledge Graph API with `params` and copy the top hit into `result_obj` (mutated and returned).\"\"\"\n",
+    "  global api_key, url_kg_api\n",
+    "  params['indent'] = True\n",
+    "  params['key'] = api_key\n",
+    "  params['limit'] = 1\n",
+    "  # a failed request sets only 'error'; 'accessed' stays False so the lookup can be retried\n",
+    "  url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+    "  try:\n",
+    "    json_response = urllib.request.urlopen(url).read()\n",
+    "  except Exception as e:\n",
+    "    result_obj['error'] = str(e)\n",
+    "  else:\n",
+    "    try:\n",
+    "      response = json.loads(json_response)\n",
+    "      items = response.get('itemListElement', [])\n",
+    "      result_obj['accessed'] = True\n",
+    "      if items:\n",
+    "        item = items[0]\n",
+    "        item_result = item.get('result', {})\n",
+    "        result_obj['description'] = item_result.get('description', '')\n",
+    "        det_desc = item_result.get('detailedDescription', {})\n",
+    "        if not result_obj.get('kg_id'):\n",
+    "          result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
+    "        if det_desc:\n",
+    "          result_obj['description_extended'] = det_desc.get('articleBody','')\n",
+    "          result_obj['description_license'] = det_desc.get('license','')\n",
+    "          result_obj['description_url'] = det_desc.get('url','')\n",
+    "        else:\n",
+    "          result_obj['description_extended'] = ''\n",
+    "          result_obj['description_license'] = ''\n",
+    "          result_obj['description_url'] = ''\n",
+    "        result_img = item_result.get('image', {})\n",
+    "        if result_img:\n",
+    "          result_obj['image_url'] = result_img.get('contentUrl', '')\n",
+    "        result_obj['name'] = item_result.get('name', '')\n",
+    "        result_obj['score'] = item.get('resultScore', 0.0)\n",
+    "        result_obj['url'] = item_result.get('url', '')\n",
+    "    except Exception as e:\n",
+    "      result_obj['error'] = str(e)\n",
+    "  return result_obj\n",
+    "  \n",
+    "def get_kg_from_name(obj):\n",
+    "  \"\"\"Search the KG API by obj['query'] unless obj was already accessed; returns obj.\"\"\"\n",
+    "  if obj['accessed']:\n",
+    "    return obj\n",
+    "  return _get_kg_meta(obj, {'query': obj['query']})\n",
+    "  \n",
+    "def get_kg_from_kg_id(obj):\n",
+    "  \"\"\"Look up obj['kg_id'] via the API `ids` parameter unless obj was already accessed; returns obj.\"\"\"\n",
+    "  if obj['accessed']:\n",
+    "    return obj\n",
+    "  return _get_kg_meta(obj, {'ids': obj['kg_id']})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'accessed': True,\n",
+      " 'description': 'American singer',\n",
+      " 'description_extended': 'Taylor Alison Swift is an American '\n",
+      "                         \"singer-songwriter. As one of the world's leading \"\n",
+      "                         'contemporary recording artists, she is known for '\n",
+      "                         'narrative songs about her personal life, which has '\n",
+      "                         'received widespread media coverage.\\n',\n",
+      " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+      " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n",
+      " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n",
+      " 'kg_id': '/m/0dl567',\n",
+      " 'name': 'Taylor Swift',\n",
+      " 'query': 'Taylor Swift',\n",
+      " 'score': 1241.476318,\n",
+      " 'url': 'http://taylorswift.com/'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# make a test query to check if API works; get_kg_from_name fills obj in place and returns that same dict\n",
+    "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}  # default\n",
+    "result = get_kg_from_name(obj)\n",
+    "pprint(obj)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj = {'query': 'Taylor Swift', 'kg_id': '/m/0dl567', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}  # look up by KG id\n",
+    "result = get_kg_from_kg_id(obj)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build mapped_person objects: one default (not yet accessed) record per unique name\n",
+    "mapped_persons = []\n",
+    "# groupby is used only for its keys (the names); the per-name rows are not read here\n",
+    "df_person_groups = df_meta_imdb.groupby('name')\n",
+    "for group_name, df_name_group in df_person_groups:\n",
+    "  obj = {'query': group_name, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n",
+    "  mapped_persons.append(obj)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# define thread mapping function\n",
+    "def pool_map_persons(obj):\n",
+    "  \"\"\"Thread-pool worker: advance the shared progress bar, then look up one person by name.\"\"\"\n",
+    "  global pbar\n",
+    "  pbar.update(1)\n",
+    "  return get_kg_from_name(obj)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "87f6a2be42284199b8a67458f4090497",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=20284), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0/20284 remaining\n"
+     ]
+    }
+   ],
+   "source": [
+    "num_threads = 2\n",
+    "sleep_seconds = 60 * 20  # pause between passes while some lookups are still missing\n",
+    "\n",
+    "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+    "print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+    "\n",
+    "# query the API in passes until every person record has been accessed\n",
+    "while num_non_accessed > 0:\n",
+    "  print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+    "\n",
+    "  # already-accessed records are returned unchanged by get_kg_from_name, so only missing ones hit the API\n",
+    "  # pool_map_persons updates the global `pbar`, which this cell-level `with` binds for each pass\n",
+    "  # both context managers clean up (worker threads, progress bar) even if map() raises\n",
+    "  with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:\n",
+    "    mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+    "\n",
+    "  # records whose request failed keep accessed=False and are retried in the next pass\n",
+    "  num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+    "  if num_non_accessed > 0:\n",
+    "    print(f'{num_non_accessed}/{len(mapped_persons)} remaining. Sleeping...')\n",
+    "    # wait before starting the next pass\n",
+    "    time.sleep(sleep_seconds)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'query': \"'Lee' George Quinones\", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. He is one of several artists to gain fame from the New York City Subway graffiti movement.\\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee Quiñones'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# test output for a person\n",
+    "print(mapped_persons[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# shorten the CC attribution string. the default string from Google Knowledge Graph is too verbose\n",
+    "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+    "cc_short = 'CC BY-SA 3.0'\n",
+    "nchanged = 0\n",
+    "for mapped_person in mapped_persons:\n",
+    "  # records without a 'description_license' key yield None from .get() and are left untouched\n",
+    "  if mapped_person.get('description_license') == cc_long:\n",
+    "    mapped_person['description_license'] = cc_short\n",
+    "    nchanged += 1\n",
+    "print(nchanged)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# find number not accessed\n",
+    "not_accessed = [p for p in mapped_persons if not p.get('accessed', False)]\n",
+    "# print the kg_id of every record that was never accessed, then the total\n",
+    "for mapped_person in not_accessed:\n",
+    "  print(mapped_person['kg_id'])\n",
+    "n_empty = len(not_accessed)\n",
+    "print(n_empty)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create dataframe for mapped persons\n",
+    "# (one row per person record; the row index is named 'index')\n",
+    "df_mapped_persons = pd.DataFrame(mapped_persons).rename_axis('index')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check output\n",
+    "df_mapped_persons.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save the full identity table as a UTF-16 encoded CSV (the row index is written too)\n",
+    "fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'\n",
+    "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create small version: only the first `limit` persons, written next to fp_out as <stem>_0_<limit>.csv\n",
+    "limit = 1000\n",
+    "fpp_out = Path(fp_out)\n",
+    "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n",
+    "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
+    "df_mapped_persons_sm.index.name = 'index'\n",
+    "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:megapixels]",
+   "language": "python",
+   "name": "conda-env-megapixels-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb b/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb
deleted file mode 100644
index b9a77fda..00000000
--- a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb
+++ /dev/null
@@ -1,468 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# IMDB-WIKI Knowledge Graph"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 110,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import os.path as osp\n",
-    "from os.path import join\n",
-    "from glob import glob\n",
-    "import random\n",
-    "import math\n",
-    "from datetime import datetime\n",
-    "import requests\n",
-    "import json\n",
-    "import urllib\n",
-    "\n",
-    "import cv2 as cv\n",
-    "import pandas as pd\n",
-    "from scipy.io import loadmat\n",
-    "import numpy as np\n",
-    "%matplotlib inline\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "from tqdm import tqdm_notebook as tqdm\n",
-    "%reload_ext autoreload\n",
-    "%autoreload 2"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Load Metadata"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fp_meta = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_wiki.csv'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_meta = pd.read_csv(fp_meta).set_index('index')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>celeb_id</th>\n",
-       "      <th>dob</th>\n",
-       "      <th>filepath</th>\n",
-       "      <th>gender</th>\n",
-       "      <th>name</th>\n",
-       "      <th>x1</th>\n",
-       "      <th>x2</th>\n",
-       "      <th>y1</th>\n",
-       "      <th>y2</th>\n",
-       "      <th>year_photo</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>index</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm124825600_1899-5-10_1968.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>1072.926000</td>\n",
-       "      <td>1214.784000</td>\n",
-       "      <td>161.838000</td>\n",
-       "      <td>303.696000</td>\n",
-       "      <td>1968</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm3343756032_1899-5-10_1970.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>477.184000</td>\n",
-       "      <td>622.592000</td>\n",
-       "      <td>100.352000</td>\n",
-       "      <td>245.760000</td>\n",
-       "      <td>1970</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm577153792_1899-5-10_1968.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>114.969643</td>\n",
-       "      <td>451.686572</td>\n",
-       "      <td>114.969643</td>\n",
-       "      <td>451.686572</td>\n",
-       "      <td>1968</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm946909184_1899-5-10_1968.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>622.885506</td>\n",
-       "      <td>844.339008</td>\n",
-       "      <td>424.217504</td>\n",
-       "      <td>645.671006</td>\n",
-       "      <td>1968</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>6488</td>\n",
-       "      <td>1900-5-11</td>\n",
-       "      <td>01/nm0000001_rm980463616_1899-5-10_1968.jpg</td>\n",
-       "      <td>m</td>\n",
-       "      <td>Fred Astaire</td>\n",
-       "      <td>1013.859002</td>\n",
-       "      <td>1201.586128</td>\n",
-       "      <td>233.882042</td>\n",
-       "      <td>421.609168</td>\n",
-       "      <td>1968</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       celeb_id        dob                                      filepath  \\\n",
-       "index                                                                      \n",
-       "0          6488  1900-5-11   01/nm0000001_rm124825600_1899-5-10_1968.jpg   \n",
-       "1          6488  1900-5-11  01/nm0000001_rm3343756032_1899-5-10_1970.jpg   \n",
-       "2          6488  1900-5-11   01/nm0000001_rm577153792_1899-5-10_1968.jpg   \n",
-       "3          6488  1900-5-11   01/nm0000001_rm946909184_1899-5-10_1968.jpg   \n",
-       "4          6488  1900-5-11   01/nm0000001_rm980463616_1899-5-10_1968.jpg   \n",
-       "\n",
-       "      gender          name           x1           x2          y1          y2  \\\n",
-       "index                                                                          \n",
-       "0          m  Fred Astaire  1072.926000  1214.784000  161.838000  303.696000   \n",
-       "1          m  Fred Astaire   477.184000   622.592000  100.352000  245.760000   \n",
-       "2          m  Fred Astaire   114.969643   451.686572  114.969643  451.686572   \n",
-       "3          m  Fred Astaire   622.885506   844.339008  424.217504  645.671006   \n",
-       "4          m  Fred Astaire  1013.859002  1201.586128  233.882042  421.609168   \n",
-       "\n",
-       "       year_photo  \n",
-       "index              \n",
-       "0            1968  \n",
-       "1            1970  \n",
-       "2            1968  \n",
-       "3            1968  \n",
-       "4            1968  "
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_meta.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ids"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n",
-    "\n",
-    "def get_knowledge(q, api_key):\n",
-    "  service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n",
-    "  params = {\n",
-    "      'query': q,\n",
-    "      'limit': 5,\n",
-    "      'indent': True,\n",
-    "      'key': api_key,\n",
-    "      }\n",
-    "  url = service_url + '?' + urllib.parse.urlencode(params)  # TODO: use requests\n",
-    "  response = json.loads(urllib.request.urlopen(url).read())\n",
-    "  response = response.get('itemListElement', [])\n",
-    "  if len(response) > 0:\n",
-    "    result = response[0].get('result', [])\n",
-    "    result['score'] = response[0]['resultScore']\n",
-    "    return result\n",
-    "  else:\n",
-    "    return []"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 106,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "General Secretary of the Communist Party of China\n",
-      "Xi Jinping\n"
-     ]
-    },
-    {
-     "ename": "KeyError",
-     "evalue": "'url'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                           Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-106-654588fe3a11>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'description'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'url'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'score'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mKeyError\u001b[0m: 'url'"
-     ]
-    }
-   ],
-   "source": [
-    "# test\n",
-    "q = 'Xi Jinping'\n",
-    "r = get_knowledge(q, api_key)\n",
-    "print(r['description'])\n",
-    "print(r['name'])\n",
-    "print(r['url'])\n",
-    "print(r['score'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 107,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pprint import pprint"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 108,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "kg:/m/06ff60\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(r['@id'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 89,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'@id': 'kg:/g/11f4ksbzcm',\n",
-      " '@type': ['Thing', 'Event'],\n",
-      " 'detailedDescription': {'articleBody': 'On February 14, 2018, a gunman opened '\n",
-      "                                        'fire at Marjory Stoneman Douglas High '\n",
-      "                                        'School in Parkland, Florida, killing '\n",
-      "                                        'seventeen students and staff members '\n",
-      "                                        'and injuring seventeen others. ',\n",
-      "                         'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
-      "                         'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n",
-      " 'image': {'contentUrl': 'http://t1.gstatic.com/images?q=tbn:ANd9GcQmY7VqmGt4zEJU8Rc4EwPWroYd-L0QQ5wkZfiFO-WRqNBC-FPN',\n",
-      "           'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n",
-      " 'name': 'Stoneman Douglas High School shooting',\n",
-      " 'score': 60.411652}\n"
-     ]
-    }
-   ],
-   "source": [
-    "pprint(r)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 64,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "kgs_msceleb = os.listdir(dir_msceleb)\n",
-    "kgs_msceleb = ['/' + x.replace('.','/') for x in kgs_msceleb]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 109,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 109,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "'/m/06ff60' in kgs_msceleb"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 111,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_kg_by_id(kg_id, api_key):\n",
-    "  service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n",
-    "  params = {\n",
-    "      'ids': kg_id,\n",
-    "      'limit': 1,\n",
-    "      'indent': True,\n",
-    "      'key': api_key,\n",
-    "      }\n",
-    "  url = service_url + '?' + urllib.parse.urlencode(params)  # TODO: use requests\n",
-    "  try:\n",
-    "    response = json.loads(urllib.request.urlopen(url).read())\n",
-    "    response = response.get('itemListElement', [])\n",
-    "    result = response[0].get('result', [])\n",
-    "    result['score'] = response[0]['resultScore']\n",
-    "    return result\n",
-    "  except Exception as e:\n",
-    "    return []"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 122,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "a = get_kg_by_id('/m/0100n5bs', api_key)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 123,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[]"
-      ]
-     },
-     "execution_count": 123,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "a"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python [conda env:megapixels]",
-   "language": "python",
-   "name": "conda-env-megapixels-py"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}