summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-01-22 13:42:56 +0100
committeradamhrv <adam@ahprojects.com>2019-01-22 13:42:56 +0100
commitb0b06be0defe97ef19cf4d0f3328db40d299e110 (patch)
tree5d2388d716c8bba11380728bd88158116861d630 /megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb
parentad1f5d63198915c1902694edfb65705a9646a2f0 (diff)
add kg nb
Diffstat (limited to 'megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb')
-rw-r--r--megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb468
1 files changed, 468 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb b/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb
new file mode 100644
index 00000000..b9a77fda
--- /dev/null
+++ b/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb
@@ -0,0 +1,468 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# IMDB-WIKI Knowledge Graph"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "import random\n",
+ "import math\n",
+ "from datetime import datetime\n",
+ "import requests\n",
+ "import json\n",
+ "import urllib\n",
+ "\n",
+ "import cv2 as cv\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "%reload_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_meta = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_wiki.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_meta = pd.read_csv(fp_meta).set_index('index')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>celeb_id</th>\n",
+ " <th>dob</th>\n",
+ " <th>filepath</th>\n",
+ " <th>gender</th>\n",
+ " <th>name</th>\n",
+ " <th>x1</th>\n",
+ " <th>x2</th>\n",
+ " <th>y1</th>\n",
+ " <th>y2</th>\n",
+ " <th>year_photo</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>6488</td>\n",
+ " <td>1900-5-11</td>\n",
+ " <td>01/nm0000001_rm124825600_1899-5-10_1968.jpg</td>\n",
+ " <td>m</td>\n",
+ " <td>Fred Astaire</td>\n",
+ " <td>1072.926000</td>\n",
+ " <td>1214.784000</td>\n",
+ " <td>161.838000</td>\n",
+ " <td>303.696000</td>\n",
+ " <td>1968</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>6488</td>\n",
+ " <td>1900-5-11</td>\n",
+ " <td>01/nm0000001_rm3343756032_1899-5-10_1970.jpg</td>\n",
+ " <td>m</td>\n",
+ " <td>Fred Astaire</td>\n",
+ " <td>477.184000</td>\n",
+ " <td>622.592000</td>\n",
+ " <td>100.352000</td>\n",
+ " <td>245.760000</td>\n",
+ " <td>1970</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>6488</td>\n",
+ " <td>1900-5-11</td>\n",
+ " <td>01/nm0000001_rm577153792_1899-5-10_1968.jpg</td>\n",
+ " <td>m</td>\n",
+ " <td>Fred Astaire</td>\n",
+ " <td>114.969643</td>\n",
+ " <td>451.686572</td>\n",
+ " <td>114.969643</td>\n",
+ " <td>451.686572</td>\n",
+ " <td>1968</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>6488</td>\n",
+ " <td>1900-5-11</td>\n",
+ " <td>01/nm0000001_rm946909184_1899-5-10_1968.jpg</td>\n",
+ " <td>m</td>\n",
+ " <td>Fred Astaire</td>\n",
+ " <td>622.885506</td>\n",
+ " <td>844.339008</td>\n",
+ " <td>424.217504</td>\n",
+ " <td>645.671006</td>\n",
+ " <td>1968</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>6488</td>\n",
+ " <td>1900-5-11</td>\n",
+ " <td>01/nm0000001_rm980463616_1899-5-10_1968.jpg</td>\n",
+ " <td>m</td>\n",
+ " <td>Fred Astaire</td>\n",
+ " <td>1013.859002</td>\n",
+ " <td>1201.586128</td>\n",
+ " <td>233.882042</td>\n",
+ " <td>421.609168</td>\n",
+ " <td>1968</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " celeb_id dob filepath \\\n",
+ "index \n",
+ "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg \n",
+ "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg \n",
+ "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg \n",
+ "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg \n",
+ "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg \n",
+ "\n",
+ " gender name x1 x2 y1 y2 \\\n",
+ "index \n",
+ "0 m Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 \n",
+ "1 m Fred Astaire 477.184000 622.592000 100.352000 245.760000 \n",
+ "2 m Fred Astaire 114.969643 451.686572 114.969643 451.686572 \n",
+ "3 m Fred Astaire 622.885506 844.339008 424.217504 645.671006 \n",
+ "4 m Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 \n",
+ "\n",
+ " year_photo \n",
+ "index \n",
+ "0 1968 \n",
+ "1 1970 \n",
+ "2 1968 \n",
+ "3 1968 \n",
+ "4 1968 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_meta.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# FIXME: `ids` is not defined anywhere in this notebook, so evaluating it raised NameError on a fresh kernel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the Knowledge Graph API key: close the file promptly and drop any trailing newline\n",
+ "with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_key:\n",
+ "    api_key = fp_key.read().strip()\n",
+ "\n",
+ "def get_knowledge(q, api_key):\n",
+ "    \"\"\"Search the Google Knowledge Graph and return the top-ranked entity.\n",
+ "\n",
+ "    :param q: free-text query string\n",
+ "    :param api_key: Knowledge Graph Search API key\n",
+ "    :returns: the first item's 'result' dict with its 'resultScore' copied in under 'score',\n",
+ "        or an empty list when the response contains no items\n",
+ "    \"\"\"\n",
+ "    # a bare `import urllib` does not guarantee that these submodules are loaded\n",
+ "    import urllib.parse\n",
+ "    import urllib.request\n",
+ "\n",
+ "    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n",
+ "    params = {\n",
+ "        'query': q,\n",
+ "        'limit': 5,\n",
+ "        'indent': True,\n",
+ "        'key': api_key,\n",
+ "    }\n",
+ "    url = service_url + '?' + urllib.parse.urlencode(params)  # TODO: use requests\n",
+ "    # the context manager closes the HTTP response once the body has been read\n",
+ "    with urllib.request.urlopen(url) as resp:\n",
+ "        response = json.loads(resp.read())\n",
+ "    items = response.get('itemListElement', [])\n",
+ "    if not items:\n",
+ "        return []\n",
+ "    top = items[0]\n",
+ "    # the default must be a dict so that the 'score' key can be assigned below\n",
+ "    result = top.get('result', {})\n",
+ "    result['score'] = top['resultScore']\n",
+ "    return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "General Secretary of the Communist Party of China\n",
+ "Xi Jinping\n"
+ ]
+ },
+ {
+ "ename": "KeyError",
+ "evalue": "'url'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-106-654588fe3a11>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'description'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'url'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'score'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyError\u001b[0m: 'url'"
+ ]
+ }
+ ],
+ "source": [
+ "# test\n",
+ "q = 'Xi Jinping'\n",
+ "r = get_knowledge(q, api_key)\n",
+ "# not every entity carries all of these keys (this one has no top-level 'url'),\n",
+ "# so read them with .get() and print None for the missing ones instead of raising KeyError\n",
+ "for key in ('description', 'name', 'url', 'score'):\n",
+ "    print(r.get(key))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pprint import pprint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "kg:/m/06ff60\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(r['@id'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'@id': 'kg:/g/11f4ksbzcm',\n",
+ " '@type': ['Thing', 'Event'],\n",
+ " 'detailedDescription': {'articleBody': 'On February 14, 2018, a gunman opened '\n",
+ " 'fire at Marjory Stoneman Douglas High '\n",
+ " 'School in Parkland, Florida, killing '\n",
+ " 'seventeen students and staff members '\n",
+ " 'and injuring seventeen others. ',\n",
+ " 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+ " 'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n",
+ " 'image': {'contentUrl': 'http://t1.gstatic.com/images?q=tbn:ANd9GcQmY7VqmGt4zEJU8Rc4EwPWroYd-L0QQ5wkZfiFO-WRqNBC-FPN',\n",
+ " 'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n",
+ " 'name': 'Stoneman Douglas High School shooting',\n",
+ " 'score': 60.411652}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pprint(r)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Turn each entry name under dir_msceleb into a '/'-prefixed id, replacing every '.' with '/'\n",
+ "kgs_msceleb = ['/' + name.replace('.', '/') for name in os.listdir(dir_msceleb)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 109,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "'/m/06ff60' in kgs_msceleb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_kg_by_id(kg_id, api_key):\n",
+ "    \"\"\"Look up a single Google Knowledge Graph entity by id.\n",
+ "\n",
+ "    :param kg_id: entity id, sent as the API's 'ids' parameter\n",
+ "    :param api_key: Knowledge Graph Search API key\n",
+ "    :returns: the first item's 'result' dict with its 'resultScore' copied in under 'score',\n",
+ "        or an empty list when nothing matched or the request failed (a warning is\n",
+ "        emitted in the failure case)\n",
+ "    \"\"\"\n",
+ "    # a bare `import urllib` does not guarantee that these submodules are loaded\n",
+ "    import urllib.parse\n",
+ "    import urllib.request\n",
+ "    import warnings\n",
+ "\n",
+ "    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n",
+ "    params = {\n",
+ "        'ids': kg_id,\n",
+ "        'limit': 1,\n",
+ "        'indent': True,\n",
+ "        'key': api_key,\n",
+ "    }\n",
+ "    url = service_url + '?' + urllib.parse.urlencode(params)  # TODO: use requests\n",
+ "    try:\n",
+ "        # the context manager closes the HTTP response once the body has been read\n",
+ "        with urllib.request.urlopen(url) as resp:\n",
+ "            response = json.loads(resp.read())\n",
+ "        items = response.get('itemListElement', [])\n",
+ "        if not items:\n",
+ "            # a valid response without a match is not an error\n",
+ "            return []\n",
+ "        top = items[0]\n",
+ "        # the default must be a dict so that the 'score' key can be assigned below\n",
+ "        result = top.get('result', {})\n",
+ "        result['score'] = top['resultScore']\n",
+ "        return result\n",
+ "    except Exception as e:\n",
+ "        # best-effort lookup: still return [] but surface the failure (bad key, quota, network, ...)\n",
+ "        warnings.warn('get_kg_by_id({!r}) failed: {!r}'.format(kg_id, e))\n",
+ "        return []"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 122,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "a = get_kg_by_id('/m/0100n5bs', api_key)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 123,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 123,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "a"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}