summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/vgg_face2/identity.ipynb
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-02-12 15:18:46 +0100
committeradamhrv <adam@ahprojects.com>2019-02-12 15:18:46 +0100
commita5bdab8e798fcdc7885cfdabb0e5dd8076fa1d40 (patch)
tree1e7a45a8d2c746994584cc5f8e4ccdabad82f8d8 /megapixels/notebooks/datasets/vgg_face2/identity.ipynb
parente95455a8a4013dafdeb7e41cfa8fb1f3ccc28dbb (diff)
reorder nbs
Diffstat (limited to 'megapixels/notebooks/datasets/vgg_face2/identity.ipynb')
-rw-r--r--megapixels/notebooks/datasets/vgg_face2/identity.ipynb439
1 files changed, 0 insertions, 439 deletions
diff --git a/megapixels/notebooks/datasets/vgg_face2/identity.ipynb b/megapixels/notebooks/datasets/vgg_face2/identity.ipynb
deleted file mode 100644
index 66eeeb90..00000000
--- a/megapixels/notebooks/datasets/vgg_face2/identity.ipynb
+++ /dev/null
@@ -1,439 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# UMD Faces Knowledge Graph Identities\n",
- "\n",
- "- convert filename-names to names\n",
- "- fetch Google Knowledge Graph entity IDs for each name\n",
- "- save KG IDs to CSV"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "%reload_ext autoreload\n",
- "%autoreload 2\n",
- "\n",
- "import os\n",
- "import os.path as osp\n",
- "from os.path import join\n",
- "from glob import glob\n",
- "import random\n",
- "import math\n",
- "from datetime import datetime\n",
- "import requests\n",
- "import json\n",
- "import time\n",
- "from pprint import pprint\n",
- "from multiprocessing.pool import ThreadPool\n",
- "import threading\n",
- "import urllib.request\n",
- "\n",
- "from tqdm import tqdm_notebook as tqdm\n",
- "import pandas as pd\n",
- "from scipy.io import loadmat\n",
- "import numpy as np\n",
- "%matplotlib inline\n",
- "import matplotlib.pyplot as plt"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Load UMD Faces Filenames"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [],
- "source": [
- "fp_filenames = '/data_store_hdd/datasets/people/umd_faces/downloads/filenames.txt'\n",
- "with open(fp_filenames, 'r') as fp:\n",
- "    filenames = list(fp)  # one raw line per entry, first line included\n",
- "_ = filenames.pop(0)  # drop the first line (presumably a header - confirm against the file)\n",
- "filenames = [raw.replace('_', ' ').strip() for raw in filenames]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "aaron rodgers\n"
- ]
- }
- ],
- "source": [
- "print(filenames[0])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Google Knowledge Graph API"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key') as fp_api_key:  # read API key; `with` closes the handle\n",
- "    api_key = fp_api_key.read().strip()  # a trailing newline would otherwise be url-encoded into the `key` parameter\n",
- "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "def _get_kg_meta(result_obj, params):\n",
- "    \"\"\"Query the Knowledge Graph Search API and copy its top hit into `result_obj`.\n",
- "\n",
- "    `params` holds the search criteria ('query' or 'ids'); `result_obj` is updated in\n",
- "    place and returned. A failed request is recorded under 'error' and leaves\n",
- "    'accessed' unchanged; a response that parses as JSON sets 'accessed' to True.\n",
- "    \"\"\"\n",
- "    # build the request on a copy so the API key is not written into the caller's dict\n",
- "    request_params = dict(params, indent=True, key=api_key, limit=1)\n",
- "    url = f'{url_kg_api}?{urllib.parse.urlencode(request_params)}'\n",
- "    try:\n",
- "        with urllib.request.urlopen(url) as http_response:  # closes the connection when done\n",
- "            json_response = http_response.read()\n",
- "    except Exception as e:\n",
- "        # store the failure on this record; the name `result` is not defined in this function\n",
- "        result_obj['error'] = str(e)\n",
- "        return result_obj\n",
- "    try:\n",
- "        response = json.loads(json_response)\n",
- "        items = response.get('itemListElement', [])\n",
- "        # mark as answered even when there are no hits, so callers do not query this record again\n",
- "        result_obj['accessed'] = True\n",
- "        if items:\n",
- "            item = items[0]\n",
- "            item_result = item.get('result') or {}  # dict fallback keeps the .get() calls below safe\n",
- "            det_desc = item_result.get('detailedDescription') or {}\n",
- "            result_img = item_result.get('image') or {}\n",
- "            result_obj['description'] = item_result.get('description', '')\n",
- "            if not result_obj.get('kg_id'):  # keep an ID the caller already supplied\n",
- "                result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
- "            result_obj['description_extended'] = det_desc.get('articleBody','')\n",
- "            result_obj['description_license'] = det_desc.get('license','')\n",
- "            result_obj['description_url'] = det_desc.get('url','')\n",
- "            if result_img:\n",
- "                result_obj['image_url'] = result_img.get('contentUrl', '')\n",
- "            result_obj['name'] = item_result.get('name', '')\n",
- "            result_obj['score'] = item.get('resultScore', 0.0)\n",
- "            result_obj['url'] = item_result.get('url', '')\n",
- "    except Exception as e:\n",
- "        result_obj['error'] = str(e)\n",
- "    return result_obj\n",
- " \n",
- "def get_kg_from_name(obj):\n",
- "    \"\"\"Look `obj` up by its 'query' text; a record already marked 'accessed' is returned as is.\"\"\"\n",
- "    if not obj['accessed']:\n",
- "        return _get_kg_meta(obj, {'query': obj['query']})\n",
- "    return obj\n",
- " \n",
- "def get_kg_from_kg_id(obj):\n",
- "    \"\"\"Look `obj` up by its 'kg_id'; a record already marked 'accessed' is returned as is.\"\"\"\n",
- "    if not obj['accessed']:\n",
- "        return _get_kg_meta(obj, {'ids': obj['kg_id']})\n",
- "    return obj"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'accessed': True,\n",
- " 'description': 'American singer',\n",
- " 'description_extended': 'Taylor Alison Swift is an American '\n",
- " \"singer-songwriter. As one of the world's leading \"\n",
- " 'contemporary recording artists, she is known for '\n",
- " 'narrative songs about her personal life, which has '\n",
- " 'received widespread media coverage.\\n',\n",
- " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
- " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n",
- " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n",
- " 'kg_id': '/m/0dl567',\n",
- " 'name': 'Taylor Swift',\n",
- " 'query': 'Taylor Swift',\n",
- " 'score': 1241.476318,\n",
- " 'url': 'http://taylorswift.com/'}\n"
- ]
- }
- ],
- "source": [
- "# test get from name\n",
- "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
- "result = get_kg_from_name(obj)\n",
- "pprint(obj)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "# worker function mapped over the thread pool\n",
- "def pool_map_persons(obj):\n",
- "    \"\"\"Advance the shared `pbar` by one, then return the name lookup result for `obj`.\"\"\"\n",
- "    global pbar  # progress bar bound at notebook level by the cell that runs the pool\n",
- "    pbar.update(1)\n",
- "    return get_kg_from_name(obj)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [],
- "source": [
- "# one default record per name; 'accessed' is False until the API has answered\n",
- "mapped_persons = [\n",
- "    {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n",
- "    for fn in filenames\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "3107\n",
- "['aaron rodgers', 'aaron ruell', 'aaron staton', 'abel ferrara', 'abigail klein', 'abraham benrubi', 'abyshamble', 'adabel guerrero', 'adam ant', 'adam buxton']\n"
- ]
- }
- ],
- "source": [
- "print(len(mapped_persons))\n",
- "print(filenames[0:10])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "4752a8e0280e4a58843a21401d9ed649",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1102/3107 remaining\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "882c60006b0d4a9e809297bbc1e86807",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "num_threads = 20\n",
- "retry_wait_secs = 60*20  # pause between passes over the records that are still unanswered\n",
- "\n",
- "# records the API has not answered yet; a failed request leaves 'accessed' False\n",
- "pending = [x for x in mapped_persons if not x['accessed']]\n",
- "num_non_accessed = len(pending)\n",
- "\n",
- "while num_non_accessed > 0:\n",
- "    print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
- "\n",
- "    # one pool and one progress bar per pass; the `with` closes both, so worker threads\n",
- "    # and stale progress widgets do not pile up across retries\n",
- "    with ThreadPool(num_threads) as pool, tqdm(total=len(pending)) as pbar:\n",
- "        # lookups write into the record dicts in place, so the mapped return value is not needed\n",
- "        pool.map(pool_map_persons, pending)\n",
- "\n",
- "    pending = [x for x in pending if not x['accessed']]\n",
- "    num_non_accessed = len(pending)\n",
- "    if num_non_accessed > 0:\n",
- "        print(f'{num_non_accessed} remaining. Sleeping...')\n",
- "        time.sleep(retry_wait_secs)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'query': \"'Lee' George Quinones\", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. He is one of several artists to gain fame from the New York City Subway graffiti movement.\\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee Quiñones'}\n"
- ]
- }
- ],
- "source": [
- "# test output for a person\n",
- "print(mapped_persons[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# shorten the CC attribution string: the default string from Google Knowledge Graph is too verbose\n",
- "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
- "cc_short = 'CC BY-SA 3.0'\n",
- "nchanged = 0\n",
- "for mapped_person in mapped_persons:\n",
- "    # compare directly rather than binding a temp named `license`, which would shadow the builtin\n",
- "    if mapped_person.get('description_license') == cc_long:\n",
- "        mapped_person['description_license'] = cc_short\n",
- "        nchanged += 1\n",
- "print(nchanged)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# list and count the records the API never answered\n",
- "# NOTE(review): records built from names start with kg_id '' - printing 'query' may be more useful; confirm\n",
- "not_accessed = [p for p in mapped_persons if not p.get('accessed', False)]\n",
- "for mapped_person in not_accessed:\n",
- "    print(mapped_person['kg_id'])\n",
- "n_empty = len(not_accessed)\n",
- "print(n_empty)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# create dataframe for mapped persons\n",
- "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n",
- "df_mapped_persons.index.name = 'index'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# check output\n",
- "df_mapped_persons.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# save\n",
- "fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'\n",
- "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# create small version (first `limit` records), written next to the full CSV\n",
- "limit = 1000\n",
- "fp_out_stem = osp.splitext(osp.basename(fp_out))[0]  # os.path instead of `Path`, which this notebook never imports\n",
- "fp_out_sm = join(osp.dirname(fp_out), f'{fp_out_stem}_0_{limit}.csv')\n",
- "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
- "df_mapped_persons_sm.index.name = 'index'\n",
- "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from difflib import SequenceMatcher\n",
- "def similar(a, b):\n",
- "    \"\"\"Similarity ratio (0.0-1.0) of `a` and `b`; kept for later checks against other identity KG CSVs.\"\"\"\n",
- "    return SequenceMatcher(a=a, b=b).ratio()"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python [conda env:megapixels]",
- "language": "python",
- "name": "conda-env-megapixels-py"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}