From dd2c36288aa1e8af14588f9258f6785879b8638c Mon Sep 17 00:00:00 2001 From: adamhrv Date: Mon, 28 Jan 2019 18:11:36 +0100 Subject: add utils for analyzing identities --- .../notebooks/datasets/imdb_wiki/convert_mat.ipynb | 427 +++++++++++ .../notebooks/datasets/imdb_wiki/identity.ipynb | 498 +++++++++++++ .../datasets/imdb_wiki/imdb_wiki_kg.ipynb | 468 ------------ .../datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb | 573 --------------- .../datasets/knowledge_graph/identity.ipynb | 792 +++++++++++++++++++++ .../notebooks/datasets/lfw/count_images.ipynb | 247 +++++++ megapixels/notebooks/datasets/lfw/lfw_names.ipynb | 2 +- .../notebooks/datasets/msceleb/identity.ipynb | 378 ++++++++++ megapixels/notebooks/datasets/names_kg.ipynb | 243 ------- .../notebooks/datasets/pubfig83/identity.ipynb | 656 +++++++++++++++++ .../notebooks/datasets/umd_faces/identity.ipynb | 675 ++++++++++++++++++ .../vgg_face2/clean_vgg_identity_meta_kg.ipynb | 2 +- .../notebooks/datasets/vgg_face2/identity.ipynb | 439 ++++++++++++ 13 files changed, 4114 insertions(+), 1286 deletions(-) create mode 100644 megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb create mode 100644 megapixels/notebooks/datasets/imdb_wiki/identity.ipynb delete mode 100644 megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb delete mode 100644 megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb create mode 100644 megapixels/notebooks/datasets/knowledge_graph/identity.ipynb create mode 100644 megapixels/notebooks/datasets/lfw/count_images.ipynb create mode 100644 megapixels/notebooks/datasets/msceleb/identity.ipynb delete mode 100644 megapixels/notebooks/datasets/names_kg.ipynb create mode 100644 megapixels/notebooks/datasets/pubfig83/identity.ipynb create mode 100644 megapixels/notebooks/datasets/umd_faces/identity.ipynb create mode 100644 megapixels/notebooks/datasets/vgg_face2/identity.ipynb (limited to 'megapixels/notebooks') diff --git 
a/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb b/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb new file mode 100644 index 00000000..1bf7b590 --- /dev/null +++ b/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IMDB WIKI: Convert .mat to CSVs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "import random\n", + "import math\n", + "from datetime import datetime\n", + "\n", + "import cv2 as cv\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "%reload_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "fp_mat = '/data_store_hdd/datasets/people/imdb_wiki/downloads/imdb.mat'\n", + "dir_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "mat_data = loadmat(fp_mat)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# row 3\n", + "def load_parse_imdb_mat(mat):\n", + " metadata = mat['imdb'][0][0]\n", + " results = []\n", + " num_records = len(metadata[0][0])\n", + " print(f'loaded: {num_records} records')\n", + " for i in tqdm(range(num_records), total=num_records):\n", + " dob_matlab = metadata[0][0][i]\n", + " dob = datetime.fromordinal(dob_matlab)\n", + " dob_str = f'{dob.year}-{dob.month}-{dob.day}'\n", + " year_photo = metadata[1][0][i]\n", 
+ " fp = metadata[2][0][i][0]\n", + " gender_val = metadata[3][0][i]\n", + " if gender_val == 0:\n", + " gender = 'f'\n", + " elif gender_val == 1:\n", + " gender = 'm'\n", + " else:\n", + " gender = None\n", + " name = metadata[4][0][i][0]\n", + " roi = metadata[5][0][i][0]\n", + " face_conf = metadata[6][0][i]\n", + " face_conf_second = metadata[7][0][i]\n", + " celeb_id = metadata[9][0][i]\n", + " result = {\n", + " 'dob': dob_str,\n", + " 'year_photo': year_photo,\n", + " 'filepath': fp,\n", + " 'gender': gender,\n", + " 'name': name,\n", + " 'x1': roi[0],\n", + " 'y1': roi[1],\n", + " 'x2': roi[2],\n", + " 'y2': roi[3],\n", + " 'celeb_id': celeb_id\n", + " }\n", + " results.append(result)\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loaded: 460723 records\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a4a106e3bee4fde89492ceef50b9c05", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=460723), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "results_meta = load_parse_imdb_mat(mat_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df_meta = pd.DataFrame.from_dict(results_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", + "
" + ], + "text/plain": [ + " celeb_id dob filepath gender \\\n", + "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n", + "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n", + "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n", + "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n", + "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n", + "\n", + " name x1 x2 y1 y2 year_photo \n", + "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n", + "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n", + "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n", + "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n", + "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_meta.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "df_meta.index.name = 'index'\n", + "df_meta.to_csv(join(dir_out,'imdb_mat.csv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Count Images per Person" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# count images per person and save to CSV\n", + "df_name_groups = df_meta.groupby('name')\n", + "images_per_person = []\n", + "for name, df_name in df_name_groups:\n", + " images_per_person.append({'name': name, 'num_images': len(df_name)})\n", + "df_images_per_person = pd.DataFrame.from_dict(images_per_person)\n", + "df_images_per_person.index.name = 'index'\n", + "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Find Face Size" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + 
"source": [ + "sizes = [(x['x2'] - x['x1']) for x in results_meta]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "buckets = list(range(0,500,50))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from matplotlib import pyplot as plt \n", + "import numpy as np \n", + "bins = list(range(0,500,20))\n", + "plt.figure(figsize=(12,8))\n", + "plt.hist(sizes, bins=bins)\n", + "plt.title(\"Face Image Sizes\") \n", + "plt.ylabel(\"Images\")\n", + "plt.xlabel(\"Width (px)\")\n", + "plt.yticks(range(0, 60000, 10000))\n", + "plt.title('IMDB-Wiki: Face Pixel Size')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "dob: date of birth (Matlab serial date number)\n", + "photo_taken: year when the photo was taken\n", + "full_path: path to file\n", + "gender: 0 for female and 1 for male, NaN if unknown\n", + "name: name of the celebrity\n", + "face_location: location of the face. To crop the face in Matlab run\n", + "\n", + "img(face_location(2):face_location(4),face_location(1):face_location(3),:))\n", + "\n", + "face_score: detector score (the higher the better). Inf implies that no face was found in the image and the face_location then just returns the entire image\n", + "second_face_score: detector score of the face with the second highest score. This is useful to ignore images with more than one face. 
second_face_score is NaN if no second face was detected.\n", + "celeb_names (IMDB only): list of all celebrity names\n", + "celeb_id (IMDB only): index of celebrity name\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb b/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb new file mode 100644 index 00000000..40d7bd86 --- /dev/null +++ b/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb @@ -0,0 +1,498 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IMDB-WIKI Knowledge Graph\n", + "\n", + "- convert names to Knowledge Graph entity IDs\n", + "- The `imdb.mat` file contains only full names, need KG ids `/m/12345`" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "from pathlib import Path\n", + "import random\n", + "import math\n", + "from datetime import datetime\n", + "import requests\n", + "import json\n", + "import time\n", + "from pprint import pprint\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "import urllib.request\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + 
"%matplotlib inline\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load IMDB Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
index
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9261214.784161.838303.6961968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184622.592100.352245.7601970
\n", + "
" + ], + "text/plain": [ + " celeb_id dob filepath \\\n", + "index \n", + "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg \n", + "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg \n", + "\n", + " gender name x1 x2 y1 y2 year_photo \n", + "index \n", + "0 m Fred Astaire 1072.926 1214.784 161.838 303.696 1968 \n", + "1 m Fred Astaire 477.184 622.592 100.352 245.760 1970 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp_meta_imdb = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_mat.csv'\n", + "df_meta_imdb = pd.read_csv(fp_meta_imdb).set_index('index')\n", + "df_meta_imdb.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Google Knowledge Graph API" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# read API key\n", + "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", + "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "def _get_kg_meta(result_obj, params):\n", + " global api_key, url_kg_api\n", + " \n", + " params['indent'] = True\n", + " params['key'] = api_key\n", + " params['limit'] = 1\n", + " \n", + " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", + " try:\n", + " json_response = urllib.request.urlopen(url).read()\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " else:\n", + " try:\n", + " response = json.loads(json_response)\n", + " items = response.get('itemListElement', [])\n", + " result_obj['accessed'] = True\n", + " if items:\n", + " item = items[0]\n", + " item_result = item.get('result', [])\n", + " result_obj['description'] = item_result.get('description', '')\n", + " det_desc = item_result.get('detailedDescription', '')\n", + " if not 
result_obj['kg_id']:\n", + " result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n", + " if det_desc:\n", + " result_obj['description_extended'] = det_desc.get('articleBody','')\n", + " result_obj['description_license'] = det_desc.get('license','')\n", + " result_obj['description_url'] = det_desc.get('url','')\n", + " else:\n", + " result_obj['description_extended'] = ''\n", + " result_obj['description_license'] = ''\n", + " result_obj['description_url'] = ''\n", + " result_img = item_result.get('image', '')\n", + " if result_img:\n", + " result_obj['image_url'] = result_img.get('contentUrl', '')\n", + " result_obj['name'] = item_result.get('name', '')\n", + " result_obj['score'] = item.get('resultScore', 0.0)\n", + " result_obj['url'] = item_result.get('url', '')\n", + " except Exception as e:\n", + " result_obj['error'] = str(e)\n", + " return result_obj\n", + " \n", + "def get_kg_from_name(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'query': obj['query']}\n", + " return _get_kg_meta(obj, params)\n", + " \n", + "def get_kg_from_kg_id(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'ids': obj['kg_id']}\n", + " return _get_kg_meta(obj, params)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'accessed': True,\n", + " 'description': 'American singer',\n", + " 'description_extended': 'Taylor Alison Swift is an American '\n", + " \"singer-songwriter. 
As one of the world's leading \"\n", + " 'contemporary recording artists, she is known for '\n", + " 'narrative songs about her personal life, which has '\n", + " 'received widespread media coverage.\\n',\n", + " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n", + " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n", + " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n", + " 'kg_id': '/m/0dl567',\n", + " 'name': 'Taylor Swift',\n", + " 'query': 'Taylor Swift',\n", + " 'score': 1241.476318,\n", + " 'url': 'http://taylorswift.com/'}\n" + ] + } + ], + "source": [ + "# make a test query to check if API works\n", + "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", + "result = get_kg_from_name(obj)\n", + "pprint(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "obj = {'query': 'Taylor Swift', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", + "result = get_kg_from_id(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# build mapped_person objects\n", + "mapped_persons = []\n", + "count = 0\n", + "df_person_groups = df_meta_imdb.groupby('name')\n", + "for group_name, df_name_group in df_person_groups:\n", + " obj = {'query': group_name, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n", + " mapped_persons.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "# define thread mapping function\n", + "def pool_map_persons(obj):\n", + " global pbar\n", + " pbar.update(1)\n", + " kg_obj = get_kg_from_name(obj)\n", + " return kg_obj" + ] + }, + { + "cell_type": "code", + 
"execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "87f6a2be42284199b8a67458f4090497", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=20284), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0/20284 remaining\n" + ] + } + ], + "source": [ + "num_threads = 2\n", + "pbar = tqdm(total=len(mapped_persons))\n", + "\n", + "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + "print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n", + "\n", + "# convert to thread pool\n", + "while num_non_accessed > 0:\n", + " print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n", + " pool = ThreadPool(num_threads)\n", + "\n", + " # start threading\n", + " with tqdm(total=len(mapped_persons)) as pbar:\n", + " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", + "\n", + " # close tqdm\n", + " pbar.close()\n", + "\n", + " num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + " if num_non_accessed > 0:\n", + " print(f'{num_non_accessed}/{len(mapped_persons)} remaining. Sleeping...')\n", + " time.sleep(60*20) # wait X minutes" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'query': \"'Lee' George Quinones\", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. 
He is one of several artists to gain fame from the New York City Subway graffiti movement.\\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee Quiñones'}\n" + ] + } + ], + "source": [ + "# test output for a person\n", + "print(mapped_persons[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# reduce CC attribution string. the default strinf from Google Knowledge Graph is too verbose\n", + "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n", + "cc_short = 'CC BY-SA 3.0'\n", + "nchanged = 0\n", + "for mapped_person in mapped_persons:\n", + " license = mapped_person.get('description_license', None)\n", + " if license == cc_long:\n", + " nchanged += 1\n", + " mapped_person['description_license'] = cc_short\n", + "print(nchanged)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# find number not accessed\n", + "n_empty = 0\n", + "for mapped_person in mapped_persons:\n", + " if not mapped_person.get('accessed', False):\n", + " n_empty += 1\n", + " print(mapped_person['kg_id'])\n", + "print(n_empty)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "# create dataframe for mapped persons\n", + "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", + "df_mapped_persons.index.name = 'index'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check output\n", + "df_mapped_persons.head()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'\n", + "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "# create small version\n", + "limit = 1000\n", + "fpp_out = Path(fp_out)\n", + "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", + "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", + "df_mapped_persons_sm.index.name = 'index'\n", + "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb b/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb deleted file mode 100644 index b9a77fda..00000000 --- a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb +++ /dev/null @@ -1,468 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# IMDB-WIKI Knowledge Graph" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import os.path as osp\n", - "from os.path import join\n", - "from glob import glob\n", - "import random\n", - "import math\n", - "from datetime import datetime\n", - "import requests\n", - "import json\n", - "import urllib\n", - "\n", - "import cv2 as cv\n", - "import pandas as pd\n", - "from scipy.io 
import loadmat\n", - "import numpy as np\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from tqdm import tqdm_notebook as tqdm\n", - "%reload_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "fp_meta = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_wiki.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.read_csv(fp_meta).set_index('index')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
index
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", - "
" - ], - "text/plain": [ - " celeb_id dob filepath \\\n", - "index \n", - "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg \n", - "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg \n", - "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg \n", - "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg \n", - "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg \n", - "\n", - " gender name x1 x2 y1 y2 \\\n", - "index \n", - "0 m Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 \n", - "1 m Fred Astaire 477.184000 622.592000 100.352000 245.760000 \n", - "2 m Fred Astaire 114.969643 451.686572 114.969643 451.686572 \n", - "3 m Fred Astaire 622.885506 844.339008 424.217504 645.671006 \n", - "4 m Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 \n", - "\n", - " year_photo \n", - "index \n", - "0 1968 \n", - "1 1970 \n", - "2 1968 \n", - "3 1968 \n", - "4 1968 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_meta.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ids" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", - "\n", - "def get_knowledge(q, api_key):\n", - " service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n", - " params = {\n", - " 'query': q,\n", - " 'limit': 5,\n", - " 'indent': True,\n", - " 'key': api_key,\n", - " }\n", - " url = service_url + '?' 
+ urllib.parse.urlencode(params) # TODO: use requests\n", - " response = json.loads(urllib.request.urlopen(url).read())\n", - " response = response.get('itemListElement', [])\n", - " if len(response) > 0:\n", - " result = response[0].get('result', [])\n", - " result['score'] = response[0]['resultScore']\n", - " return result\n", - " else:\n", - " return []" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "General Secretary of the Communist Party of China\n", - "Xi Jinping\n" - ] - }, - { - "ename": "KeyError", - "evalue": "'url'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'description'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'url'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'score'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'url'" - ] - } - ], - "source": [ - "# test\n", - "q = 'Xi Jinping'\n", - "r = get_knowledge(q, api_key)\n", - "print(r['description'])\n", - "print(r['name'])\n", - 
"print(r['url'])\n", - "print(r['score'])" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kg:/m/06ff60\n" - ] - } - ], - "source": [ - "print(r['@id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'@id': 'kg:/g/11f4ksbzcm',\n", - " '@type': ['Thing', 'Event'],\n", - " 'detailedDescription': {'articleBody': 'On February 14, 2018, a gunman opened '\n", - " 'fire at Marjory Stoneman Douglas High '\n", - " 'School in Parkland, Florida, killing '\n", - " 'seventeen students and staff members '\n", - " 'and injuring seventeen others. ',\n", - " 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n", - " 'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n", - " 'image': {'contentUrl': 'http://t1.gstatic.com/images?q=tbn:ANd9GcQmY7VqmGt4zEJU8Rc4EwPWroYd-L0QQ5wkZfiFO-WRqNBC-FPN',\n", - " 'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n", - " 'name': 'Stoneman Douglas High School shooting',\n", - " 'score': 60.411652}\n" - ] - } - ], - "source": [ - "pprint(r)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "kgs_msceleb = os.listdir(dir_msceleb)\n", - "kgs_msceleb = ['/' + x.replace('.','/') for x in kgs_msceleb]" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/plain": [ - "True" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'/m/06ff60' in kgs_msceleb" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [], - "source": [ - "def get_kg_by_id(kg_id, api_key):\n", - " service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n", - " params = {\n", - " 'ids': kg_id,\n", - " 'limit': 1,\n", - " 'indent': True,\n", - " 'key': api_key,\n", - " }\n", - " url = service_url + '?' + urllib.parse.urlencode(params) # TODO: use requests\n", - " try:\n", - " response = json.loads(urllib.request.urlopen(url).read())\n", - " response = response.get('itemListElement', [])\n", - " result = response[0].get('result', [])\n", - " result['score'] = response[0]['resultScore']\n", - " return result\n", - " except Exception as e:\n", - " return []" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [], - "source": [ - "a = get_kg_by_id('/m/0100n5bs', api_key)" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:megapixels]", - "language": "python", - "name": "conda-env-megapixels-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb 
b/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb deleted file mode 100644 index 648fb9ac..00000000 --- a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb +++ /dev/null @@ -1,573 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 06: Face pose dlib/MTCNN" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import os.path as osp\n", - "from os.path import join\n", - "from glob import glob\n", - "import random\n", - "import math\n", - "from datetime import datetime\n", - "\n", - "import cv2 as cv\n", - "import pandas as pd\n", - "from scipy.io import loadmat\n", - "import numpy as np\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from tqdm import tqdm_notebook as tqdm\n", - "%reload_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "fp_mat = '/data_store_hdd/datasets/people/imdb_wiki/downloads/imdb.mat'\n", - "dir_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "mat_data = loadmat(fp_mat)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# row 3\n", - "def load_parse_imdb_mat(mat):\n", - " metadata = mat['imdb'][0][0]\n", - " results = []\n", - " num_records = len(metadata[0][0])\n", - " print(f'loaded: {num_records} records')\n", - " for i in tqdm(range(num_records), total=num_records):\n", - " dob_matlab = metadata[0][0][i]\n", - " dob = datetime.fromordinal(dob_matlab)\n", - " dob_str = f'{dob.year}-{dob.month}-{dob.day}'\n", - " year_photo = metadata[1][0][i]\n", - " fp = metadata[2][0][i][0]\n", - " 
gender_val = metadata[3][0][i]\n", - " if gender_val == 0:\n", - " gender = 'f'\n", - " elif gender_val == 1:\n", - " gender = 'm'\n", - " else:\n", - " gender = None\n", - " name = metadata[4][0][i][0]\n", - " roi = metadata[5][0][i][0]\n", - " face_conf = metadata[6][0][i]\n", - " face_conf_second = metadata[7][0][i]\n", - " celeb_id = metadata[9][0][i]\n", - " result = {\n", - " 'dob': dob_str,\n", - " 'year_photo': year_photo,\n", - " 'filepath': fp,\n", - " 'gender': gender,\n", - " 'name': name,\n", - " 'x1': roi[0],\n", - " 'y1': roi[1],\n", - " 'x2': roi[2],\n", - " 'y2': roi[3],\n", - " 'celeb_id': celeb_id\n", - " }\n", - " results.append(result)\n", - " return results" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "loaded: 460723 records\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d50c6e22d1694b54815a86d85cda6241", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, max=460723), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "results_meta = load_parse_imdb_mat(mat_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df_meta = pd.DataFrame.from_dict(results_meta)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", - "
" - ], - "text/plain": [ - " celeb_id dob filepath gender \\\n", - "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n", - "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n", - "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n", - "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n", - "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n", - "\n", - " name x1 x2 y1 y2 year_photo \n", - "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n", - "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n", - "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n", - "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n", - "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_meta.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create DataFrame for metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "df_results = pd.DataFrame.from_dict(results_meta)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
celeb_iddobfilepathgendernamex1x2y1y2year_photo
064881900-5-1101/nm0000001_rm124825600_1899-5-10_1968.jpgmFred Astaire1072.9260001214.784000161.838000303.6960001968
164881900-5-1101/nm0000001_rm3343756032_1899-5-10_1970.jpgmFred Astaire477.184000622.592000100.352000245.7600001970
264881900-5-1101/nm0000001_rm577153792_1899-5-10_1968.jpgmFred Astaire114.969643451.686572114.969643451.6865721968
364881900-5-1101/nm0000001_rm946909184_1899-5-10_1968.jpgmFred Astaire622.885506844.339008424.217504645.6710061968
464881900-5-1101/nm0000001_rm980463616_1899-5-10_1968.jpgmFred Astaire1013.8590021201.586128233.882042421.6091681968
\n", - "
" - ], - "text/plain": [ - " celeb_id dob filepath gender \\\n", - "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n", - "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n", - "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n", - "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n", - "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n", - "\n", - " name x1 x2 y1 y2 year_photo \n", - "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n", - "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n", - "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n", - "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n", - "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_results.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "df_results.to_csv(join(dir_out,'imdb_wiki.csv'), index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Count Images per Person" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "df_name_groups = df_results.groupby('name')\n", - "images_per_person = []\n", - "for name, df_name in df_name_groups:\n", - " images_per_person.append({'name': name, 'num_images': len(df_name)})\n", - "df_images_per_person = pd.DataFrame.from_dict(images_per_person)\n", - "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'), index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Find Face Size" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "sizes = [(x['x2'] - x['x1']) for x in results_meta]" - ] - }, - { - "cell_type": 
"code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "buckets = list(range(0,500,50))" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from matplotlib import pyplot as plt \n", - "import numpy as np \n", - "bins = list(range(0,500,20))\n", - "plt.figure(figsize=(12,8))\n", - "plt.hist(sizes, bins=bins)\n", - "plt.title(\"Face Image Sizes\") \n", - "plt.ylabel(\"Images\")\n", - "plt.xlabel(\"Width (px)\")\n", - "plt.yticks(range(0, 60000, 10000))\n", - "plt.title('IMDB-Wiki: Face Pixel Size')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "dob: date of birth (Matlab serial date number)\n", - "photo_taken: year when the photo was taken\n", - "full_path: path to file\n", - "gender: 0 for female and 1 for male, NaN if unknown\n", - "name: name of the celebrity\n", - "face_location: location of the face. To crop the face in Matlab run\n", - "\n", - "img(face_location(2):face_location(4),face_location(1):face_location(3),:))\n", - "\n", - "face_score: detector score (the higher the better). Inf implies that no face was found in the image and the face_location then just returns the entire image\n", - "second_face_score: detector score of the face with the second highest score. This is useful to ignore images with more than one face. 
second_face_score is NaN if no second face was detected.\n", - "celeb_names (IMDB only): list of all celebrity names\n", - "celeb_id (IMDB only): index of celebrity name\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:megapixels]", - "language": "python", - "name": "conda-env-megapixels-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb b/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb new file mode 100644 index 00000000..81a74faf --- /dev/null +++ b/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb @@ -0,0 +1,792 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Knowledge Graph Identities\n", + "\n", + "- convert filename-names to names\n", + "- fetch Google Knowledge Graph entity IDs for each name\n", + "- save KG IDs to CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "from pathlib import Path\n", + "import random\n", + "import math\n", + "from datetime import datetime\n", + "import requests\n", + "import json\n", + "import time\n", + "from pprint import pprint\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "import urllib.request\n", + "import difflib\n", + "import unidecode\n", + "import slugify\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + 
"import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels')\n", + "from app.utils import api_utils\n", + "from app.settings import types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get List of Names" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def get_names(enum_dataset):\n", + " if enum_dataset == types.Dataset.LFW:\n", + " dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'\n", + " names = [x.replace('_', ' ') for x in os.listdir(dir_lfw)]\n", + " elif enum_dataset == types.Dataset.YOUTUBE_FACES:\n", + " names = [x for x in names if 'labeled faces.txt' not in x]\n", + " return names" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']\n" + ] + } + ], + "source": [ + "names = get_names(types.Dataset.LFW)\n", + "print(names[0:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Google Knowledge Graph API" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# read API key\n", + "api_key = open('/work/megapixels_dev/env/google_knowledge_graph_api.env').read()\n", + "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n", + "wp_api = api_utils.WikipediaAPI()" + ] + }, + { + "cell_type": "code", + "execution_count": 241, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "wp\n", + "{'wp_accessed': True, 'wp_description': '', 'wp_name': '', 'wp_page_id': ''}\n", + 
"kg\n", + "{'kg_accessed': True,\n", + " 'kg_bio': '',\n", + " 'kg_bio_url': '',\n", + " 'kg_description': '',\n", + " 'kg_id': '',\n", + " 'kg_image_url': '',\n", + " 'kg_name': '',\n", + " 'kg_score': 0,\n", + " 'kg_url': '',\n", + " 'query': 'Jeff Dederian'}\n" + ] + } + ], + "source": [ + "#wp_api.test_access()\n", + "print('wp')\n", + "pprint(wp_api.get_meta({'query': 'Florecita Cobian'}))\n", + "print('kg')\n", + "pprint(kg_api.get_kg_from_name({'query':'Jeff Dederian'}))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test Name Similarity Matching" + ] + }, + { + "cell_type": "code", + "execution_count": 242, + "metadata": {}, + "outputs": [], + "source": [ + "def same_person(query, name, sim_min=.9, word_match_min=0.75, verbose=False):\n", + " if name == '':\n", + " return False\n", + " # check and remove if WP added parenthesis\n", + " if '(' in name and ')' in name:\n", + " name = name.split('(')[0]\n", + " \n", + " # then strip spaces and split into list\n", + " query_strings = [unidecode.unidecode(x.strip().lower()) for x in query.strip().split(' ')] # query\n", + " result_strings = [unidecode.unidecode(x.strip().lower()) for x in name.strip().split(' ')] # result\n", + " min_str_len = min(len(result_strings), len(query_strings))\n", + " # match each word in the query\n", + " matched_strings = []\n", + " \n", + " for i in range(len(query_strings)):\n", + " # for each word in the shorter text string\n", + " result_strings_tmp = result_strings.copy()\n", + " for j in range(len(result_strings_tmp)):\n", + " a = query_strings[i]\n", + " b = result_strings_tmp[j]\n", + " # make a the shorter string\n", + " lengths = [len(a), len(b)]\n", + " min_ratio = (min(lengths) / max(lengths) * .75)\n", + " ratio = difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()\n", + " result = (ratio >= min_ratio)\n", + " if verbose:\n", + " print(f'comapre \"{a}\" to \"{b}\" ratio was: {ratio:.2f} min: {min_ratio:.2}, passed: {result}')\n", + 
" if result:\n", + " # remove this item from result strings\n", + " matched_string = result_strings.pop(j)\n", + " matched_strings.append(matched_string)\n", + " break # exit loop and use shortened result string haystack\n", + "\n", + " matched = len(matched_strings) >= min_str_len\n", + " if verbose:\n", + " print(f'{matched} because {len(matched_strings)} >= {min_str_len}')\n", + " return matched" + ] + }, + { + "cell_type": "code", + "execution_count": 245, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(Adoor Gopalakrishnan == Adoors Gopalakarishnan ok) = True\n", + "\n", + "comapre \"dave\" to \"david\" ratio was: 0.67 min: 0.6, passed: True\n", + "comapre \"letterman\" to \"letterman\" ratio was: 1.00 min: 0.75, passed: True\n", + "True because 2 >= 2\n", + "(David Letterman == Dave Letterman) = True\n", + "\n", + "comapre \"charles\" to \"charles\" ratio was: 1.00 min: 0.75, passed: True\n", + "comapre \"dickens\" to \"booker\" ratio was: 0.31 min: 0.64, passed: False\n", + "False because 1 >= 2\n", + "(Charles Booker == Charles Dickens) = False\n", + "\n", + "comapre \"donald\" to \"don\" ratio was: 0.67 min: 0.38, passed: True\n", + "comapre \"trump\" to \"j.\" ratio was: 0.00 min: 0.3, passed: False\n", + "comapre \"trump\" to \"trump\" ratio was: 1.00 min: 0.75, passed: True\n", + "True because 2 >= 2\n", + "(Don J. 
Trump == Donald Trump) = True\n", + "\n", + "comapre \"wang\" to \"wang\" ratio was: 1.00 min: 0.75, passed: True\n", + "comapre \"fei\" to \"fei\" ratio was: 1.00 min: 0.75, passed: True\n", + "True because 2 >= 2\n", + "(Wang Fei (female footballer) == Wang Fei) = True\n" + ] + } + ], + "source": [ + "test_sim_match = True\n", + "if test_sim_match:\n", + " # Test name similarity search\n", + " query = 'Adoors Gopalakarishnan ok'\n", + " wp_name = 'Adoor Gopalakrishnan'\n", + " matched = same_person(query, wp_name)\n", + " print(f'({wp_name} == {query}) = {matched}')\n", + " print('')\n", + "\n", + " query = 'Dave Letterman'\n", + " wp_name = 'David Letterman'\n", + " matched = same_person(query, wp_name, verbose=True)\n", + " print(f'({wp_name} == {query}) = {matched}')\n", + " print('')\n", + "\n", + " query = 'Charles Dickens'\n", + " wp_name = 'Charles Booker'\n", + " matched = same_person(query, wp_name, verbose=True)\n", + " print(f'({wp_name} == {query}) = {matched}')\n", + " print('')\n", + "\n", + " query = 'Donald Trump'\n", + " wp_name = 'Don J. 
Trump'\n", + " matched = same_person(query, wp_name, verbose=True)\n", + " print(f'({wp_name} == {query}) = {matched}')\n", + " print('')\n", + " \n", + " query = 'Wang Fei'\n", + " kg_name = 'Faye Wong'\n", + " wp_name = 'Wang Fei (female footballer)'\n", + " matched = same_person(query, wp_name, verbose=True)\n", + " print(f'({wp_name} == {query}) = {matched}')" + ] + }, + { + "cell_type": "code", + "execution_count": 246, + "metadata": {}, + "outputs": [], + "source": [ + "# define thread mapping function\n", + "def pool_map_persons(obj):\n", + " global pbar\n", + " pbar.update(1)\n", + " kg_obj = kg_api.get_kg_from_name(obj)\n", + " wp_obj = wp_api.get_meta(obj)\n", + " person_obj = {**kg_obj, **wp_obj}\n", + " return person_obj\n", + "\n", + "def num_non_accessed(mps):\n", + " return sum(0 if (x.get('kg_accessed', False) and x.get('wp_accessed', False)) else 1 for x in mps)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load existing CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# load existing CSV\n", + "fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", + "df = pd.read_csv(fp_csv, encoding = 'utf-16').set_index('index')\n", + "# fill nulls\n", + "df.fillna('', inplace = True)\n", + "mapped_persons = df.to_dict('records')\n", + "# add columns\n", + "for mp in mapped_persons:\n", + " mp['wp_error'] = ''\n", + " mp['kg_error'] = ''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Knowledge Graph Data" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5507f5c19de746df94aa5445e3c7cf46", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + 
{ + "name": "stdout", + "output_type": "stream", + "text": [ + "832/5749 remaining\n", + "832/5749 remaining. Using 5 threads\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "411d08f873174d13a1de1f8b21f9f993", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done. 0 remaining.\n" + ] + } + ], + "source": [ + "num_threads_max = 5\n", + "sleep_min = 1\n", + "pbar = tqdm(total=len(mapped_persons))\n", + "\n", + "nna = num_non_accessed(mapped_persons)\n", + "print(f'{nna}/{len(mapped_persons)} remaining')\n", + "\n", + "# convert to thread pool\n", + "while nna > 0:\n", + " num_threads = max(1, min(num_threads_max, nna))\n", + " print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')\n", + " pool = ThreadPool(num_threads)\n", + "\n", + " # start threading\n", + " with tqdm(total=len(mapped_persons)) as pbar:\n", + " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", + "\n", + " # close tqdm\n", + " pbar.close()\n", + "\n", + " nna = num_non_accessed(mapped_persons)\n", + " if nna > 0:\n", + " print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')\n", + " time.sleep(60 * sleep_min)\n", + "\n", + "print(f'Done. 
{nna} remaining.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Wikipedia API data" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "for i, mp in enumerate(mapped_persons):\n", + " kg_name = mp.get('kg_name')\n", + " wp_name = mp.get('wp_name')\n", + " query = mp.get('query')\n", + " name_orig = mp.get('source_name')\n", + " kg_score = int(mp.get('kg_score',0))\n", + "\n", + " kg_matches = same_person(name_orig, kg_name)\n", + " wp_matches = same_person(name_orig, wp_name)\n", + "\n", + " if kg_matches and wp_matches and kg_score > 100:\n", + " # very likely a match, confirm it\n", + " match_status = 2 # supermatch\n", + " # default to using wp because descriptions are more appropriate/udpated\n", + " source = 'wp'\n", + " elif kg_matches and wp_matches:\n", + " match_status = 1\n", + " # default to using wp because descriptions are more appropriate/udpated\n", + " source = 'wp'\n", + " elif kg_matches and not wp_matches:\n", + " # if the KG score is medium-high, but wp failed, needs review\n", + " source = 'kg'\n", + " match_status = 0\n", + " elif wp_matches and not kg_matches:\n", + " # if wikipedia text matched the query, then confirm\n", + " source = 'wp'\n", + " match_status = 0\n", + " else:\n", + " # no information available\n", + " match_status = -1\n", + " source = None\n", + " \n", + " slug = slugify.slugify(name_orig, separator='_')\n", + " mp_bio = mp.get('kg_bio', '')\n", + " wp_desc = mp.get('wp_description', '')\n", + " source_url = f\"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html\"\n", + " \n", + " if source == 'kg':\n", + " # google knowledge graph\n", + " mp_name = mp['kg_name']\n", + " mp_description = mp.get('kg_description', '')\n", + " elif source == 'wp':\n", + " # wikipedia\n", + " mp_name = mp['wp_name']\n", + " mp_description = mp.get('wp_description', '')\n", + " \n", + " if 
'disambiguation' in wp_desc.lower():\n", + " #print(f\"disambiguate: {name_orig}\")\n", + " match_status = 0 # needs review if \"disambiguation appears\"\n", + " mp_name = ''\n", + " mp_description = ''\n", + " mp_bio = ''\n", + " \n", + " mp['source_url'] = source_url\n", + " mp['mp_slug'] = slug\n", + " mp['matched'] = match_status\n", + " mp['mp_bio'] = mp_bio\n", + " mp['mp_name'] = mp_name\n", + " mp['mp_description'] = mp_description" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "match: 4359\n", + "review: 718\n", + "fail: 672\n", + "no kg accessed: 0\n", + "no wp accessed: 0\n" + ] + } + ], + "source": [ + "print(f\"match: {sum(1 if (x.get('matched') > 0) else 0 for x in mapped_persons)}\")\n", + "print(f\"review: {sum(1 if (x.get('matched') == 0) else 0 for x in mapped_persons)}\")\n", + "print(f\"fail: {sum(1 if (x.get('matched') == -1) else 0 for x in mapped_persons)}\")\n", + "\n", + "print(f\"no kg accessed: {sum(0 if (x.get('kg_accessed', False)) else 1 for x in mapped_persons)}\")\n", + "print(f\"no wp accessed: {sum(0 if (x.get('wp_accessed', False)) else 1 for x in mapped_persons)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save data to CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "metadata": {}, + "outputs": [], + "source": [ + "# create dataframe for mapped persons\n", + "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", + "df_mapped_persons.index.name = 'index'" + ] + }, + { + "cell_type": "code", + "execution_count": 236, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "fp_out = f'/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n", + "df_mapped_persons.drop(['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error'], axis=1, inplace=True)\n", + "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')\n", + "# create small version\n", 
+ "limit = 1000\n", + "fpp_out = Path(fp_out)\n", + "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", + "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", + "df_mapped_persons_sm.index.name = 'index'\n", + "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
kg_biokg_bio_urlkg_descriptionkg_idkg_image_urlkg_namekg_scorekg_urlmatchedmp_biomp_descriptionmp_namemp_slugquerysourcesource_namesource_urlwp_descriptionwp_namewp_page_id
index
0Kim Antonie Lode Clijsters is a Belgian former...https://en.wikipedia.org/wiki/Kim_ClijstersBelgian tennis player/m/01m_ghhttp://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...Kim Clijsters618.2727052Kim Antonie Lode Clijsters is a Belgian former...Belgian tennis playerKim Clijsterskim_clijstersKim ClijsterslfwKim_Clijstershttp://vis-www.cs.umass.edu/lfw/person/Kim_Cli...Belgian tennis playerKim Clijsters262793
1William Rosenberg was an American entrepreneur...https://en.wikipedia.org/wiki/William_RosenbergAmerican entrepreneur/m/07dy4zWilliam Rosenberg367.8797302William Rosenberg was an American entrepreneur...American businessmanWilliam Rosenbergwilliam_rosenbergWilliam RosenberglfwWilliam_Rosenberghttp://vis-www.cs.umass.edu/lfw/person/William...American businessmanWilliam Rosenberg2.44981e+06
\n", + "
" + ], + "text/plain": [ + " kg_bio \\\n", + "index \n", + "0 Kim Antonie Lode Clijsters is a Belgian former... \n", + "1 William Rosenberg was an American entrepreneur... \n", + "\n", + " kg_bio_url kg_description \\\n", + "index \n", + "0 https://en.wikipedia.org/wiki/Kim_Clijsters Belgian tennis player \n", + "1 https://en.wikipedia.org/wiki/William_Rosenberg American entrepreneur \n", + "\n", + " kg_id kg_image_url \\\n", + "index \n", + "0 /m/01m_gh http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK... \n", + "1 /m/07dy4z \n", + "\n", + " kg_name kg_score kg_url matched \\\n", + "index \n", + "0 Kim Clijsters 618.272705 2 \n", + "1 William Rosenberg 367.879730 2 \n", + "\n", + " mp_bio \\\n", + "index \n", + "0 Kim Antonie Lode Clijsters is a Belgian former... \n", + "1 William Rosenberg was an American entrepreneur... \n", + "\n", + " mp_description mp_name mp_slug \\\n", + "index \n", + "0 Belgian tennis player Kim Clijsters kim_clijsters \n", + "1 American businessman William Rosenberg william_rosenberg \n", + "\n", + " query source source_name \\\n", + "index \n", + "0 Kim Clijsters lfw Kim_Clijsters \n", + "1 William Rosenberg lfw William_Rosenberg \n", + "\n", + " source_url \\\n", + "index \n", + "0 http://vis-www.cs.umass.edu/lfw/person/Kim_Cli... \n", + "1 http://vis-www.cs.umass.edu/lfw/person/William... 
\n", + "\n", + " wp_description wp_name wp_page_id \n", + "index \n", + "0 Belgian tennis player Kim Clijsters 262793 \n", + "1 American businessman William Rosenberg 2.44981e+06 " + ] + }, + "execution_count": 237, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_mapped_persons.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean data" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "for mp in mapped_persons:\n", + " mp['source_name'] = mp['source_name'].replace(' ', '_')\n", + "# mp['kg_description'] = mp['kg_description'].strip()\n", + "# mp['kg_name'] = mp['kg_name'].strip()\n", + "# mp['kg_bio_url'] = mp['kg_bio_url'].strip()\n", + "# mp['kg_bio'] = mp['kg_bio'].strip()\n", + "# mp['kg_url'] = mp['kg_url'].strip()\n", + " \n", + "# mp['wp_description'] = mp['wp_description'].strip()\n", + "# mp['wp_name'] = mp['wp_name'].strip()\n", + " \n", + "# mp['mp_name'] = ''\n", + "# mp['mp_bio'] = ''\n", + "# mp['mp_description'] = ''\n", + "# mp['mp_slug'] = ''\n", + " \n", + " #mp.setdefault('kg_description','')\n", + "# if mp.get('kg_score', 0) == 0:\n", + "# mp['kg_image_url'] = ''\n", + "# mp['kg_bio_url'] = ''\n", + "# mp['kg_id'] = ''\n", + "# mp['kg_url'] = ''\n", + "# mp['kg_description'] = ''\n", + "# mp['kg_bio_url'] = ''\n", + "# mp['kg_name'] = ''\n", + "# if mp['kg_url'] == [] or mp['kg_url'] == '[]':\n", + "# mp['kg_url'] = ''\n", + "\n", + " try:\n", + " _ = mp.pop('wp_bio')\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/lfw/count_images.ipynb b/megapixels/notebooks/datasets/lfw/count_images.ipynb new file mode 100644 index 00000000..26682f8b --- /dev/null +++ b/megapixels/notebooks/datasets/lfw/count_images.ipynb @@ -0,0 +1,247 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Count Images for LFW\n", + "\n", + "- use sub-directory as `identity_key`" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "from glob import glob\n", + "from pathlib import Path\n", + "from pprint import pprint\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "import numpy as np\n", + "from slugify import slugify\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels')\n", + "from app.utils import file_utils\n", + "from app.settings import types, app_cfg\n", + "from app.models.data_store import DataStore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get image counts" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "opt_dataset = types.Dataset.LFW\n", + "opt_data_store = types.DataStore.HDD\n", + "data_store = DataStore(opt_data_store, opt_dataset)\n", + "# get filepath out\n", + "fp_records = data_store.metadata(types.Metadata.FILE_RECORD)\n", + "fp_img_counts = data_store.metadata(types.Metadata.IMAGE_COUNT)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [] + }, + 
{ + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "df_records = pd.read_csv(fp_records).set_index('index')\n", + "records = df_records.to_dict('records')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# paths\n", + "fp_dirs = '/data_store_hdd/datasets/people/lfw/media/original/'\n", + "\n", + "fp_out = '/data_store_hdd/datasets/people/lfw/metadata/image_counts.csv'\n", + "\n", + "# glob\n", + "dirs = glob(join(fp_dirs,'*'))\n", + "\n", + "# count images\n", + "image_counts = []\n", + "\n", + "for d in tqdm(dirs):\n", + " # get number of images\n", + " files = file_utils.glob_multi(d, ['jpg', 'png'], recursive=False)\n", + " count = len(files)\n", + " name = Path(d).stem\n", + " image_counts.append({'identity_key': name, 'count': count})" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "df_counts = pd.DataFrame.from_dict(image_counts)\n", + "df_counts.index.name = 'index'\n", + "df_counts.to_csv(fp_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countidentity_key
index
014Kim_Clijsters
11William_Rosenberg
22John_Brady
33Juan_Ignacio_Chela
41Floyd_Keith
\n", + "
" + ], + "text/plain": [ + " count identity_key\n", + "index \n", + "0 14 Kim_Clijsters\n", + "1 1 William_Rosenberg\n", + "2 2 John_Brady\n", + "3 3 Juan_Ignacio_Chela\n", + "4 1 Floyd_Keith" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_counts.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/lfw/lfw_names.ipynb b/megapixels/notebooks/datasets/lfw/lfw_names.ipynb index 37a1bd8f..8c474dd7 100644 --- a/megapixels/notebooks/datasets/lfw/lfw_names.ipynb +++ b/megapixels/notebooks/datasets/lfw/lfw_names.ipynb @@ -218,7 +218,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.6.6" } }, "nbformat": 4, diff --git a/megapixels/notebooks/datasets/msceleb/identity.ipynb b/megapixels/notebooks/datasets/msceleb/identity.ipynb new file mode 100644 index 00000000..d330badb --- /dev/null +++ b/megapixels/notebooks/datasets/msceleb/identity.ipynb @@ -0,0 +1,378 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Knowledge Graph MS Celeb" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "import random\n", + "import math\n", + "import time\n", + "from datetime import datetime\n", + 
"\n", + "import requests\n", + "\n", + "import json\n", + "import urllib\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "from urllib.request import urlopen\n", + "import urllib.request\n", + "\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "%reload_ext autoreload\n", + "%autoreload 2\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels/')\n", + "from app.utils import file_utils, im_utils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n", + "kg_ids_msceleb = [x.replace('m.', '/m/') for x in os.listdir(dir_msceleb)]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", + "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "def get_kg_meta(obj, url):\n", + " \n", + "def get_kg_from_name(obj):\n", + " \n", + "def get_kg_from_kg_id(obj):\n", + " # TODO detect 503 service unavailable\n", + " if obj['accessed']:\n", + " return obj\n", + " global api_key, url_kg_api\n", + " kg_id = obj['kg_id']\n", + " params = {\n", + " 'query': q,\n", + " 'limit': 5,\n", + " 'indent': True,\n", + " 'key': api_key,\n", + " }\n", + " \n", + " params = {\n", + " 'ids': kg_id,\n", + " 'limit': 1,\n", + " 'indent': True,\n", + " 'key': api_key,\n", + " }\n", + " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", + " result = {'kg_id': kg_id, 'score': 0.0, 'description': 
'', 'url':'', 'accessed': False} # default\n", + " try:\n", + " json_response = urllib.request.urlopen(url).read()\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " else:\n", + " try:\n", + " response = json.loads(json_response)\n", + " items = response.get('itemListElement', [])\n", + " result['accessed'] = True\n", + " if items:\n", + " item = items[0]\n", + " item_result = item.get('result', [])\n", + " result['description'] = item_result.get('description', '')\n", + " det_desc = item_result.get('detailedDescription', '')\n", + " if det_desc:\n", + " result['description_extended'] = det_desc.get('articleBody','')\n", + " result['description_license'] = det_desc.get('license','')\n", + " result['description_url'] = det_desc.get('url','')\n", + " else:\n", + " result['description_extended'] = ''\n", + " result['description_license'] = ''\n", + " result['description_url'] = ''\n", + " result_img = item_result.get('image', '')\n", + " if result_img:\n", + " result['image_url'] = result_img.get('contentUrl', '')\n", + " result['name'] = item_result.get('name', '')\n", + " result['score'] = item.get('resultScore', 0.0)\n", + " result['url'] = item_result.get('url', '')\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "unmapped_persons = [{'kg_id': x} for x in kg_ids_msceleb]\n", + "opt_threads = 10\n", + "pbar = tqdm(total=len(unmapped_persons))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define thread mapping function\n", + "def pool_map_persons(obj):\n", + " global pbar\n", + " pbar.update(1)\n", + " kg_obj = get_kg_from_kg_obj(obj)\n", + " return kg_obj" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "#mapped_persons_bkup = mapped_persons.copy()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "886ce68bd7484d2fa4ab2da0beec5359", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# convert to thread pool\n", + "#mapped_persons = []\n", + "pool = ThreadPool(opt_threads)\n", + "\n", + "# start threading\n", + "with tqdm(total=len(unmapped_persons)) as pbar:\n", + " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", + "\n", + "# close tqdm\n", + "pbar.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "93418" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mapped_persons)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'kg_id': '/m/0dlnwb0', 'score': 14.806737, 'description': 'American internet celebrity', 'url': '', 'accessed': True, 'description_extended': 'Keenan Cahill is an American Internet celebrity from Chicago, Illinois who lip-syncs to popular songs on YouTube.\\nCahill launched his first famous lipsynced YouTube video on August 28, 2010 on the Katy Perry song Teenage Dream. 
', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Keenan_Cahill', 'name': 'Keenan Cahill'}, {'kg_id': '/m/047rtd1', 'score': 12.298853, 'description': 'Canadian film actor', 'url': '', 'accessed': True, 'description_extended': '', 'description_license': '', 'description_url': '', 'name': 'Nicholas Elia'}, {'kg_id': '/m/04j9rz9', 'score': 11.539564, 'description': 'Investor', 'url': '', 'accessed': True, 'description_extended': 'Nick Leslau is an English commercial property investor, with an estimated fortune in the Sunday Times Rich List of £350 million. Leslau is Chairman and Chief Executive of Prestbury Investment Holdings Limited and Chairman of Prestbury Investments LLP. ', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Nick_Leslau', 'name': 'Nick Leslau'}]" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mapped_persons[93415:]" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5\n" + ] + } + ], + "source": [ + "# reduce CC attribution string\n", + "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n", + "cc_short = 'CC BY-SA 3.0'\n", + "nchanged = 0\n", + "for mapped_person in mapped_persons:\n", + " license = mapped_person.get('description_license',None)\n", + " if license == cc_long:\n", + " nchanged += 1\n", + " mapped_person['description_license'] = cc_short\n", + "print(nchanged)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# find number not accessed\n", + "n_empty = 0\n", + "for mapped_person in mapped_persons:\n", + " if not mapped_person.get('accessed', False):\n", + " n_empty += 
1\n", + " print(mapped_person['kg_id'])\n", + "print(n_empty)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "# create dataframe\n", + "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", + "df_mapped_persons.index.name = 'index'\n", + "fp_mapped_persons = '/data_store_hdd/datasets/people/msceleb/metadata/identity_kg.csv'\n", + "df_mapped_persons.to_csv(fp_mapped_persons, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "df_mapped_persons.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "# create small version\n", + "limit = 1000\n", + "fp_mapped_persons_sm = f'/data_store_hdd/datasets/people/msceleb/metadata/identity_kg_0_{limit}.csv'\n", + "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", + "df_mapped_persons_sm.index.name = 'index'\n", + "df_mapped_persons_sm.to_csv(fp_mapped_persons_sm, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'kg_id': '/m/03c2nqz', 'score': 14.279573, 'description': 'Brazilian soccer player', 'url': '', 'accessed': True, 'description_extended': 'Cleiton Ribeiro Xavier is a Brazilian professional footballer who plays as an attacking midfielder for Vitória. 
He is known by his powerful and accurate free kicks, dribbling skills and passes.', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Cleiton_Xavier', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcSPzkNDBjtWX3f_oov7vOTlTxBNFrfIqEaIwJR26AsLfsBbP8H9', 'name': 'Cleiton Xavier'}\n" + ] + } + ], + "source": [ + "#a = get_kg_from_kg_obj({'kg_id': '/m/03c2nqz', 'accessed': False})\n", + "#print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/names_kg.ipynb b/megapixels/notebooks/datasets/names_kg.ipynb deleted file mode 100644 index ab4edc4b..00000000 --- a/megapixels/notebooks/datasets/names_kg.ipynb +++ /dev/null @@ -1,243 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Find Knowledge Graph Names" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import os.path as osp\n", - "from os.path import join\n", - "from glob import glob\n", - "import random\n", - "import math\n", - "import time\n", - "from datetime import datetime\n", - "\n", - "import requests\n", - "\n", - "import json\n", - "import urllib\n", - "from multiprocessing.pool import ThreadPool\n", - "import threading\n", - "from urllib.request import urlopen\n", - "import urllib.request\n", 
- "\n", - "\n", - "import cv2 as cv\n", - "import pandas as pd\n", - "from scipy.io import loadmat\n", - "import numpy as np\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from tqdm import tqdm_notebook as tqdm\n", - "%reload_ext autoreload\n", - "%autoreload 2\n", - "import sys\n", - "sys.path.append('/work/megapixels_dev/megapixels/')\n", - "from app.utils import file_utils, im_utils" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n", - "kg_ids_msceleb = [x.replace('m.', '/m/') for x in os.listdir(dir_msceleb)]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", - "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def get_kg_from_kg_obj(obj):\n", - " # TODO detect 503 service unavailable\n", - " timeout_error_msg = b'HTTP Error 503: Service Unavailable'\n", - " url_error_msg = b'HTTP Error 400: Bad Request'\n", - " global api_key, url_kg_api\n", - " kg_id = obj['kg_id']\n", - " params = {\n", - " 'ids': kg_id,\n", - " 'limit': 1,\n", - " 'indent': True,\n", - " 'key': api_key,\n", - " }\n", - " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", - " result = {'kg_id': kg_id, 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", - " try:\n", - " json_response = urllib.request.urlopen(url).read()\n", - " except Exception as e:\n", - " result['error'] = str(e)\n", - " else:\n", - " try:\n", - " response = json.loads(json_response)\n", - " items = response.get('itemListElement', [])\n", - " 
result['accessed'] = True\n", - " if items:\n", - " item = items[0]\n", - " item_result = item.get('result', [])\n", - " result['description'] = item_result.get('description', '')\n", - " det_desc = item_result.get('detailedDescription', '')\n", - " if det_desc:\n", - " result['description_extended'] = det_desc.get('articleBody','')\n", - " result['description_license'] = det_desc.get('license','')\n", - " result['description_url'] = det_desc.get('url','')\n", - " else:\n", - " result['description_extended'] = ''\n", - " result['description_license'] = ''\n", - " result['description_url'] = ''\n", - " result_img = item_result.get('image', '')\n", - " if result_img:\n", - " result['image_url'] = result_img.get('contentUrl', '')\n", - " result['name'] = item_result.get('name', '')\n", - " result['score'] = item.get('resultScore', 0.0)\n", - " result['url'] = item_result.get('url', '')\n", - " except Exception as e:\n", - " result['error'] = str(e)\n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "unmapped_persons = [{'kg_id': x} for x in kg_ids_msceleb]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2d0733764379489aa82ed20f20edbb9b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f38b47614c5b4894b7e026b6a46a5057", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "opt_threads = 10\n", - "pbar = tqdm(total=len(unmapped_persons))\n", - "\n", - "# define thread 
mapping function\n", - "def pool_map_persons(obj):\n", - " global pbar\n", - " pbar.update(1)\n", - " kg_obj = get_kg_from_kg_obj(obj)\n", - " return kg_obj\n", - "\n", - "# convert to thread pool\n", - "mapped_persons = []\n", - "pool = ThreadPool(opt_threads)\n", - "\n", - "# start threading\n", - "with tqdm(total=len(unmapped_persons)) as pbar:\n", - " mapped_persons = pool.map(pool_map_persons, unmapped_persons)\n", - "\n", - "# close tqdm\n", - "pbar.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "{'@id': 'kg:/m/01008l96',\n", - " 'name': 'Mohamed Guessous',\n", - " '@type': ['Thing', 'Person'],\n", - " 'description': 'Moroccan sociologist',\n", - " 'image': {'contentUrl': 'http://t2.gstatic.com/images?q=tbn:ANd9GcTAHGBU-4ZzSqcMbDPnSHZA10u0L9Hnppdvt_AnFdQzOYnS0aHM',\n", - " 'url': 'https://en.wikipedia.org/wiki/Mohamed_Guessous'},\n", - " 'detailedDescription': {'articleBody': 'Mohamed Guessous was a Moroccan sociologist. He was also an active politician in the Socialist Union of Popular Forces.',\n", - " 'url': 'https://en.wikipedia.org/wiki/Mohamed_Guessous',\n", - " 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'},\n", - " 'score': 11.046742}\n", - " ```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:megapixels]", - "language": "python", - "name": "conda-env-megapixels-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/megapixels/notebooks/datasets/pubfig83/identity.ipynb b/megapixels/notebooks/datasets/pubfig83/identity.ipynb 
new file mode 100644 index 00000000..697d9cee --- /dev/null +++ b/megapixels/notebooks/datasets/pubfig83/identity.ipynb @@ -0,0 +1,656 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PubFig83 Knowledge Graph Identities\n", + "\n", + "- convert filename-names to names\n", + "- fetch Google Knowledge Graph entity IDs for each name\n", + "- save KG IDs to CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "import random\n", + "import math\n", + "from pathlib import Path\n", + "from datetime import datetime\n", + "import requests\n", + "import json\n", + "import time\n", + "from pprint import pprint\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "import urllib.request\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get List of Names" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "83\n" + ] + } + ], + "source": [ + "dir_lfw = '/data_store_hdd/datasets/people/pubfig83/media/original/'\n", + "names = [x.replace('_', ' ') for x in os.listdir(dir_lfw)]\n", + "print(len(names))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "julia stiles\n" + ] + } + ], + "source": [ + "print(names[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Google Knowledge Graph API" + ] + }, + { + "cell_type": 
"code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# read API key\n", + "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", + "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def _get_kg_meta(result_obj, params):\n", + " global api_key, url_kg_api\n", + " \n", + " params['indent'] = True\n", + " params['key'] = api_key\n", + " params['limit'] = 1\n", + " \n", + " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", + " try:\n", + " json_response = urllib.request.urlopen(url).read()\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " else:\n", + " try:\n", + " response = json.loads(json_response)\n", + " items = response.get('itemListElement', [])\n", + " result_obj['accessed'] = True\n", + " if items:\n", + " item = items[0]\n", + " item_result = item.get('result', [])\n", + " result_obj['description'] = item_result.get('description', '')\n", + " det_desc = item_result.get('detailedDescription', '')\n", + " if not result_obj['kg_id']:\n", + " result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n", + " if det_desc:\n", + " result_obj['description_extended'] = det_desc.get('articleBody','')\n", + " result_obj['description_license'] = det_desc.get('license','')\n", + " result_obj['description_url'] = det_desc.get('url','')\n", + " else:\n", + " result_obj['description_extended'] = ''\n", + " result_obj['description_license'] = ''\n", + " result_obj['description_url'] = ''\n", + " result_img = item_result.get('image', '')\n", + " if result_img:\n", + " result_obj['image_url'] = result_img.get('contentUrl', '')\n", + " result_obj['name'] = item_result.get('name', '')\n", + " result_obj['score'] = item.get('resultScore', 0.0)\n", + " result_obj['url'] = item_result.get('url', '')\n", + " except Exception as e:\n", + " 
result_obj['error'] = str(e)\n", + " return result_obj\n", + " \n", + "def get_kg_from_name(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'query': obj['query']}\n", + " return _get_kg_meta(obj, params)\n", + " \n", + "def get_kg_from_kg_id(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'ids': obj['kg_id']}\n", + " return _get_kg_meta(obj, params)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'accessed': True,\n", + " 'description': 'Indian film director',\n", + " 'description_extended': 'Adoor Gopalakrishnan is an Indian film director, '\n", + " 'script writer, and producer. Adoor Gopalakrishnan '\n", + " 'had a major role in revolutioning Malayalam cinema '\n", + " 'during the 1970s and is regarded as one of the most '\n", + " 'notable filmmakers of India. ',\n", + " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n", + " 'description_url': 'https://en.wikipedia.org/wiki/Adoor_Gopalakrishnan',\n", + " 'image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQA-_aEYy_goHLhGJjmn558S1VEwcALB98m83I9HwUTV_gUsded',\n", + " 'kg_id': '/m/07s7wk',\n", + " 'name': 'Adoor Gopalakrishnan',\n", + " 'query': 'Adoor Gopalakrishnan',\n", + " 'score': 501.001862,\n", + " 'url': 'http://www.adoorgopalakrishnan.com'}\n" + ] + } + ], + "source": [ + "# test get from name\n", + "q = 'Adoor Gopalakrishnan'\n", + "obj = {'query': q, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", + "result = get_kg_from_name(obj)\n", + "pprint(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# define thread mapping function\n", + "def pool_map_persons(obj):\n", + " global pbar\n", + " pbar.update(1)\n", + " kg_obj = get_kg_from_name(obj)\n", + " 
return kg_obj" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# build mapped_person objects\n", + "mapped_persons = []\n", + "for fn in names:\n", + " obj = {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n", + " mapped_persons.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "83\n", + "['julia stiles', 'orlando bloom', 'adam sandler', 'victoria beckham', 'martha stewart', 'george clooney', 'steve carell', 'jennifer lopez', 'harrison ford', 'jessica alba']\n" + ] + } + ], + "source": [ + "print(len(mapped_persons))\n", + "print(names[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0af8e1f2d849473f933f506f5c8ced2b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12/83 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "09fa539f1d62416caf7fd217e7cf4892", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9 remaining. 
Sleeping...\n", + "9/83 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c22e1ce3e6e441839f12e88846612825", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "6 remaining. Sleeping...\n", + "6/83 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c7c5af3d562b475ea3420eca594cee85", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "5 remaining. Sleeping...\n", + "5/83 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7fcb0916185443cbbca9e553923e232f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2 remaining. 
Sleeping...\n", + "2/83 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7a5b35b2832d4e54bb87241f8bb29390", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "num_threads = 5\n", + "pbar = tqdm(total=len(mapped_persons))\n", + "\n", + "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + "\n", + "# convert to thread pool\n", + "while num_non_accessed > 0:\n", + " print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n", + " pool = ThreadPool(num_threads)\n", + "\n", + " # start threading\n", + " with tqdm(total=len(mapped_persons)) as pbar:\n", + " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", + "\n", + " # close tqdm\n", + " pbar.close()\n", + "\n", + " num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + " if num_non_accessed > 0:\n", + " print(f'{num_non_accessed} remaining. Sleeping...')\n", + " time.sleep(60) # wait X minutes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean data" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "updated CC license: 0\n", + "items w/o KG meta: 0\n" + ] + } + ], + "source": [ + "# reduce CC attribution string. 
the default string from Google Knowledge Graph is too verbose\n", + "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n", + "cc_short = 'CC BY-SA 3.0'\n", + "nchanged = 0\n", + "for mapped_person in mapped_persons:\n", + " license = mapped_person.get('description_license', None)\n", + " if license == cc_long:\n", + " nchanged += 1\n", + " mapped_person['description_license'] = cc_short\n", + "print(f'updated CC license: {nchanged}')\n", + "\n", + "# find number not accessed\n", + "n_empty = 0\n", + "for mapped_person in mapped_persons:\n", + " if not mapped_person.get('accessed', False):\n", + " n_empty += 1\n", + " print(mapped_person['kg_id'])\n", + "print(f'items w/o KG meta: {n_empty}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# create dataframe for mapped persons\n", + "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", + "df_mapped_persons.index.name = 'index'" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
accesseddescriptiondescription_extendeddescription_licensedescription_urlimage_urlkg_idnamequeryscoreurl
index
0TrueAmerican actressJulia O'Hara Stiles is an American actress. Bo...CC BY-SA 3.0https://en.wikipedia.org/wiki/Julia_Stileshttp://t1.gstatic.com/images?q=tbn:ANd9GcToFqB.../m/02jtjzJulia Stilesjulia stiles637.113647http://www.juliastilesblog.com
1TrueActorOrlando Jonathan Blanchard Bloom is an English...CC BY-SA 3.0https://en.wikipedia.org/wiki/Orlando_Bloomhttp://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc.../m/09wj5Orlando Bloomorlando bloom689.364319
\n", + "
" + ], + "text/plain": [ + " accessed description \\\n", + "index \n", + "0 True American actress \n", + "1 True Actor \n", + "\n", + " description_extended description_license \\\n", + "index \n", + "0 Julia O'Hara Stiles is an American actress. Bo... CC BY-SA 3.0 \n", + "1 Orlando Jonathan Blanchard Bloom is an English... CC BY-SA 3.0 \n", + "\n", + " description_url \\\n", + "index \n", + "0 https://en.wikipedia.org/wiki/Julia_Stiles \n", + "1 https://en.wikipedia.org/wiki/Orlando_Bloom \n", + "\n", + " image_url kg_id \\\n", + "index \n", + "0 http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB... /m/02jtjz \n", + "1 http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc... /m/09wj5 \n", + "\n", + " name query score \\\n", + "index \n", + "0 Julia Stiles julia stiles 637.113647 \n", + "1 Orlando Bloom orlando bloom 689.364319 \n", + "\n", + " url \n", + "index \n", + "0 http://www.juliastilesblog.com \n", + "1 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check output\n", + "df_mapped_persons.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "fp_out = '/data_store_hdd/datasets/people/pubfig83/metadata/identity_kg.csv'\n", + "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# create small version\n", + "limit = 1000\n", + "fpp_out = Path(fp_out)\n", + "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", + "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", + "df_mapped_persons_sm.index.name = 'index'\n", + "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda 
env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/umd_faces/identity.ipynb b/megapixels/notebooks/datasets/umd_faces/identity.ipynb new file mode 100644 index 00000000..a3da9d58 --- /dev/null +++ b/megapixels/notebooks/datasets/umd_faces/identity.ipynb @@ -0,0 +1,675 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# UMD Faces Knowledge Graph Identities\n", + "\n", + "- convert filename-names to names\n", + "- fetch Google Knowledge Graph entity IDs for each name\n", + "- save KG IDs to CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "from pathlib import Path\n", + "import random\n", + "import math\n", + "from datetime import datetime\n", + "import requests\n", + "import json\n", + "import time\n", + "from pprint import pprint\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "import urllib.request\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load IMDB Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "fp_filenames = '/data_store_hdd/datasets/people/umd_faces/downloads/filenames.txt'\n", + 
"with open(fp_filenames, 'r') as fp:\n", + " filenames = fp.readlines()\n", + "_ = filenames.pop(0)\n", + "filenames = [x.replace('_', ' ').strip() for x in filenames]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aaron rodgers\n" + ] + } + ], + "source": [ + "print(filenames[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Google Knowledge Graph API" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'{\\n \"YourFuckingIPAddress\": \"78.55.72.54\",\\n \"YourFuckingLocation\": \"Berlin, BE, Germany\",\\n \"YourFuckingHostname\": \"x4e374836.dyn.telefonica.de\",\\n \"YourFuckingISP\": \"O2 Deutschland\",\\n \"YourFuckingTorExit\": \"false\",\\n \"YourFuckingCountryCode\": \"DE\"\\n}\\n'" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "urllib.request.urlopen('https://wtfismyip.com/json').read()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# read API key\n", + "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", + "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "def _get_kg_meta(result_obj, params):\n", + " global api_key, url_kg_api\n", + " \n", + " params['indent'] = True\n", + " params['key'] = api_key\n", + " params['limit'] = 1\n", + " \n", + " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", + " try:\n", + " json_response = urllib.request.urlopen(url).read()\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " else:\n", + " try:\n", + " response = json.loads(json_response)\n", + " items = 
response.get('itemListElement', [])\n", + " result_obj['accessed'] = True\n", + " if items:\n", + " item = items[0]\n", + " item_result = item.get('result', [])\n", + " result_obj['description'] = item_result.get('description', '')\n", + " det_desc = item_result.get('detailedDescription', '')\n", + " if not result_obj['kg_id']:\n", + " result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n", + " if det_desc:\n", + " result_obj['description_extended'] = det_desc.get('articleBody','')\n", + " result_obj['description_license'] = det_desc.get('license','')\n", + " result_obj['description_url'] = det_desc.get('url','')\n", + " else:\n", + " result_obj['description_extended'] = ''\n", + " result_obj['description_license'] = ''\n", + " result_obj['description_url'] = ''\n", + " result_img = item_result.get('image', '')\n", + " if result_img:\n", + " result_obj['image_url'] = result_img.get('contentUrl', '')\n", + " result_obj['name'] = item_result.get('name', '')\n", + " result_obj['score'] = item.get('resultScore', 0.0)\n", + " result_obj['url'] = item_result.get('url', '')\n", + " except Exception as e:\n", + " result_obj['error'] = str(e)\n", + " return result_obj\n", + " \n", + "def get_kg_from_name(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'query': obj['query']}\n", + " return _get_kg_meta(obj, params)\n", + " \n", + "def get_kg_from_kg_id(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'ids': obj['kg_id']}\n", + " return _get_kg_meta(obj, params)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'accessed': False,\n", + " 'description': '',\n", + " 'error': '',\n", + " 'kg_id': '',\n", + " 'query': 'Taylor Swift',\n", + " 'score': 0.0,\n", + " 'url': ''}\n" + ] + } + ], + "source": [ + "pprint(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ 
+ { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'accessed': True,\n", + " 'description': 'American singer',\n", + " 'description_extended': 'Taylor Alison Swift is an American '\n", + " \"singer-songwriter. As one of the world's leading \"\n", + " 'contemporary recording artists, she is known for '\n", + " 'narrative songs about her personal life, which has '\n", + " 'received widespread media coverage.\\n',\n", + " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n", + " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n", + " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n", + " 'kg_id': '/m/0dl567',\n", + " 'name': 'Taylor Swift',\n", + " 'query': 'Taylor Swift',\n", + " 'score': 1241.476318,\n", + " 'url': 'http://taylorswift.com/'}\n" + ] + } + ], + "source": [ + "# test get from name\n", + "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", + "result = get_kg_from_name(obj)\n", + "pprint(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# define thread mapping function\n", + "def pool_map_persons(obj):\n", + " global pbar\n", + " pbar.update(1)\n", + " kg_obj = get_kg_from_name(obj)\n", + " return kg_obj" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# build mapped_person objects\n", + "mapped_persons = []\n", + "for fn in filenames:\n", + " obj = {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n", + " mapped_persons.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3107\n", + "['aaron rodgers', 'aaron ruell', 'aaron 
staton', 'abel ferrara', 'abigail klein', 'abraham benrubi', 'abyshamble', 'adabel guerrero', 'adam ant', 'adam buxton']\n" + ] + } + ], + "source": [ + "print(len(mapped_persons))\n", + "print(filenames[0:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "667\n" + ] + } + ], + "source": [ + "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + "print(num_non_accessed)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d38371156f594787ba242f451a3da650", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3/3107 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d7c35975a7ad48fba2b9a02eb8ea2277", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "num_threads = 20\n", + "pbar = tqdm(total=len(mapped_persons))\n", + "\n", + "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + "\n", + "# convert to thread pool\n", + "while num_non_accessed > 0:\n", + " print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n", + " pool = ThreadPool(num_threads)\n", + "\n", + " # start threading\n", + " with tqdm(total=len(mapped_persons)) as pbar:\n", + " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", + "\n", + " # close tqdm\n", + " pbar.close()\n", + "\n", + " num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + " if 
num_non_accessed > 0:\n", + " print(f'{num_non_accessed} remaining. Sleeping...')\n", + " time.sleep(60*10) # wait X minutes" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'query': 'aaron rodgers', 'kg_id': '/m/04q06_', 'score': 919.404602, 'description': 'Football quarterback', 'url': '', 'accessed': True, 'description_extended': 'Aaron Charles Rodgers is an American football quarterback for the Green Bay Packers of the National Football League. Rodgers played college football for the California Golden Bears, where he set several career passing records, including lowest single-season and career interception rates. ', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Aaron_Rodgers', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcTH_uiKmj_Y71Lc1kNCJK5HDiZsUSh3AxEBI9Jz_lp5q_89QZ9d', 'name': 'Aaron Rodgers'}\n" + ] + } + ], + "source": [ + "# test output for a person\n", + "print(mapped_persons[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# reduce CC attribution string. 
the default string from Google Knowledge Graph is too verbose\n", + "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n", + "cc_short = 'CC BY-SA 3.0'\n", + "nchanged = 0\n", + "for mapped_person in mapped_persons:\n", + " license = mapped_person.get('description_license', None)\n", + " if license == cc_long:\n", + " nchanged += 1\n", + " mapped_person['description_license'] = cc_short\n", + "print(nchanged)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# find number not accessed\n", + "n_empty = 0\n", + "for mapped_person in mapped_persons:\n", + " if not mapped_person.get('accessed', False):\n", + " n_empty += 1\n", + " print(mapped_person['kg_id'])\n", + "print(n_empty)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "# create dataframe for mapped persons\n", + "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", + "df_mapped_persons.index.name = 'index'" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
accesseddescriptiondescription_extendeddescription_licensedescription_urlimage_urlkg_idnamequeryscoreurl
index
0TrueFootball quarterbackAaron Charles Rodgers is an American football ...CC BY-SA 3.0https://en.wikipedia.org/wiki/Aaron_Rodgershttp://t3.gstatic.com/images?q=tbn:ANd9GcTH_ui.../m/04q06_Aaron Rodgersaaron rodgers919.404602
1TrueAmerican directorDerek Aaron Ruell, is an American director and...CC BY-SA 3.0https://en.wikipedia.org/wiki/Aaron_Ruellhttp://t3.gstatic.com/images?q=tbn:ANd9GcSzGg8.../m/05yf80Aaron Ruellaaron ruell439.912476
2TrueAmerican actorAaron Staton is an American actor. He is best ...CC BY-SA 3.0https://en.wikipedia.org/wiki/Aaron_Statonhttp://t3.gstatic.com/images?q=tbn:ANd9GcTTmBV.../m/06_vpyqAaron Statonaaron staton500.833344
3TrueAmerican filmmakerAbel Ferrara is an American filmmaker, known f...CC BY-SA 3.0https://en.wikipedia.org/wiki/Abel_Ferrarahttp://t2.gstatic.com/images?q=tbn:ANd9GcRAhy-.../m/056ryyAbel Ferraraabel ferrara522.177734http://www.abelferrara.com/
4TrueActressNaN/m/0pbm3jfAbigail Kleinabigail klein341.831482
\n", + "
" + ], + "text/plain": [ + " accessed description \\\n", + "index \n", + "0 True Football quarterback \n", + "1 True American director \n", + "2 True American actor \n", + "3 True American filmmaker \n", + "4 True Actress \n", + "\n", + " description_extended description_license \\\n", + "index \n", + "0 Aaron Charles Rodgers is an American football ... CC BY-SA 3.0 \n", + "1 Derek Aaron Ruell, is an American director and... CC BY-SA 3.0 \n", + "2 Aaron Staton is an American actor. He is best ... CC BY-SA 3.0 \n", + "3 Abel Ferrara is an American filmmaker, known f... CC BY-SA 3.0 \n", + "4 \n", + "\n", + " description_url \\\n", + "index \n", + "0 https://en.wikipedia.org/wiki/Aaron_Rodgers \n", + "1 https://en.wikipedia.org/wiki/Aaron_Ruell \n", + "2 https://en.wikipedia.org/wiki/Aaron_Staton \n", + "3 https://en.wikipedia.org/wiki/Abel_Ferrara \n", + "4 \n", + "\n", + " image_url kg_id \\\n", + "index \n", + "0 http://t3.gstatic.com/images?q=tbn:ANd9GcTH_ui... /m/04q06_ \n", + "1 http://t3.gstatic.com/images?q=tbn:ANd9GcSzGg8... /m/05yf80 \n", + "2 http://t3.gstatic.com/images?q=tbn:ANd9GcTTmBV... /m/06_vpyq \n", + "3 http://t2.gstatic.com/images?q=tbn:ANd9GcRAhy-... 
/m/056ryy \n", + "4 NaN /m/0pbm3jf \n", + "\n", + " name query score url \n", + "index \n", + "0 Aaron Rodgers aaron rodgers 919.404602 \n", + "1 Aaron Ruell aaron ruell 439.912476 \n", + "2 Aaron Staton aaron staton 500.833344 \n", + "3 Abel Ferrara abel ferrara 522.177734 http://www.abelferrara.com/ \n", + "4 Abigail Klein abigail klein 341.831482 " + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check output\n", + "df_mapped_persons.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "fp_out = '/data_store_hdd/datasets/people/umd_faces/metadata/identity_kg.csv'\n", + "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "# create small version\n", + "limit = 1000\n", + "fpp_out = Path(fp_out)\n", + "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", + "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", + "df_mapped_persons_sm.index.name = 'index'\n", + "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb b/megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb index c0051b7b..91ca1626 100644 --- a/megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb +++ 
b/megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb @@ -2012,7 +2012,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.6.6" } }, "nbformat": 4, diff --git a/megapixels/notebooks/datasets/vgg_face2/identity.ipynb b/megapixels/notebooks/datasets/vgg_face2/identity.ipynb new file mode 100644 index 00000000..66eeeb90 --- /dev/null +++ b/megapixels/notebooks/datasets/vgg_face2/identity.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# UMD Faces Knowledge Graph Identities\n", + "\n", + "- convert filename-names to names\n", + "- fetch Google Knowledge Graph entity IDs for each name\n", + "- save KG IDs to CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "import os.path as osp\n", + "from os.path import join\n", + "from glob import glob\n", + "import random\n", + "import math\n", + "from datetime import datetime\n", + "import requests\n", + "import json\n", + "import time\n", + "from pprint import pprint\n", + "from multiprocessing.pool import ThreadPool\n", + "import threading\n", + "import urllib.request\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd\n", + "from scipy.io import loadmat\n", + "import numpy as np\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load IMDB Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "fp_filenames = '/data_store_hdd/datasets/people/umd_faces/downloads/filenames.txt'\n", + "with open(fp_filenames, 'r') as fp:\n", + " filenames = fp.readlines()\n", + "_ = filenames.pop(0)\n", + "filenames = [x.replace('_', ' ').strip() for x in filenames]" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aaron rodgers\n" + ] + } + ], + "source": [ + "print(filenames[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Google Knowledge Graph API" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# read API key\n", + "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n", + "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def _get_kg_meta(result_obj, params):\n", + " global api_key, url_kg_api\n", + " \n", + " params['indent'] = True\n", + " params['key'] = api_key\n", + " params['limit'] = 1\n", + " \n", + " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n", + " try:\n", + " json_response = urllib.request.urlopen(url).read()\n", + " except Exception as e:\n", + " result['error'] = str(e)\n", + " else:\n", + " try:\n", + " response = json.loads(json_response)\n", + " items = response.get('itemListElement', [])\n", + " result_obj['accessed'] = True\n", + " if items:\n", + " item = items[0]\n", + " item_result = item.get('result', [])\n", + " result_obj['description'] = item_result.get('description', '')\n", + " det_desc = item_result.get('detailedDescription', '')\n", + " if not result_obj['kg_id']:\n", + " result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n", + " if det_desc:\n", + " result_obj['description_extended'] = det_desc.get('articleBody','')\n", + " result_obj['description_license'] = det_desc.get('license','')\n", + " result_obj['description_url'] = det_desc.get('url','')\n", + " else:\n", + " result_obj['description_extended'] = ''\n", + " result_obj['description_license'] = ''\n", + " result_obj['description_url'] = ''\n", + " 
result_img = item_result.get('image', '')\n", + " if result_img:\n", + " result_obj['image_url'] = result_img.get('contentUrl', '')\n", + " result_obj['name'] = item_result.get('name', '')\n", + " result_obj['score'] = item.get('resultScore', 0.0)\n", + " result_obj['url'] = item_result.get('url', '')\n", + " except Exception as e:\n", + " result_obj['error'] = str(e)\n", + " return result_obj\n", + " \n", + "def get_kg_from_name(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'query': obj['query']}\n", + " return _get_kg_meta(obj, params)\n", + " \n", + "def get_kg_from_kg_id(obj):\n", + " if obj['accessed']:\n", + " return obj\n", + " params = {'ids': obj['kg_id']}\n", + " return _get_kg_meta(obj, params)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'accessed': True,\n", + " 'description': 'American singer',\n", + " 'description_extended': 'Taylor Alison Swift is an American '\n", + " \"singer-songwriter. 
As one of the world's leading \"\n", + " 'contemporary recording artists, she is known for '\n", + " 'narrative songs about her personal life, which has '\n", + " 'received widespread media coverage.\\n',\n", + " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n", + " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n", + " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n", + " 'kg_id': '/m/0dl567',\n", + " 'name': 'Taylor Swift',\n", + " 'query': 'Taylor Swift',\n", + " 'score': 1241.476318,\n", + " 'url': 'http://taylorswift.com/'}\n" + ] + } + ], + "source": [ + "# test get from name\n", + "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n", + "result = get_kg_from_name(obj)\n", + "pprint(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# define thread mapping function\n", + "def pool_map_persons(obj):\n", + " global pbar\n", + " pbar.update(1)\n", + " kg_obj = get_kg_from_name(obj)\n", + " return kg_obj" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# build mapped_person objects\n", + "mapped_persons = []\n", + "for fn in filenames:\n", + " obj = {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n", + " mapped_persons.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3107\n", + "['aaron rodgers', 'aaron ruell', 'aaron staton', 'abel ferrara', 'abigail klein', 'abraham benrubi', 'abyshamble', 'adabel guerrero', 'adam ant', 'adam buxton']\n" + ] + } + ], + "source": [ + "print(len(mapped_persons))\n", + "print(filenames[0:10])" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4752a8e0280e4a58843a21401d9ed649", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1102/3107 remaining\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "882c60006b0d4a9e809297bbc1e86807", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "num_threads = 20\n", + "pbar = tqdm(total=len(mapped_persons))\n", + "\n", + "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + "\n", + "# convert to thread pool\n", + "while num_non_accessed > 0:\n", + " print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n", + " pool = ThreadPool(num_threads)\n", + "\n", + " # start threading\n", + " with tqdm(total=len(mapped_persons)) as pbar:\n", + " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n", + "\n", + " # close tqdm\n", + " pbar.close()\n", + "\n", + " num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n", + " if num_non_accessed > 0:\n", + " print(f'{num_non_accessed} remaining. Sleeping...')\n", + " time.sleep(60*20) # wait X minutes" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'query': \"'Lee' George Quinones\", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. 
He is one of several artists to gain fame from the New York City Subway graffiti movement.\\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee Quiñones'}\n" + ] + } + ], + "source": [ + "# test output for a person\n", + "print(mapped_persons[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose\n", + "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n", + "cc_short = 'CC BY-SA 3.0'\n", + "nchanged = 0\n", + "for mapped_person in mapped_persons:\n", + " license = mapped_person.get('description_license', None)\n", + " if license == cc_long:\n", + " nchanged += 1\n", + " mapped_person['description_license'] = cc_short\n", + "print(nchanged)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# find number not accessed\n", + "n_empty = 0\n", + "for mapped_person in mapped_persons:\n", + " if not mapped_person.get('accessed', False):\n", + " n_empty += 1\n", + " print(mapped_person['kg_id'])\n", + "print(n_empty)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create dataframe for mapped persons\n", + "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n", + "df_mapped_persons.index.name = 'index'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check output\n", + "df_mapped_persons.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "fp_out = 
'/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'\n", + "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create small version\n", + "limit = 1000\n", + "fpp_out = Path(fp_out)\n", + "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n", + "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n", + "df_mapped_persons_sm.index.name = 'index'\n", + "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for later, check similarity score to other identity kg CSVs\n", + "from difflib import SequenceMatcher\n", + "def similar(a, b):\n", + " return SequenceMatcher(None, a, b).ratio()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- cgit v1.2.3-70-g09d2