summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-01-28 18:11:36 +0100
committeradamhrv <adam@ahprojects.com>2019-01-28 18:11:36 +0100
commitdd2c36288aa1e8af14588f9258f6785879b8638c (patch)
tree543564ff7cc9b83ae1ecbc5b0d89bca9a6c17742
parentb0b06be0defe97ef19cf4d0f3328db40d299e110 (diff)
add utils for analyzing identities
-rw-r--r--megapixels/app/settings/types.py2
-rw-r--r--megapixels/app/utils/api_utils.py155
-rw-r--r--megapixels/commands/datasets/count_images.py30
-rw-r--r--megapixels/commands/datasets/fix_identity_key.py58
-rw-r--r--megapixels/commands/datasets/template.py30
-rw-r--r--megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb (renamed from megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb)186
-rw-r--r--megapixels/notebooks/datasets/imdb_wiki/identity.ipynb498
-rw-r--r--megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb468
-rw-r--r--megapixels/notebooks/datasets/knowledge_graph/identity.ipynb792
-rw-r--r--megapixels/notebooks/datasets/lfw/count_images.ipynb247
-rw-r--r--megapixels/notebooks/datasets/lfw/lfw_names.ipynb2
-rw-r--r--megapixels/notebooks/datasets/msceleb/identity.ipynb378
-rw-r--r--megapixels/notebooks/datasets/names_kg.ipynb243
-rw-r--r--megapixels/notebooks/datasets/pubfig83/identity.ipynb656
-rw-r--r--megapixels/notebooks/datasets/umd_faces/identity.ipynb675
-rw-r--r--megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb2
-rw-r--r--megapixels/notebooks/datasets/vgg_face2/identity.ipynb439
-rw-r--r--notes/datasets/new_datasets.md4
-rw-r--r--notes/datasets/youtube_faces.html1065
-rw-r--r--notes/datasets/youtube_faces.md6
-rw-r--r--notes/utils/bash_utils.md6
21 files changed, 5062 insertions, 880 deletions
diff --git a/megapixels/app/settings/types.py b/megapixels/app/settings/types.py
index 208215c2..933d1932 100644
--- a/megapixels/app/settings/types.py
+++ b/megapixels/app/settings/types.py
@@ -44,7 +44,7 @@ class LogLevel(Enum):
class Metadata(Enum):
IDENTITY, FILE_RECORD, FACE_VECTOR, FACE_POSE, \
FACE_ROI, FACE_LANDMARK_2D_68, FACE_LANDMARK_2D_5,FACE_LANDMARK_3D_68, \
- FACE_ATTRIBUTES = range(9)
+ FACE_ATTRIBUTES, IMAGE_COUNT = range(10)
class Dataset(Enum):
LFW, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
diff --git a/megapixels/app/utils/api_utils.py b/megapixels/app/utils/api_utils.py
new file mode 100644
index 00000000..ec00113e
--- /dev/null
+++ b/megapixels/app/utils/api_utils.py
@@ -0,0 +1,155 @@
+import json
+import urllib
+import urllib.parse
+import urllib.request
+
+
+class WikipediaAPI:
+
+ url_base = 'https://en.wikipedia.org/w/api.php'
+
+ def _url_builder(self, q):
+
+ # https://www.mediawiki.org/wiki/API%3aProperties#Info%3a_Parameters
+
+ params = {
+ 'redirects': '',
+ 'ppprop': 'displaytitle',
+ 'prop': 'pageprops|pageimages|description',
+ 'generator': 'prefixsearch',
+ 'action': 'query',
+ 'format': 'json',
+ 'piprop': 'thumbnail',
+ #'pithumbsize': 160,
+ 'pilimit': 1,
+ 'gpssearch': q,
+ 'gpsnamespace': 0,
+ 'gpslimit': 1
+ }
+ url = f'{self.url_base}?{urllib.parse.urlencode(params)}'
+ return url
+
+ def _api_search(self, url):
+ # set empty object
+ obj = {
+ 'wp_description': '',
+ 'wp_page_id': '',
+ 'wp_name': ''
+ }
+ try:
+ json_response = urllib.request.urlopen(url).read()
+ response = json.loads(json_response)
+ obj['wp_accessed'] = True
+ query = response.get('query', None)
+ if query:
+ pages = query.get('pages',[])
+ if pages:
+ page_id= list(pages.keys())[0]
+ if int(page_id) != -1:
+ page = pages[page_id]
+ # populate with successful result
+ obj['wp_name'] = page['title']
+ obj['wp_page_id'] = page_id
+ obj['wp_description'] = page.get('description', '') # not always available
+ # if fail, return None
+ except Exception as e:
+ obj['wp_error'] = e
+ obj['wp_accessed'] = False
+ return obj
+
+ def get_meta(self, query_obj):
+ '''Searches Wikipedia API for query string'''
+ if query_obj.get('wp_accessed', False):
+ return query_obj
+ else:
+ url = self._url_builder(query_obj['query'])
+ return self._api_search(url)
+
+ def search(self, q):
+ '''Searches Wikipedia API for query string'''
+ url = self._url_builder(q)
+ return self._api_search(url)
+
+
+class GoogleKnowledgeGraph:
+
+ url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'
+
+ def __init__(self, key):
+ self._api_key = key
+
+
+ def _get_kg_meta(self, result_obj, params):
+
+ params['indent'] = True # JSON indent
+ params['key'] = self._api_key
+ params['limit'] = 1
+ '''
+ Restricts returned entities to those of the specified types.
+ For example, you can specify `Person` (as defined in http://schema.org/Person)
+ to restrict the results to entities representing people.
+ If multiple types are specified, returned entities will contain one or more of these type'''
+ params['types'] = 'Person'
+
+ '''Enables prefix (initial substring) match against names and
+ aliases of entities. For example, a prefix `Jung` will match entities
+ and aliases such as `Jung`, `Jungle`, and `Jung-ho Kang`.'''
+ params['prefix'] = False
+
+ url = f'{self.url_kg_api}?{urllib.parse.urlencode(params)}'
+ try:
+ json_response = urllib.request.urlopen(url).read()
+ except Exception as e:
+ result_obj['kg_error'] = str(e)
+ result_obj['kg_accessed'] = False
+ else:
+ response = json.loads(json_response)
+ items = response.get('itemListElement', [])
+ if items:
+ item = items[0]
+ item_result = item.get('result', [])
+ result_obj['kg_url'] = item.get('url', '')
+ result_obj['kg_description'] = item_result.get('description', '')
+ result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')
+ result_obj['kg_name'] = item_result.get('name', '')
+ result_obj['kg_score'] = item.get('resultScore', 0.0)
+ det_desc = item_result.get('detailedDescription', '')
+ if det_desc:
+ result_obj['kg_bio'] = det_desc.get('articleBody','')
+ result_obj['kg_bio_url'] = det_desc.get('url','')
+ else:
+ result_obj['kg_bio'] = ''
+ result_obj['kg_bio_url'] = ''
+ result_img = item_result.get('image', '')
+ if result_img:
+ result_obj['kg_image_url'] = result_img.get('contentUrl', '')
+ else:
+ result_obj['kg_image_url'] = ''
+ result_obj['kg_error'] = ''
+ else:
+ # search was valid but no results
+ result_obj['kg_url'] = ''
+ result_obj['kg_description'] = ''
+ result_obj['kg_id'] = ''
+ result_obj['kg_name'] = ''
+ result_obj['kg_score'] = 0
+ result_obj['kg_bio'] = ''
+ result_obj['kg_bio_url'] = ''
+ result_obj['kg_image_url'] = ''
+
+ result_obj['kg_accessed'] = True
+
+ return result_obj
+
+
+ def get_kg_from_name(self, obj):
+ if obj.get('kg_accessed', False):
+ return obj
+ params = {'query': obj['query']}
+ return self._get_kg_meta(obj, params)
+
+
+ def get_kg_from_kg_id(self, obj):
+ if obj.get('kg_accessed', False):
+ return obj
+ params = {'ids': obj['kg_ig']}
+ return self._get_kg_meta(obj, params)
diff --git a/megapixels/commands/datasets/count_images.py b/megapixels/commands/datasets/count_images.py
new file mode 100644
index 00000000..2e952896
--- /dev/null
+++ b/megapixels/commands/datasets/count_images.py
@@ -0,0 +1,30 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.pass_context
+def cli(ctx, ):
+ """_template_"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ import cv2 as cv
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+ from app.models.data_store import DataStore
+
+ log = Logger.getLogger()
+ log.info('template works')
diff --git a/megapixels/commands/datasets/fix_identity_key.py b/megapixels/commands/datasets/fix_identity_key.py
new file mode 100644
index 00000000..ad2b555f
--- /dev/null
+++ b/megapixels/commands/datasets/fix_identity_key.py
@@ -0,0 +1,58 @@
+'''
+
+'''
+import click
+
+from app.settings import types
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+
+identity_sources = ['subdir', 'numeric']
+
+@click.command()
+@click.option('--data_store', 'opt_data_store',
+ type=cfg.DataStoreVar,
+ default=click_utils.get_default(types.DataStore.HDD),
+ show_default=True,
+ help=click_utils.show_help(types.Dataset))
+@click.option('--dataset', 'opt_dataset',
+ type=cfg.DatasetVar,
+ required=True,
+ show_default=True,
+ help=click_utils.show_help(types.Dataset))
+@click.pass_context
+def cli(ctx, opt_dataset, opt_data_store):
+ """Fix identity key to be slug"""
+
+ import sys, os
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+
+ import pandas as pd
+ from glob import glob
+ from slugify import slugify
+ from tqdm import tqdm
+
+ from app.models.data_store import DataStore
+
+
+ data_store = DataStore(opt_data_store, opt_dataset)
+ fp_records = data_store.metadata(types.Metadata.FILE_RECORD)
+
+
+ # ----------------------------------------------------------------
+ # load csv and slugify
+
+ df_records = pd.read_csv(fp_records, dtype=cfg.FILE_RECORD_DTYPES).set_index('index')
+ records = df_records.to_dict('records')
+ for r in tqdm(records):
+ r['identity_key'] = slugify(r['identity_key'], separator='_')
+ df_records = pd.DataFrame.from_dict(records)
+ df_records.index.name = 'index'
+ df_records.to_csv(fp_records)
+ log.info(f'wrote: {fp_records}') \ No newline at end of file
diff --git a/megapixels/commands/datasets/template.py b/megapixels/commands/datasets/template.py
new file mode 100644
index 00000000..2e952896
--- /dev/null
+++ b/megapixels/commands/datasets/template.py
@@ -0,0 +1,30 @@
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.pass_context
+def cli(ctx, ):
+ """_template_"""
+
+ import sys
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+
+ import pandas as pd
+ import cv2 as cv
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+ from app.models.data_store import DataStore
+
+ log = Logger.getLogger()
+ log.info('template works')
diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb b/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb
index 648fb9ac..1bf7b590 100644
--- a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_meta_debug.ipynb
+++ b/megapixels/notebooks/datasets/imdb_wiki/convert_mat.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# 06: Face pose dlib/MTCNN"
+ "# IMDB WIKI: Convert .mat to CSVs"
]
},
{
@@ -42,7 +42,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -52,7 +52,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -61,7 +61,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -107,7 +107,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -120,7 +120,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "d50c6e22d1694b54815a86d85cda6241",
+ "model_id": "8a4a106e3bee4fde89492ceef50b9c05",
"version_major": 2,
"version_minor": 0
},
@@ -145,7 +145,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -154,7 +154,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -276,7 +276,7 @@
"4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 "
]
},
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -286,161 +286,13 @@
]
},
{
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create DataFrame for metadata"
- ]
- },
- {
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
- "df_results = pd.DataFrame.from_dict(results_meta)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>celeb_id</th>\n",
- " <th>dob</th>\n",
- " <th>filepath</th>\n",
- " <th>gender</th>\n",
- " <th>name</th>\n",
- " <th>x1</th>\n",
- " <th>x2</th>\n",
- " <th>y1</th>\n",
- " <th>y2</th>\n",
- " <th>year_photo</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm124825600_1899-5-10_1968.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>1072.926000</td>\n",
- " <td>1214.784000</td>\n",
- " <td>161.838000</td>\n",
- " <td>303.696000</td>\n",
- " <td>1968</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm3343756032_1899-5-10_1970.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>477.184000</td>\n",
- " <td>622.592000</td>\n",
- " <td>100.352000</td>\n",
- " <td>245.760000</td>\n",
- " <td>1970</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm577153792_1899-5-10_1968.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>114.969643</td>\n",
- " <td>451.686572</td>\n",
- " <td>114.969643</td>\n",
- " <td>451.686572</td>\n",
- " <td>1968</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm946909184_1899-5-10_1968.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>622.885506</td>\n",
- " <td>844.339008</td>\n",
- " <td>424.217504</td>\n",
- " <td>645.671006</td>\n",
- " <td>1968</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm980463616_1899-5-10_1968.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>1013.859002</td>\n",
- " <td>1201.586128</td>\n",
- " <td>233.882042</td>\n",
- " <td>421.609168</td>\n",
- " <td>1968</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " celeb_id dob filepath gender \\\n",
- "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg m \n",
- "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg m \n",
- "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg m \n",
- "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg m \n",
- "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg m \n",
- "\n",
- " name x1 x2 y1 y2 year_photo \n",
- "0 Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 1968 \n",
- "1 Fred Astaire 477.184000 622.592000 100.352000 245.760000 1970 \n",
- "2 Fred Astaire 114.969643 451.686572 114.969643 451.686572 1968 \n",
- "3 Fred Astaire 622.885506 844.339008 424.217504 645.671006 1968 \n",
- "4 Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 1968 "
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_results.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_results.to_csv(join(dir_out,'imdb_wiki.csv'), index=False)"
+ "df_meta.index.name = 'index'\n",
+ "df_meta.to_csv(join(dir_out,'imdb_mat.csv'))"
]
},
{
@@ -452,16 +304,18 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "df_name_groups = df_results.groupby('name')\n",
+ "# count images per person and save to CSV\n",
+ "df_name_groups = df_meta.groupby('name')\n",
"images_per_person = []\n",
"for name, df_name in df_name_groups:\n",
" images_per_person.append({'name': name, 'num_images': len(df_name)})\n",
"df_images_per_person = pd.DataFrame.from_dict(images_per_person)\n",
- "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'), index=False)"
+ "df_images_per_person.index.name = 'index'\n",
+ "df_images_per_person.to_csv(join(dir_out, 'imdb_images_per_person.csv'))"
]
},
{
@@ -473,7 +327,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -482,7 +336,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -491,7 +345,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
diff --git a/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb b/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb
new file mode 100644
index 00000000..40d7bd86
--- /dev/null
+++ b/megapixels/notebooks/datasets/imdb_wiki/identity.ipynb
@@ -0,0 +1,498 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# IMDB-WIKI Knowledge Graph\n",
+ "\n",
+ "- convert names to Knowledge Graph entity IDs\n",
+ "- The `imdb.mat` file contains only full names, need KG ids `/m/12345`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import random\n",
+ "import math\n",
+ "from datetime import datetime\n",
+ "import requests\n",
+ "import json\n",
+ "import time\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load IMDB Metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>celeb_id</th>\n",
+ " <th>dob</th>\n",
+ " <th>filepath</th>\n",
+ " <th>gender</th>\n",
+ " <th>name</th>\n",
+ " <th>x1</th>\n",
+ " <th>x2</th>\n",
+ " <th>y1</th>\n",
+ " <th>y2</th>\n",
+ " <th>year_photo</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>6488</td>\n",
+ " <td>1900-5-11</td>\n",
+ " <td>01/nm0000001_rm124825600_1899-5-10_1968.jpg</td>\n",
+ " <td>m</td>\n",
+ " <td>Fred Astaire</td>\n",
+ " <td>1072.926</td>\n",
+ " <td>1214.784</td>\n",
+ " <td>161.838</td>\n",
+ " <td>303.696</td>\n",
+ " <td>1968</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>6488</td>\n",
+ " <td>1900-5-11</td>\n",
+ " <td>01/nm0000001_rm3343756032_1899-5-10_1970.jpg</td>\n",
+ " <td>m</td>\n",
+ " <td>Fred Astaire</td>\n",
+ " <td>477.184</td>\n",
+ " <td>622.592</td>\n",
+ " <td>100.352</td>\n",
+ " <td>245.760</td>\n",
+ " <td>1970</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " celeb_id dob filepath \\\n",
+ "index \n",
+ "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg \n",
+ "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg \n",
+ "\n",
+ " gender name x1 x2 y1 y2 year_photo \n",
+ "index \n",
+ "0 m Fred Astaire 1072.926 1214.784 161.838 303.696 1968 \n",
+ "1 m Fred Astaire 477.184 622.592 100.352 245.760 1970 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fp_meta_imdb = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_mat.csv'\n",
+ "df_meta_imdb = pd.read_csv(fp_meta_imdb).set_index('index')\n",
+ "df_meta_imdb.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Google Knowledge Graph API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read API key\n",
+ "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n",
+ "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _get_kg_meta(result_obj, params):\n",
+ "    '''Query the Knowledge Graph API and copy the top hit into result_obj.\n",
+ "\n",
+ "    Reads the notebook globals api_key and url_kg_api. When the request\n",
+ "    fails only 'error' is set and 'accessed' is left untouched.\n",
+ "    '''\n",
+ "    params['indent'] = True\n",
+ "    params['key'] = api_key\n",
+ "    params['limit'] = 1\n",
+ "\n",
+ "    url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+ "    try:\n",
+ "        json_response = urllib.request.urlopen(url).read()\n",
+ "    except Exception as e:\n",
+ "        # record the failure on the object being processed\n",
+ "        result_obj['error'] = str(e)\n",
+ "    else:\n",
+ "        try:\n",
+ "            response = json.loads(json_response)\n",
+ "            items = response.get('itemListElement', [])\n",
+ "            result_obj['accessed'] = True\n",
+ "            if items:\n",
+ "                item = items[0]\n",
+ "                item_result = item.get('result', {})\n",
+ "                result_obj['description'] = item_result.get('description', '')\n",
+ "                det_desc = item_result.get('detailedDescription', '')\n",
+ "                # keep a kg_id that was supplied by the caller\n",
+ "                if not result_obj.get('kg_id'):\n",
+ "                    result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
+ "                if det_desc:\n",
+ "                    result_obj['description_extended'] = det_desc.get('articleBody','')\n",
+ "                    result_obj['description_license'] = det_desc.get('license','')\n",
+ "                    result_obj['description_url'] = det_desc.get('url','')\n",
+ "                else:\n",
+ "                    result_obj['description_extended'] = ''\n",
+ "                    result_obj['description_license'] = ''\n",
+ "                    result_obj['description_url'] = ''\n",
+ "                result_img = item_result.get('image', '')\n",
+ "                if result_img:\n",
+ "                    result_obj['image_url'] = result_img.get('contentUrl', '')\n",
+ "                result_obj['name'] = item_result.get('name', '')\n",
+ "                result_obj['score'] = item.get('resultScore', 0.0)\n",
+ "                result_obj['url'] = item_result.get('url', '')\n",
+ "        except Exception as e:\n",
+ "            result_obj['error'] = str(e)\n",
+ "    return result_obj\n",
+ "\n",
+ "def get_kg_from_name(obj):\n",
+ "    '''Look up obj['query'] by name unless obj was already accessed.'''\n",
+ "    if obj['accessed']:\n",
+ "        return obj\n",
+ "    params = {'query': obj['query']}\n",
+ "    return _get_kg_meta(obj, params)\n",
+ "\n",
+ "def get_kg_from_kg_id(obj):\n",
+ "    '''Look up obj['kg_id'] by entity ID unless obj was already accessed.'''\n",
+ "    if obj['accessed']:\n",
+ "        return obj\n",
+ "    params = {'ids': obj['kg_id']}\n",
+ "    return _get_kg_meta(obj, params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'accessed': True,\n",
+ " 'description': 'American singer',\n",
+ " 'description_extended': 'Taylor Alison Swift is an American '\n",
+ " \"singer-songwriter. As one of the world's leading \"\n",
+ " 'contemporary recording artists, she is known for '\n",
+ " 'narrative songs about her personal life, which has '\n",
+ " 'received widespread media coverage.\\n',\n",
+ " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+ " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n",
+ " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n",
+ " 'kg_id': '/m/0dl567',\n",
+ " 'name': 'Taylor Swift',\n",
+ " 'query': 'Taylor Swift',\n",
+ " 'score': 1241.476318,\n",
+ " 'url': 'http://taylorswift.com/'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# make a test query to check if API works\n",
+ "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
+ "result = get_kg_from_name(obj)\n",
+ "pprint(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# make a test query by Knowledge Graph ID (the ID returned for the name query above)\n",
+ "obj = {'query': 'Taylor Swift', 'kg_id': '/m/0dl567', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
+ "result = get_kg_from_kg_id(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# build mapped_person objects\n",
+ "mapped_persons = []\n",
+ "count = 0\n",
+ "df_person_groups = df_meta_imdb.groupby('name')\n",
+ "for group_name, df_name_group in df_person_groups:\n",
+ " obj = {'query': group_name, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n",
+ " mapped_persons.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define thread mapping function\n",
+ "def pool_map_persons(obj):\n",
+ " global pbar\n",
+ " pbar.update(1)\n",
+ " kg_obj = get_kg_from_name(obj)\n",
+ " return kg_obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "87f6a2be42284199b8a67458f4090497",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=20284), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0/20284 remaining\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_threads = 2\n",
+ "\n",
+ "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+ "\n",
+ "# keep querying until every person has been accessed\n",
+ "while num_non_accessed > 0:\n",
+ "    print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+ "\n",
+ "    # one pool and one progress bar per pass; both context managers clean up\n",
+ "    # on exit so retries do not pile up worker threads or open bars.\n",
+ "    # pool_map_persons updates the global pbar bound here.\n",
+ "    with ThreadPool(num_threads) as pool, tqdm(total=len(mapped_persons)) as pbar:\n",
+ "        mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+ "\n",
+ "    num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "    if num_non_accessed > 0:\n",
+ "        print(f'{num_non_accessed}/{len(mapped_persons)} remaining. Sleeping...')\n",
+ "        time.sleep(60*20) # wait X minutes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'query': \"'Lee' George Quinones\", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. He is one of several artists to gain fame from the New York City Subway graffiti movement.\\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee Quiñones'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test output for a person\n",
+ "print(mapped_persons[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose\n",
+ "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+ "cc_short = 'CC BY-SA 3.0'\n",
+ "nchanged = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " license = mapped_person.get('description_license', None)\n",
+ " if license == cc_long:\n",
+ " nchanged += 1\n",
+ " mapped_person['description_license'] = cc_short\n",
+ "print(nchanged)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find number not accessed\n",
+ "n_empty = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " if not mapped_person.get('accessed', False):\n",
+ " n_empty += 1\n",
+ " print(mapped_person['kg_id'])\n",
+ "print(n_empty)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create dataframe for mapped persons\n",
+ "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n",
+ "df_mapped_persons.index.name = 'index'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# check output\n",
+ "df_mapped_persons.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save\n",
+ "fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'\n",
+ "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create small version\n",
+ "limit = 1000\n",
+ "fpp_out = Path(fp_out)\n",
+ "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n",
+ "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
+ "df_mapped_persons_sm.index.name = 'index'\n",
+ "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb b/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb
deleted file mode 100644
index b9a77fda..00000000
--- a/megapixels/notebooks/datasets/imdb_wiki/imdb_wiki_kg.ipynb
+++ /dev/null
@@ -1,468 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# IMDB-WIKI Knowledge Graph"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 110,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import os.path as osp\n",
- "from os.path import join\n",
- "from glob import glob\n",
- "import random\n",
- "import math\n",
- "from datetime import datetime\n",
- "import requests\n",
- "import json\n",
- "import urllib\n",
- "\n",
- "import cv2 as cv\n",
- "import pandas as pd\n",
- "from scipy.io import loadmat\n",
- "import numpy as np\n",
- "%matplotlib inline\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from tqdm import tqdm_notebook as tqdm\n",
- "%reload_ext autoreload\n",
- "%autoreload 2"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Load Metadata"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "fp_meta = '/data_store_hdd/datasets/people/imdb_wiki/metadata/imdb_wiki.csv'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_meta = pd.read_csv(fp_meta).set_index('index')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>celeb_id</th>\n",
- " <th>dob</th>\n",
- " <th>filepath</th>\n",
- " <th>gender</th>\n",
- " <th>name</th>\n",
- " <th>x1</th>\n",
- " <th>x2</th>\n",
- " <th>y1</th>\n",
- " <th>y2</th>\n",
- " <th>year_photo</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>index</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm124825600_1899-5-10_1968.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>1072.926000</td>\n",
- " <td>1214.784000</td>\n",
- " <td>161.838000</td>\n",
- " <td>303.696000</td>\n",
- " <td>1968</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm3343756032_1899-5-10_1970.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>477.184000</td>\n",
- " <td>622.592000</td>\n",
- " <td>100.352000</td>\n",
- " <td>245.760000</td>\n",
- " <td>1970</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm577153792_1899-5-10_1968.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>114.969643</td>\n",
- " <td>451.686572</td>\n",
- " <td>114.969643</td>\n",
- " <td>451.686572</td>\n",
- " <td>1968</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm946909184_1899-5-10_1968.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>622.885506</td>\n",
- " <td>844.339008</td>\n",
- " <td>424.217504</td>\n",
- " <td>645.671006</td>\n",
- " <td>1968</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>6488</td>\n",
- " <td>1900-5-11</td>\n",
- " <td>01/nm0000001_rm980463616_1899-5-10_1968.jpg</td>\n",
- " <td>m</td>\n",
- " <td>Fred Astaire</td>\n",
- " <td>1013.859002</td>\n",
- " <td>1201.586128</td>\n",
- " <td>233.882042</td>\n",
- " <td>421.609168</td>\n",
- " <td>1968</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " celeb_id dob filepath \\\n",
- "index \n",
- "0 6488 1900-5-11 01/nm0000001_rm124825600_1899-5-10_1968.jpg \n",
- "1 6488 1900-5-11 01/nm0000001_rm3343756032_1899-5-10_1970.jpg \n",
- "2 6488 1900-5-11 01/nm0000001_rm577153792_1899-5-10_1968.jpg \n",
- "3 6488 1900-5-11 01/nm0000001_rm946909184_1899-5-10_1968.jpg \n",
- "4 6488 1900-5-11 01/nm0000001_rm980463616_1899-5-10_1968.jpg \n",
- "\n",
- " gender name x1 x2 y1 y2 \\\n",
- "index \n",
- "0 m Fred Astaire 1072.926000 1214.784000 161.838000 303.696000 \n",
- "1 m Fred Astaire 477.184000 622.592000 100.352000 245.760000 \n",
- "2 m Fred Astaire 114.969643 451.686572 114.969643 451.686572 \n",
- "3 m Fred Astaire 622.885506 844.339008 424.217504 645.671006 \n",
- "4 m Fred Astaire 1013.859002 1201.586128 233.882042 421.609168 \n",
- "\n",
- " year_photo \n",
- "index \n",
- "0 1968 \n",
- "1 1970 \n",
- "2 1968 \n",
- "3 1968 \n",
- "4 1968 "
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_meta.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ids"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n",
- "\n",
- "def get_knowledge(q, api_key):\n",
- " service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n",
- " params = {\n",
- " 'query': q,\n",
- " 'limit': 5,\n",
- " 'indent': True,\n",
- " 'key': api_key,\n",
- " }\n",
- " url = service_url + '?' + urllib.parse.urlencode(params) # TODO: use requests\n",
- " response = json.loads(urllib.request.urlopen(url).read())\n",
- " response = response.get('itemListElement', [])\n",
- " if len(response) > 0:\n",
- " result = response[0].get('result', [])\n",
- " result['score'] = response[0]['resultScore']\n",
- " return result\n",
- " else:\n",
- " return []"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 106,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "General Secretary of the Communist Party of China\n",
- "Xi Jinping\n"
- ]
- },
- {
- "ename": "KeyError",
- "evalue": "'url'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-106-654588fe3a11>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'description'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'url'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'score'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mKeyError\u001b[0m: 'url'"
- ]
- }
- ],
- "source": [
- "# test\n",
- "q = 'Xi Jinping'\n",
- "r = get_knowledge(q, api_key)\n",
- "print(r['description'])\n",
- "print(r['name'])\n",
- "print(r['url'])\n",
- "print(r['score'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 107,
- "metadata": {},
- "outputs": [],
- "source": [
- "from pprint import pprint"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 108,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "kg:/m/06ff60\n"
- ]
- }
- ],
- "source": [
- "print(r['@id'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 89,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'@id': 'kg:/g/11f4ksbzcm',\n",
- " '@type': ['Thing', 'Event'],\n",
- " 'detailedDescription': {'articleBody': 'On February 14, 2018, a gunman opened '\n",
- " 'fire at Marjory Stoneman Douglas High '\n",
- " 'School in Parkland, Florida, killing '\n",
- " 'seventeen students and staff members '\n",
- " 'and injuring seventeen others. ',\n",
- " 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
- " 'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n",
- " 'image': {'contentUrl': 'http://t1.gstatic.com/images?q=tbn:ANd9GcQmY7VqmGt4zEJU8Rc4EwPWroYd-L0QQ5wkZfiFO-WRqNBC-FPN',\n",
- " 'url': 'https://en.wikipedia.org/wiki/Stoneman_Douglas_High_School_shooting'},\n",
- " 'name': 'Stoneman Douglas High School shooting',\n",
- " 'score': 60.411652}\n"
- ]
- }
- ],
- "source": [
- "pprint(r)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "metadata": {},
- "outputs": [],
- "source": [
- "kgs_msceleb = os.listdir(dir_msceleb)\n",
- "kgs_msceleb = ['/' + x.replace('.','/') for x in kgs_msceleb]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 109,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 109,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "'/m/06ff60' in kgs_msceleb"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 111,
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_kg_by_id(kg_id, api_key):\n",
- " service_url = 'https://kgsearch.googleapis.com/v1/entities:search'\n",
- " params = {\n",
- " 'ids': kg_id,\n",
- " 'limit': 1,\n",
- " 'indent': True,\n",
- " 'key': api_key,\n",
- " }\n",
- " url = service_url + '?' + urllib.parse.urlencode(params) # TODO: use requests\n",
- " try:\n",
- " response = json.loads(urllib.request.urlopen(url).read())\n",
- " response = response.get('itemListElement', [])\n",
- " result = response[0].get('result', [])\n",
- " result['score'] = response[0]['resultScore']\n",
- " return result\n",
- " except Exception as e:\n",
- " return []"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 122,
- "metadata": {},
- "outputs": [],
- "source": [
- "a = get_kg_by_id('/m/0100n5bs', api_key)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 123,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[]"
- ]
- },
- "execution_count": 123,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "a"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python [conda env:megapixels]",
- "language": "python",
- "name": "conda-env-megapixels-py"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb b/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb
new file mode 100644
index 00000000..81a74faf
--- /dev/null
+++ b/megapixels/notebooks/datasets/knowledge_graph/identity.ipynb
@@ -0,0 +1,792 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Knowledge Graph Identities\n",
+ "\n",
+ "- convert filename-names to names\n",
+ "- fetch Google Knowledge Graph entity IDs for each name\n",
+ "- save KG IDs to CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import random\n",
+ "import math\n",
+ "from datetime import datetime\n",
+ "import requests\n",
+ "import json\n",
+ "import time\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "import difflib\n",
+ "import unidecode\n",
+ "import slugify\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels')\n",
+ "from app.utils import api_utils\n",
+ "from app.settings import types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get List of Names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_names(enum_dataset):\n",
+ " if enum_dataset == types.Dataset.LFW:\n",
+ " dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'\n",
+ " names = [x.replace('_', ' ') for x in os.listdir(dir_lfw)]\n",
+ " elif enum_dataset == types.Dataset.YOUTUBE_FACES:\n",
+ " names = [x for x in names if 'labeled faces.txt' not in x]\n",
+ " return names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Kim Clijsters', 'William Rosenberg', 'John Brady', 'Juan Ignacio Chela', 'Floyd Keith', 'Sam Gerald', 'Imad Khadduri', 'Anna Kournikova', 'Jacques Rogge', 'Wilbert Elki Meza Majino']\n"
+ ]
+ }
+ ],
+ "source": [
+ "names = get_names(types.Dataset.LFW)\n",
+ "print(names[0:10])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Google Knowledge Graph API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read API key\n",
+ "api_key = open('/work/megapixels_dev/env/google_knowledge_graph_api.env').read()\n",
+ "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n",
+ "wp_api = api_utils.WikipediaAPI()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 241,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "wp\n",
+ "{'wp_accessed': True, 'wp_description': '', 'wp_name': '', 'wp_page_id': ''}\n",
+ "kg\n",
+ "{'kg_accessed': True,\n",
+ " 'kg_bio': '',\n",
+ " 'kg_bio_url': '',\n",
+ " 'kg_description': '',\n",
+ " 'kg_id': '',\n",
+ " 'kg_image_url': '',\n",
+ " 'kg_name': '',\n",
+ " 'kg_score': 0,\n",
+ " 'kg_url': '',\n",
+ " 'query': 'Jeff Dederian'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "#wp_api.test_access()\n",
+ "print('wp')\n",
+ "pprint(wp_api.get_meta({'query': 'Florecita Cobian'}))\n",
+ "print('kg')\n",
+ "pprint(kg_api.get_kg_from_name({'query':'Jeff Dederian'}))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Test Name Similarity Matching"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 242,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def same_person(query, name, sim_min=.9, word_match_min=0.75, verbose=False):\n",
+ " if name == '':\n",
+ " return False\n",
+ " # check and remove if WP added parenthesis\n",
+ " if '(' in name and ')' in name:\n",
+ " name = name.split('(')[0]\n",
+ " \n",
+ " # then strip spaces and split into list\n",
+ " query_strings = [unidecode.unidecode(x.strip().lower()) for x in query.strip().split(' ')] # query\n",
+ " result_strings = [unidecode.unidecode(x.strip().lower()) for x in name.strip().split(' ')] # result\n",
+ " min_str_len = min(len(result_strings), len(query_strings))\n",
+ " # match each word in the query\n",
+ " matched_strings = []\n",
+ " \n",
+ " for i in range(len(query_strings)):\n",
+ " # compare against each word in a copy of the result words\n",
+ " result_strings_tmp = result_strings.copy()\n",
+ " for j in range(len(result_strings_tmp)):\n",
+ " a = query_strings[i]\n",
+ " b = result_strings_tmp[j]\n",
+ " # scale the pass threshold by the length ratio of the two words\n",
+ " lengths = [len(a), len(b)]\n",
+ " min_ratio = (min(lengths) / max(lengths) * .75)\n",
+ " ratio = difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()\n",
+ " result = (ratio >= min_ratio)\n",
+ " if verbose:\n",
+ " print(f'comapre \"{a}\" to \"{b}\" ratio was: {ratio:.2f} min: {min_ratio:.2}, passed: {result}')\n",
+ " if result:\n",
+ " # remove this item from result strings\n",
+ " matched_string = result_strings.pop(j)\n",
+ " matched_strings.append(matched_string)\n",
+ " break # exit loop and use shortened result string haystack\n",
+ "\n",
+ " matched = len(matched_strings) >= min_str_len\n",
+ " if verbose:\n",
+ " print(f'{matched} because {len(matched_strings)} >= {min_str_len}')\n",
+ " return matched"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 245,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(Adoor Gopalakrishnan == Adoors Gopalakarishnan ok) = True\n",
+ "\n",
+ "comapre \"dave\" to \"david\" ratio was: 0.67 min: 0.6, passed: True\n",
+ "comapre \"letterman\" to \"letterman\" ratio was: 1.00 min: 0.75, passed: True\n",
+ "True because 2 >= 2\n",
+ "(David Letterman == Dave Letterman) = True\n",
+ "\n",
+ "comapre \"charles\" to \"charles\" ratio was: 1.00 min: 0.75, passed: True\n",
+ "comapre \"dickens\" to \"booker\" ratio was: 0.31 min: 0.64, passed: False\n",
+ "False because 1 >= 2\n",
+ "(Charles Booker == Charles Dickens) = False\n",
+ "\n",
+ "comapre \"donald\" to \"don\" ratio was: 0.67 min: 0.38, passed: True\n",
+ "comapre \"trump\" to \"j.\" ratio was: 0.00 min: 0.3, passed: False\n",
+ "comapre \"trump\" to \"trump\" ratio was: 1.00 min: 0.75, passed: True\n",
+ "True because 2 >= 2\n",
+ "(Don J. Trump == Donald Trump) = True\n",
+ "\n",
+ "comapre \"wang\" to \"wang\" ratio was: 1.00 min: 0.75, passed: True\n",
+ "comapre \"fei\" to \"fei\" ratio was: 1.00 min: 0.75, passed: True\n",
+ "True because 2 >= 2\n",
+ "(Wang Fei (female footballer) == Wang Fei) = True\n"
+ ]
+ }
+ ],
+ "source": [
+ "test_sim_match = True\n",
+ "if test_sim_match:\n",
+ " # Test name similarity search\n",
+ " query = 'Adoors Gopalakarishnan ok'\n",
+ " wp_name = 'Adoor Gopalakrishnan'\n",
+ " matched = same_person(query, wp_name)\n",
+ " print(f'({wp_name} == {query}) = {matched}')\n",
+ " print('')\n",
+ "\n",
+ " query = 'Dave Letterman'\n",
+ " wp_name = 'David Letterman'\n",
+ " matched = same_person(query, wp_name, verbose=True)\n",
+ " print(f'({wp_name} == {query}) = {matched}')\n",
+ " print('')\n",
+ "\n",
+ " query = 'Charles Dickens'\n",
+ " wp_name = 'Charles Booker'\n",
+ " matched = same_person(query, wp_name, verbose=True)\n",
+ " print(f'({wp_name} == {query}) = {matched}')\n",
+ " print('')\n",
+ "\n",
+ " query = 'Donald Trump'\n",
+ " wp_name = 'Don J. Trump'\n",
+ " matched = same_person(query, wp_name, verbose=True)\n",
+ " print(f'({wp_name} == {query}) = {matched}')\n",
+ " print('')\n",
+ " \n",
+ " query = 'Wang Fei'\n",
+ " kg_name = 'Faye Wong'\n",
+ " wp_name = 'Wang Fei (female footballer)'\n",
+ " matched = same_person(query, wp_name, verbose=True)\n",
+ " print(f'({wp_name} == {query}) = {matched}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 246,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define thread mapping function\n",
+ "def pool_map_persons(obj):\n",
+ " global pbar\n",
+ " pbar.update(1)\n",
+ " kg_obj = kg_api.get_kg_from_name(obj)\n",
+ " wp_obj = wp_api.get_meta(obj)\n",
+ " person_obj = {**kg_obj, **wp_obj}\n",
+ " return person_obj\n",
+ "\n",
+ "def num_non_accessed(mps):\n",
+ " return sum(0 if (x.get('kg_accessed', False) and x.get('wp_accessed', False)) else 1 for x in mps)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load existing CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load existing CSV\n",
+ "fp_csv = '/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n",
+ "df = pd.read_csv(fp_csv, encoding = 'utf-16').set_index('index')\n",
+ "# fill nulls\n",
+ "df.fillna('', inplace = True)\n",
+ "mapped_persons = df.to_dict('records')\n",
+ "# add columns\n",
+ "for mp in mapped_persons:\n",
+ " mp['wp_error'] = ''\n",
+ " mp['kg_error'] = ''"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get Knowledge Graph Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5507f5c19de746df94aa5445e3c7cf46",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "832/5749 remaining\n",
+ "832/5749 remaining. Using 5 threads\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "411d08f873174d13a1de1f8b21f9f993",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done. 0 remaining.\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_threads_max = 5\n",
+ "sleep_min = 1\n",
+ "pbar = tqdm(total=len(mapped_persons))\n",
+ "\n",
+ "nna = num_non_accessed(mapped_persons)\n",
+ "print(f'{nna}/{len(mapped_persons)} remaining')\n",
+ "\n",
+ "# convert to thread pool\n",
+ "while nna > 0:\n",
+ " num_threads = max(1, min(num_threads_max, nna))\n",
+ " print(f'{nna}/{len(mapped_persons)} remaining. Using {num_threads} threads')\n",
+ " pool = ThreadPool(num_threads)\n",
+ "\n",
+ " # start threading\n",
+ " with tqdm(total=len(mapped_persons)) as pbar:\n",
+ " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+ "\n",
+ " # close tqdm\n",
+ " pbar.close()\n",
+ "\n",
+ " nna = num_non_accessed(mapped_persons)\n",
+ " if nna > 0:\n",
+ " print(f'{nna} remaining. Sleeping for {sleep_min} minutes...')\n",
+ " time.sleep(60 * sleep_min)\n",
+ "\n",
+ "print(f'Done. {nna} remaining.')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Resolve match status from Knowledge Graph and Wikipedia results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 220,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "for i, mp in enumerate(mapped_persons):\n",
+ " kg_name = mp.get('kg_name')\n",
+ " wp_name = mp.get('wp_name')\n",
+ " query = mp.get('query')\n",
+ " name_orig = mp.get('source_name')\n",
+ " kg_score = int(mp.get('kg_score',0))\n",
+ "\n",
+ " kg_matches = same_person(name_orig, kg_name)\n",
+ " wp_matches = same_person(name_orig, wp_name)\n",
+ "\n",
+ " if kg_matches and wp_matches and kg_score > 100:\n",
+ " # very likely a match, confirm it\n",
+ " match_status = 2 # supermatch\n",
+ " # default to using wp because descriptions are more appropriate/updated\n",
+ " source = 'wp'\n",
+ " elif kg_matches and wp_matches:\n",
+ " match_status = 1\n",
+ " # default to using wp because descriptions are more appropriate/updated\n",
+ " source = 'wp'\n",
+ " elif kg_matches and not wp_matches:\n",
+ " # if the KG score is medium-high, but wp failed, needs review\n",
+ " source = 'kg'\n",
+ " match_status = 0\n",
+ " elif wp_matches and not kg_matches:\n",
+ " # if wikipedia text matched the query, then confirm\n",
+ " source = 'wp'\n",
+ " match_status = 0\n",
+ " else:\n",
+ " # no information available\n",
+ " match_status = -1\n",
+ " source = None\n",
+ " \n",
+ " slug = slugify.slugify(name_orig, separator='_')\n",
+ " mp_bio = mp.get('kg_bio', '')\n",
+ " wp_desc = mp.get('wp_description', '')\n",
+ " source_url = f\"http://vis-www.cs.umass.edu/lfw/person/{name_orig.replace(' ', '_')}.html\"\n",
+ " \n",
+ " if source == 'kg':\n",
+ " # google knowledge graph\n",
+ " mp_name = mp['kg_name']\n",
+ " mp_description = mp.get('kg_description', '')\n",
+ " elif source == 'wp':\n",
+ " # wikipedia\n",
+ " mp_name = mp['wp_name']\n",
+ " mp_description = mp.get('wp_description', '')\n",
+ " \n",
+ " if 'disambiguation' in wp_desc.lower():\n",
+ " #print(f\"disambiguate: {name_orig}\")\n",
+ " match_status = 0 # needs review if \"disambiguation appears\"\n",
+ " mp_name = ''\n",
+ " mp_description = ''\n",
+ " mp_bio = ''\n",
+ " \n",
+ " mp['source_url'] = source_url\n",
+ " mp['mp_slug'] = slug\n",
+ " mp['matched'] = match_status\n",
+ " mp['mp_bio'] = mp_bio\n",
+ " mp['mp_name'] = mp_name\n",
+ " mp['mp_description'] = mp_description"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 221,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "match: 4359\n",
+ "review: 718\n",
+ "fail: 672\n",
+ "no kg accessed: 0\n",
+ "no wp accessed: 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"match: {sum(1 if (x.get('matched') > 0) else 0 for x in mapped_persons)}\")\n",
+ "print(f\"review: {sum(1 if (x.get('matched') == 0) else 0 for x in mapped_persons)}\")\n",
+ "print(f\"fail: {sum(1 if (x.get('matched') == -1) else 0 for x in mapped_persons)}\")\n",
+ "\n",
+ "print(f\"no kg accessed: {sum(0 if (x.get('kg_accessed', False)) else 1 for x in mapped_persons)}\")\n",
+ "print(f\"no wp accessed: {sum(0 if (x.get('wp_accessed', False)) else 1 for x in mapped_persons)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Save data to CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 235,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create dataframe for mapped persons\n",
+ "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n",
+ "df_mapped_persons.index.name = 'index'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 236,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save\n",
+ "fp_out = f'/data_store_hdd/datasets/people/lfw/metadata/identity_kg.csv'\n",
+ "df_mapped_persons.drop(['kg_accessed', 'wp_accessed', 'kg_error', 'wp_error'], axis=1, inplace=True)\n",
+ "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')\n",
+ "# create small version\n",
+ "limit = 1000\n",
+ "fpp_out = Path(fp_out)\n",
+ "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n",
+ "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
+ "df_mapped_persons_sm.index.name = 'index'\n",
+ "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 237,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>kg_bio</th>\n",
+ " <th>kg_bio_url</th>\n",
+ " <th>kg_description</th>\n",
+ " <th>kg_id</th>\n",
+ " <th>kg_image_url</th>\n",
+ " <th>kg_name</th>\n",
+ " <th>kg_score</th>\n",
+ " <th>kg_url</th>\n",
+ " <th>matched</th>\n",
+ " <th>mp_bio</th>\n",
+ " <th>mp_description</th>\n",
+ " <th>mp_name</th>\n",
+ " <th>mp_slug</th>\n",
+ " <th>query</th>\n",
+ " <th>source</th>\n",
+ " <th>source_name</th>\n",
+ " <th>source_url</th>\n",
+ " <th>wp_description</th>\n",
+ " <th>wp_name</th>\n",
+ " <th>wp_page_id</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>Kim Antonie Lode Clijsters is a Belgian former...</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Kim_Clijsters</td>\n",
+ " <td>Belgian tennis player</td>\n",
+ " <td>/m/01m_gh</td>\n",
+ " <td>http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK...</td>\n",
+ " <td>Kim Clijsters</td>\n",
+ " <td>618.272705</td>\n",
+ " <td></td>\n",
+ " <td>2</td>\n",
+ " <td>Kim Antonie Lode Clijsters is a Belgian former...</td>\n",
+ " <td>Belgian tennis player</td>\n",
+ " <td>Kim Clijsters</td>\n",
+ " <td>kim_clijsters</td>\n",
+ " <td>Kim Clijsters</td>\n",
+ " <td>lfw</td>\n",
+ " <td>Kim_Clijsters</td>\n",
+ " <td>http://vis-www.cs.umass.edu/lfw/person/Kim_Cli...</td>\n",
+ " <td>Belgian tennis player</td>\n",
+ " <td>Kim Clijsters</td>\n",
+ " <td>262793</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>William Rosenberg was an American entrepreneur...</td>\n",
+ " <td>https://en.wikipedia.org/wiki/William_Rosenberg</td>\n",
+ " <td>American entrepreneur</td>\n",
+ " <td>/m/07dy4z</td>\n",
+ " <td></td>\n",
+ " <td>William Rosenberg</td>\n",
+ " <td>367.879730</td>\n",
+ " <td></td>\n",
+ " <td>2</td>\n",
+ " <td>William Rosenberg was an American entrepreneur...</td>\n",
+ " <td>American businessman</td>\n",
+ " <td>William Rosenberg</td>\n",
+ " <td>william_rosenberg</td>\n",
+ " <td>William Rosenberg</td>\n",
+ " <td>lfw</td>\n",
+ " <td>William_Rosenberg</td>\n",
+ " <td>http://vis-www.cs.umass.edu/lfw/person/William...</td>\n",
+ " <td>American businessman</td>\n",
+ " <td>William Rosenberg</td>\n",
+ " <td>2.44981e+06</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " kg_bio \\\n",
+ "index \n",
+ "0 Kim Antonie Lode Clijsters is a Belgian former... \n",
+ "1 William Rosenberg was an American entrepreneur... \n",
+ "\n",
+ " kg_bio_url kg_description \\\n",
+ "index \n",
+ "0 https://en.wikipedia.org/wiki/Kim_Clijsters Belgian tennis player \n",
+ "1 https://en.wikipedia.org/wiki/William_Rosenberg American entrepreneur \n",
+ "\n",
+ " kg_id kg_image_url \\\n",
+ "index \n",
+ "0 /m/01m_gh http://t3.gstatic.com/images?q=tbn:ANd9GcQ4yRK... \n",
+ "1 /m/07dy4z \n",
+ "\n",
+ " kg_name kg_score kg_url matched \\\n",
+ "index \n",
+ "0 Kim Clijsters 618.272705 2 \n",
+ "1 William Rosenberg 367.879730 2 \n",
+ "\n",
+ " mp_bio \\\n",
+ "index \n",
+ "0 Kim Antonie Lode Clijsters is a Belgian former... \n",
+ "1 William Rosenberg was an American entrepreneur... \n",
+ "\n",
+ " mp_description mp_name mp_slug \\\n",
+ "index \n",
+ "0 Belgian tennis player Kim Clijsters kim_clijsters \n",
+ "1 American businessman William Rosenberg william_rosenberg \n",
+ "\n",
+ " query source source_name \\\n",
+ "index \n",
+ "0 Kim Clijsters lfw Kim_Clijsters \n",
+ "1 William Rosenberg lfw William_Rosenberg \n",
+ "\n",
+ " source_url \\\n",
+ "index \n",
+ "0 http://vis-www.cs.umass.edu/lfw/person/Kim_Cli... \n",
+ "1 http://vis-www.cs.umass.edu/lfw/person/William... \n",
+ "\n",
+ " wp_description wp_name wp_page_id \n",
+ "index \n",
+ "0 Belgian tennis player Kim Clijsters 262793 \n",
+ "1 American businessman William Rosenberg 2.44981e+06 "
+ ]
+ },
+ "execution_count": 237,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_mapped_persons.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Clean data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 225,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "for mp in mapped_persons:\n",
+ " mp['source_name'] = mp['source_name'].replace(' ', '_')\n",
+ "# mp['kg_description'] = mp['kg_description'].strip()\n",
+ "# mp['kg_name'] = mp['kg_name'].strip()\n",
+ "# mp['kg_bio_url'] = mp['kg_bio_url'].strip()\n",
+ "# mp['kg_bio'] = mp['kg_bio'].strip()\n",
+ "# mp['kg_url'] = mp['kg_url'].strip()\n",
+ " \n",
+ "# mp['wp_description'] = mp['wp_description'].strip()\n",
+ "# mp['wp_name'] = mp['wp_name'].strip()\n",
+ " \n",
+ "# mp['mp_name'] = ''\n",
+ "# mp['mp_bio'] = ''\n",
+ "# mp['mp_description'] = ''\n",
+ "# mp['mp_slug'] = ''\n",
+ " \n",
+ " #mp.setdefault('kg_description','')\n",
+ "# if mp.get('kg_score', 0) == 0:\n",
+ "# mp['kg_image_url'] = ''\n",
+ "# mp['kg_bio_url'] = ''\n",
+ "# mp['kg_id'] = ''\n",
+ "# mp['kg_url'] = ''\n",
+ "# mp['kg_description'] = ''\n",
+ "# mp['kg_bio_url'] = ''\n",
+ "# mp['kg_name'] = ''\n",
+ "# if mp['kg_url'] == [] or mp['kg_url'] == '[]':\n",
+ "# mp['kg_url'] = ''\n",
+ "\n",
+ " try:\n",
+ " _ = mp.pop('wp_bio')\n",
+ " except:\n",
+ " pass"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/lfw/count_images.ipynb b/megapixels/notebooks/datasets/lfw/count_images.ipynb
new file mode 100644
index 00000000..26682f8b
--- /dev/null
+++ b/megapixels/notebooks/datasets/lfw/count_images.ipynb
@@ -0,0 +1,247 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Count Images for LFW\n",
+ "\n",
+ "- use sub-directory as `identity_key`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "from pprint import pprint\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from slugify import slugify\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels')\n",
+ "from app.utils import file_utils\n",
+ "from app.settings import types, app_cfg\n",
+ "from app.models.data_store import DataStore"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get image counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "opt_dataset = types.Dataset.LFW\n",
+ "opt_data_store = types.DataStore.HDD\n",
+ "data_store = DataStore(opt_data_store, opt_dataset)\n",
+ "# get metadata file paths\n",
+ "fp_records = data_store.metadata(types.Metadata.FILE_RECORD)\n",
+ "fp_img_counts = data_store.metadata(types.Metadata.IMAGE_COUNT)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_records = pd.read_csv(fp_records).set_index('index')\n",
+ "records = df_records.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# paths\n",
+ "fp_dirs = '/data_store_hdd/datasets/people/lfw/media/original/'\n",
+ "\n",
+ "fp_out = '/data_store_hdd/datasets/people/lfw/metadata/image_counts.csv'\n",
+ "\n",
+ "# glob\n",
+ "dirs = glob(join(fp_dirs,'*'))\n",
+ "\n",
+ "# count images\n",
+ "image_counts = []\n",
+ "\n",
+ "for d in tqdm(dirs):\n",
+ " # get number of images\n",
+ " files = file_utils.glob_multi(d, ['jpg', 'png'], recursive=False)\n",
+ " count = len(files)\n",
+ " name = Path(d).stem\n",
+ " image_counts.append({'identity_key': name, 'count': count})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_counts = pd.DataFrame.from_dict(image_counts)\n",
+ "df_counts.index.name = 'index'\n",
+ "df_counts.to_csv(fp_out)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>count</th>\n",
+ " <th>identity_key</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>14</td>\n",
+ " <td>Kim_Clijsters</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>1</td>\n",
+ " <td>William_Rosenberg</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>2</td>\n",
+ " <td>John_Brady</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>3</td>\n",
+ " <td>Juan_Ignacio_Chela</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>1</td>\n",
+ " <td>Floyd_Keith</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " count identity_key\n",
+ "index \n",
+ "0 14 Kim_Clijsters\n",
+ "1 1 William_Rosenberg\n",
+ "2 2 John_Brady\n",
+ "3 3 Juan_Ignacio_Chela\n",
+ "4 1 Floyd_Keith"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_counts.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/lfw/lfw_names.ipynb b/megapixels/notebooks/datasets/lfw/lfw_names.ipynb
index 37a1bd8f..8c474dd7 100644
--- a/megapixels/notebooks/datasets/lfw/lfw_names.ipynb
+++ b/megapixels/notebooks/datasets/lfw/lfw_names.ipynb
@@ -218,7 +218,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.7"
+ "version": "3.6.6"
}
},
"nbformat": 4,
diff --git a/megapixels/notebooks/datasets/msceleb/identity.ipynb b/megapixels/notebooks/datasets/msceleb/identity.ipynb
new file mode 100644
index 00000000..d330badb
--- /dev/null
+++ b/megapixels/notebooks/datasets/msceleb/identity.ipynb
@@ -0,0 +1,378 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Knowledge Graph MS Celeb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "import random\n",
+ "import math\n",
+ "import time\n",
+ "from datetime import datetime\n",
+ "\n",
+ "import requests\n",
+ "\n",
+ "import json\n",
+ "import urllib\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "from urllib.request import urlopen\n",
+ "import urllib.request\n",
+ "\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels/')\n",
+ "from app.utils import file_utils, im_utils"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n",
+ "kg_ids_msceleb = [x.replace('m.', '/m/') for x in os.listdir(dir_msceleb)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n",
+ "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_kg_meta(obj, url):\n",
+ " \n",
+ "def get_kg_from_name(obj):\n",
+ " \n",
+ "def get_kg_from_kg_id(obj):\n",
+ " # TODO detect 503 service unavailable\n",
+ " if obj['accessed']:\n",
+ " return obj\n",
+ " global api_key, url_kg_api\n",
+ " kg_id = obj['kg_id']\n",
+ " params = {\n",
+ " 'query': q,\n",
+ " 'limit': 5,\n",
+ " 'indent': True,\n",
+ " 'key': api_key,\n",
+ " }\n",
+ " \n",
+ " params = {\n",
+ " 'ids': kg_id,\n",
+ " 'limit': 1,\n",
+ " 'indent': True,\n",
+ " 'key': api_key,\n",
+ " }\n",
+ " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+ " result = {'kg_id': kg_id, 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
+ " try:\n",
+ " json_response = urllib.request.urlopen(url).read()\n",
+ " except Exception as e:\n",
+ " result['error'] = str(e)\n",
+ " else:\n",
+ " try:\n",
+ " response = json.loads(json_response)\n",
+ " items = response.get('itemListElement', [])\n",
+ " result['accessed'] = True\n",
+ " if items:\n",
+ " item = items[0]\n",
+ " item_result = item.get('result', [])\n",
+ " result['description'] = item_result.get('description', '')\n",
+ " det_desc = item_result.get('detailedDescription', '')\n",
+ " if det_desc:\n",
+ " result['description_extended'] = det_desc.get('articleBody','')\n",
+ " result['description_license'] = det_desc.get('license','')\n",
+ " result['description_url'] = det_desc.get('url','')\n",
+ " else:\n",
+ " result['description_extended'] = ''\n",
+ " result['description_license'] = ''\n",
+ " result['description_url'] = ''\n",
+ " result_img = item_result.get('image', '')\n",
+ " if result_img:\n",
+ " result['image_url'] = result_img.get('contentUrl', '')\n",
+ " result['name'] = item_result.get('name', '')\n",
+ " result['score'] = item.get('resultScore', 0.0)\n",
+ " result['url'] = item_result.get('url', '')\n",
+ " except Exception as e:\n",
+ " result['error'] = str(e)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unmapped_persons = [{'kg_id': x} for x in kg_ids_msceleb]\n",
+ "opt_threads = 10\n",
+ "pbar = tqdm(total=len(unmapped_persons))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define thread mapping function\n",
+ "def pool_map_persons(obj):\n",
+ " global pbar\n",
+ " pbar.update(1)\n",
+ " kg_obj = get_kg_from_kg_obj(obj)\n",
+ " return kg_obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#mapped_persons_bkup = mapped_persons.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "886ce68bd7484d2fa4ab2da0beec5359",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# convert to thread pool\n",
+ "#mapped_persons = []\n",
+ "pool = ThreadPool(opt_threads)\n",
+ "\n",
+ "# start threading\n",
+ "with tqdm(total=len(unmapped_persons)) as pbar:\n",
+ " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+ "\n",
+ "# close tqdm\n",
+ "pbar.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "93418"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(mapped_persons)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'kg_id': '/m/0dlnwb0', 'score': 14.806737, 'description': 'American internet celebrity', 'url': '', 'accessed': True, 'description_extended': 'Keenan Cahill is an American Internet celebrity from Chicago, Illinois who lip-syncs to popular songs on YouTube.\\nCahill launched his first famous lipsynced YouTube video on August 28, 2010 on the Katy Perry song Teenage Dream. ', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Keenan_Cahill', 'name': 'Keenan Cahill'}, {'kg_id': '/m/047rtd1', 'score': 12.298853, 'description': 'Canadian film actor', 'url': '', 'accessed': True, 'description_extended': '', 'description_license': '', 'description_url': '', 'name': 'Nicholas Elia'}, {'kg_id': '/m/04j9rz9', 'score': 11.539564, 'description': 'Investor', 'url': '', 'accessed': True, 'description_extended': 'Nick Leslau is an English commercial property investor, with an estimated fortune in the Sunday Times Rich List of £350 million. Leslau is Chairman and Chief Executive of Prestbury Investment Holdings Limited and Chairman of Prestbury Investments LLP. ', 'description_license': 'CC BY-SA 3.0', 'description_url': 'https://en.wikipedia.org/wiki/Nick_Leslau', 'name': 'Nick Leslau'}]"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mapped_persons[93415:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "5\n"
+ ]
+ }
+ ],
+ "source": [
+ "# reduce CC attribution string\n",
+ "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+ "cc_short = 'CC BY-SA 3.0'\n",
+ "nchanged = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " license = mapped_person.get('description_license',None)\n",
+ " if license == cc_long:\n",
+ " nchanged += 1\n",
+ " mapped_person['description_license'] = cc_short\n",
+ "print(nchanged)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find number not accessed\n",
+ "n_empty = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " if not mapped_person.get('accessed', False):\n",
+ " n_empty += 1\n",
+ " print(mapped_person['kg_id'])\n",
+ "print(n_empty)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create dataframe\n",
+ "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n",
+ "df_mapped_persons.index.name = 'index'\n",
+ "fp_mapped_persons = '/data_store_hdd/datasets/people/msceleb/metadata/identity_kg.csv'\n",
+ "df_mapped_persons.to_csv(fp_mapped_persons, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_mapped_persons.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create small version\n",
+ "limit = 1000\n",
+ "fp_mapped_persons_sm = f'/data_store_hdd/datasets/people/msceleb/metadata/identity_kg_0_{limit}.csv'\n",
+ "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
+ "df_mapped_persons_sm.index.name = 'index'\n",
+ "df_mapped_persons_sm.to_csv(fp_mapped_persons_sm, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'kg_id': '/m/03c2nqz', 'score': 14.279573, 'description': 'Brazilian soccer player', 'url': '', 'accessed': True, 'description_extended': 'Cleiton Ribeiro Xavier is a Brazilian professional footballer who plays as an attacking midfielder for Vitória. He is known by his powerful and accurate free kicks, dribbling skills and passes.', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Cleiton_Xavier', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcSPzkNDBjtWX3f_oov7vOTlTxBNFrfIqEaIwJR26AsLfsBbP8H9', 'name': 'Cleiton Xavier'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "#a = get_kg_from_kg_obj({'kg_id': '/m/03c2nqz', 'accessed': False})\n",
+ "#print(a)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/names_kg.ipynb b/megapixels/notebooks/datasets/names_kg.ipynb
deleted file mode 100644
index ab4edc4b..00000000
--- a/megapixels/notebooks/datasets/names_kg.ipynb
+++ /dev/null
@@ -1,243 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Find Knowledge Graph Names"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import os.path as osp\n",
- "from os.path import join\n",
- "from glob import glob\n",
- "import random\n",
- "import math\n",
- "import time\n",
- "from datetime import datetime\n",
- "\n",
- "import requests\n",
- "\n",
- "import json\n",
- "import urllib\n",
- "from multiprocessing.pool import ThreadPool\n",
- "import threading\n",
- "from urllib.request import urlopen\n",
- "import urllib.request\n",
- "\n",
- "\n",
- "import cv2 as cv\n",
- "import pandas as pd\n",
- "from scipy.io import loadmat\n",
- "import numpy as np\n",
- "%matplotlib inline\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "from tqdm import tqdm_notebook as tqdm\n",
- "%reload_ext autoreload\n",
- "%autoreload 2\n",
- "import sys\n",
- "sys.path.append('/work/megapixels_dev/megapixels/')\n",
- "from app.utils import file_utils, im_utils"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Load Metadata"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "dir_msceleb = '/data_store_hdd/datasets/people/msceleb/media/original/'\n",
- "kg_ids_msceleb = [x.replace('m.', '/m/') for x in os.listdir(dir_msceleb)]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n",
- "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_kg_from_kg_obj(obj):\n",
- " # TODO detect 503 service unavailable\n",
- " timeout_error_msg = b'HTTP Error 503: Service Unavailable'\n",
- " url_error_msg = b'HTTP Error 400: Bad Request'\n",
- " global api_key, url_kg_api\n",
- " kg_id = obj['kg_id']\n",
- " params = {\n",
- " 'ids': kg_id,\n",
- " 'limit': 1,\n",
- " 'indent': True,\n",
- " 'key': api_key,\n",
- " }\n",
- " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
- " result = {'kg_id': kg_id, 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
- " try:\n",
- " json_response = urllib.request.urlopen(url).read()\n",
- " except Exception as e:\n",
- " result['error'] = str(e)\n",
- " else:\n",
- " try:\n",
- " response = json.loads(json_response)\n",
- " items = response.get('itemListElement', [])\n",
- " result['accessed'] = True\n",
- " if items:\n",
- " item = items[0]\n",
- " item_result = item.get('result', [])\n",
- " result['description'] = item_result.get('description', '')\n",
- " det_desc = item_result.get('detailedDescription', '')\n",
- " if det_desc:\n",
- " result['description_extended'] = det_desc.get('articleBody','')\n",
- " result['description_license'] = det_desc.get('license','')\n",
- " result['description_url'] = det_desc.get('url','')\n",
- " else:\n",
- " result['description_extended'] = ''\n",
- " result['description_license'] = ''\n",
- " result['description_url'] = ''\n",
- " result_img = item_result.get('image', '')\n",
- " if result_img:\n",
- " result['image_url'] = result_img.get('contentUrl', '')\n",
- " result['name'] = item_result.get('name', '')\n",
- " result['score'] = item.get('resultScore', 0.0)\n",
- " result['url'] = item_result.get('url', '')\n",
- " except Exception as e:\n",
- " result['error'] = str(e)\n",
- " return result"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "unmapped_persons = [{'kg_id': x} for x in kg_ids_msceleb]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "2d0733764379489aa82ed20f20edbb9b",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "f38b47614c5b4894b7e026b6a46a5057",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=93418), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "opt_threads = 10\n",
- "pbar = tqdm(total=len(unmapped_persons))\n",
- "\n",
- "# define thread mapping function\n",
- "def pool_map_persons(obj):\n",
- " global pbar\n",
- " pbar.update(1)\n",
- " kg_obj = get_kg_from_kg_obj(obj)\n",
- " return kg_obj\n",
- "\n",
- "# convert to thread pool\n",
- "mapped_persons = []\n",
- "pool = ThreadPool(opt_threads)\n",
- "\n",
- "# start threading\n",
- "with tqdm(total=len(unmapped_persons)) as pbar:\n",
- " mapped_persons = pool.map(pool_map_persons, unmapped_persons)\n",
- "\n",
- "# close tqdm\n",
- "pbar.close()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "```\n",
- "{'@id': 'kg:/m/01008l96',\n",
- " 'name': 'Mohamed Guessous',\n",
- " '@type': ['Thing', 'Person'],\n",
- " 'description': 'Moroccan sociologist',\n",
- " 'image': {'contentUrl': 'http://t2.gstatic.com/images?q=tbn:ANd9GcTAHGBU-4ZzSqcMbDPnSHZA10u0L9Hnppdvt_AnFdQzOYnS0aHM',\n",
- " 'url': 'https://en.wikipedia.org/wiki/Mohamed_Guessous'},\n",
- " 'detailedDescription': {'articleBody': 'Mohamed Guessous was a Moroccan sociologist. He was also an active politician in the Socialist Union of Popular Forces.',\n",
- " 'url': 'https://en.wikipedia.org/wiki/Mohamed_Guessous',\n",
- " 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'},\n",
- " 'score': 11.046742}\n",
- " ```"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python [conda env:megapixels]",
- "language": "python",
- "name": "conda-env-megapixels-py"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/megapixels/notebooks/datasets/pubfig83/identity.ipynb b/megapixels/notebooks/datasets/pubfig83/identity.ipynb
new file mode 100644
index 00000000..697d9cee
--- /dev/null
+++ b/megapixels/notebooks/datasets/pubfig83/identity.ipynb
@@ -0,0 +1,656 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PubFig83 Knowledge Graph Identities\n",
+ "\n",
+ "- convert filename-names to names\n",
+ "- fetch Google Knowledge Graph entity IDs for each name\n",
+ "- save KG IDs to CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "import random\n",
+ "import math\n",
+ "from pathlib import Path\n",
+ "from datetime import datetime\n",
+ "import requests\n",
+ "import json\n",
+ "import time\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get List of Names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "83\n"
+ ]
+ }
+ ],
+ "source": [
+ "dir_lfw = '/data_store_hdd/datasets/people/pubfig83/media/original/'\n",
+ "names = [x.replace('_', ' ') for x in os.listdir(dir_lfw)]\n",
+ "print(len(names))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "julia stiles\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(names[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Google Knowledge Graph API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read API key\n",
+ "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n",
+ "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _get_kg_meta(result_obj, params):\n",
+ " global api_key, url_kg_api\n",
+ " \n",
+ " params['indent'] = True\n",
+ " params['key'] = api_key\n",
+ " params['limit'] = 1\n",
+ " \n",
+ " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+ " try:\n",
+ " json_response = urllib.request.urlopen(url).read()\n",
+ " except Exception as e:\n",
+ " result['error'] = str(e)\n",
+ " else:\n",
+ " try:\n",
+ " response = json.loads(json_response)\n",
+ " items = response.get('itemListElement', [])\n",
+ " result_obj['accessed'] = True\n",
+ " if items:\n",
+ " item = items[0]\n",
+ " item_result = item.get('result', [])\n",
+ " result_obj['description'] = item_result.get('description', '')\n",
+ " det_desc = item_result.get('detailedDescription', '')\n",
+ " if not result_obj['kg_id']:\n",
+ " result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
+ " if det_desc:\n",
+ " result_obj['description_extended'] = det_desc.get('articleBody','')\n",
+ " result_obj['description_license'] = det_desc.get('license','')\n",
+ " result_obj['description_url'] = det_desc.get('url','')\n",
+ " else:\n",
+ " result_obj['description_extended'] = ''\n",
+ " result_obj['description_license'] = ''\n",
+ " result_obj['description_url'] = ''\n",
+ " result_img = item_result.get('image', '')\n",
+ " if result_img:\n",
+ " result_obj['image_url'] = result_img.get('contentUrl', '')\n",
+ " result_obj['name'] = item_result.get('name', '')\n",
+ " result_obj['score'] = item.get('resultScore', 0.0)\n",
+ " result_obj['url'] = item_result.get('url', '')\n",
+ " except Exception as e:\n",
+ " result_obj['error'] = str(e)\n",
+ " return result_obj\n",
+ " \n",
+ "def get_kg_from_name(obj):\n",
+ " if obj['accessed']:\n",
+ " return obj\n",
+ " params = {'query': obj['query']}\n",
+ " return _get_kg_meta(obj, params)\n",
+ " \n",
+ "def get_kg_from_kg_id(obj):\n",
+ " if obj['accessed']:\n",
+ " return obj\n",
+ " params = {'ids': obj['kg_id']}\n",
+ " return _get_kg_meta(obj, params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'accessed': True,\n",
+ " 'description': 'Indian film director',\n",
+ " 'description_extended': 'Adoor Gopalakrishnan is an Indian film director, '\n",
+ " 'script writer, and producer. Adoor Gopalakrishnan '\n",
+ " 'had a major role in revolutioning Malayalam cinema '\n",
+ " 'during the 1970s and is regarded as one of the most '\n",
+ " 'notable filmmakers of India. ',\n",
+ " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+ " 'description_url': 'https://en.wikipedia.org/wiki/Adoor_Gopalakrishnan',\n",
+ " 'image_url': 'http://t2.gstatic.com/images?q=tbn:ANd9GcQA-_aEYy_goHLhGJjmn558S1VEwcALB98m83I9HwUTV_gUsded',\n",
+ " 'kg_id': '/m/07s7wk',\n",
+ " 'name': 'Adoor Gopalakrishnan',\n",
+ " 'query': 'Adoor Gopalakrishnan',\n",
+ " 'score': 501.001862,\n",
+ " 'url': 'http://www.adoorgopalakrishnan.com'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test get from name\n",
+ "q = 'Adoor Gopalakrishnan'\n",
+ "obj = {'query': q, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
+ "result = get_kg_from_name(obj)\n",
+ "pprint(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define thread mapping function\n",
+ "def pool_map_persons(obj):\n",
+ " global pbar\n",
+ " pbar.update(1)\n",
+ " kg_obj = get_kg_from_name(obj)\n",
+ " return kg_obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# build mapped_person objects\n",
+ "mapped_persons = []\n",
+ "for fn in names:\n",
+ " obj = {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n",
+ " mapped_persons.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "83\n",
+ "['julia stiles', 'orlando bloom', 'adam sandler', 'victoria beckham', 'martha stewart', 'george clooney', 'steve carell', 'jennifer lopez', 'harrison ford', 'jessica alba']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(mapped_persons))\n",
+ "print(names[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0af8e1f2d849473f933f506f5c8ced2b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "12/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "09fa539f1d62416caf7fd217e7cf4892",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9 remaining. Sleeping...\n",
+ "9/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c22e1ce3e6e441839f12e88846612825",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "6 remaining. Sleeping...\n",
+ "6/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c7c5af3d562b475ea3420eca594cee85",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "5 remaining. Sleeping...\n",
+ "5/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7fcb0916185443cbbca9e553923e232f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "2 remaining. Sleeping...\n",
+ "2/83 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7a5b35b2832d4e54bb87241f8bb29390",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=83), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_threads = 5\n",
+ "pbar = tqdm(total=len(mapped_persons))\n",
+ "\n",
+ "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "\n",
+ "# convert to thread pool\n",
+ "while num_non_accessed > 0:\n",
+ " print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+ " pool = ThreadPool(num_threads)\n",
+ "\n",
+ " # start threading\n",
+ " with tqdm(total=len(mapped_persons)) as pbar:\n",
+ " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+ "\n",
+ " # close tqdm\n",
+ " pbar.close()\n",
+ "\n",
+ " num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ " if num_non_accessed > 0:\n",
+ " print(f'{num_non_accessed} remaining. Sleeping...')\n",
+ " time.sleep(60) # wait 1 minute before retrying"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Clean data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "updated CC license: 0\n",
+ "items w/o KG meta: 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose\n",
+ "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+ "cc_short = 'CC BY-SA 3.0'\n",
+ "nchanged = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " license = mapped_person.get('description_license', None)\n",
+ " if license == cc_long:\n",
+ " nchanged += 1\n",
+ " mapped_person['description_license'] = cc_short\n",
+ "print(f'updated CC license: {nchanged}')\n",
+ "\n",
+ "# find number not accessed\n",
+ "n_empty = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " if not mapped_person.get('accessed', False):\n",
+ " n_empty += 1\n",
+ " print(mapped_person['kg_id'])\n",
+ "print(f'items w/o KG meta: {n_empty}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create dataframe for mapped persons\n",
+ "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n",
+ "df_mapped_persons.index.name = 'index'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>accessed</th>\n",
+ " <th>description</th>\n",
+ " <th>description_extended</th>\n",
+ " <th>description_license</th>\n",
+ " <th>description_url</th>\n",
+ " <th>image_url</th>\n",
+ " <th>kg_id</th>\n",
+ " <th>name</th>\n",
+ " <th>query</th>\n",
+ " <th>score</th>\n",
+ " <th>url</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>True</td>\n",
+ " <td>American actress</td>\n",
+ " <td>Julia O'Hara Stiles is an American actress. Bo...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Julia_Stiles</td>\n",
+ " <td>http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB...</td>\n",
+ " <td>/m/02jtjz</td>\n",
+ " <td>Julia Stiles</td>\n",
+ " <td>julia stiles</td>\n",
+ " <td>637.113647</td>\n",
+ " <td>http://www.juliastilesblog.com</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>True</td>\n",
+ " <td>Actor</td>\n",
+ " <td>Orlando Jonathan Blanchard Bloom is an English...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Orlando_Bloom</td>\n",
+ " <td>http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc...</td>\n",
+ " <td>/m/09wj5</td>\n",
+ " <td>Orlando Bloom</td>\n",
+ " <td>orlando bloom</td>\n",
+ " <td>689.364319</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " accessed description \\\n",
+ "index \n",
+ "0 True American actress \n",
+ "1 True Actor \n",
+ "\n",
+ " description_extended description_license \\\n",
+ "index \n",
+ "0 Julia O'Hara Stiles is an American actress. Bo... CC BY-SA 3.0 \n",
+ "1 Orlando Jonathan Blanchard Bloom is an English... CC BY-SA 3.0 \n",
+ "\n",
+ " description_url \\\n",
+ "index \n",
+ "0 https://en.wikipedia.org/wiki/Julia_Stiles \n",
+ "1 https://en.wikipedia.org/wiki/Orlando_Bloom \n",
+ "\n",
+ " image_url kg_id \\\n",
+ "index \n",
+ "0 http://t1.gstatic.com/images?q=tbn:ANd9GcToFqB... /m/02jtjz \n",
+ "1 http://t0.gstatic.com/images?q=tbn:ANd9GcQ2eYc... /m/09wj5 \n",
+ "\n",
+ " name query score \\\n",
+ "index \n",
+ "0 Julia Stiles julia stiles 637.113647 \n",
+ "1 Orlando Bloom orlando bloom 689.364319 \n",
+ "\n",
+ " url \n",
+ "index \n",
+ "0 http://www.juliastilesblog.com \n",
+ "1 "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check output\n",
+ "df_mapped_persons.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save\n",
+ "fp_out = '/data_store_hdd/datasets/people/pubfig83/metadata/identity_kg.csv'\n",
+ "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create small version\n",
+ "limit = 1000\n",
+ "fpp_out = Path(fp_out)\n",
+ "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n",
+ "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
+ "df_mapped_persons_sm.index.name = 'index'\n",
+ "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/umd_faces/identity.ipynb b/megapixels/notebooks/datasets/umd_faces/identity.ipynb
new file mode 100644
index 00000000..a3da9d58
--- /dev/null
+++ b/megapixels/notebooks/datasets/umd_faces/identity.ipynb
@@ -0,0 +1,675 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# UMD Faces Knowledge Graph Identities\n",
+ "\n",
+ "- convert filename-names to names\n",
+ "- fetch Google Knowledge Graph entity IDs for each name\n",
+ "- save KG IDs to CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import random\n",
+ "import math\n",
+ "from datetime import datetime\n",
+ "import requests\n",
+ "import json\n",
+ "import time\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load UMD Faces Filenames"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_filenames = '/data_store_hdd/datasets/people/umd_faces/downloads/filenames.txt'\n",
+ "with open(fp_filenames, 'r') as fp:\n",
+ " filenames = fp.readlines()\n",
+ "_ = filenames.pop(0)\n",
+ "filenames = [x.replace('_', ' ').strip() for x in filenames]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "aaron rodgers\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(filenames[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Google Knowledge Graph API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "b'{\\n \"YourFuckingIPAddress\": \"78.55.72.54\",\\n \"YourFuckingLocation\": \"Berlin, BE, Germany\",\\n \"YourFuckingHostname\": \"x4e374836.dyn.telefonica.de\",\\n \"YourFuckingISP\": \"O2 Deutschland\",\\n \"YourFuckingTorExit\": \"false\",\\n \"YourFuckingCountryCode\": \"DE\"\\n}\\n'"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "urllib.request.urlopen('https://wtfismyip.com/json').read()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read API key\n",
+ "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n",
+ "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _get_kg_meta(result_obj, params):\n",
+ " global api_key, url_kg_api\n",
+ " \n",
+ " params['indent'] = True\n",
+ " params['key'] = api_key\n",
+ " params['limit'] = 1\n",
+ " \n",
+ " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+ " try:\n",
+ " json_response = urllib.request.urlopen(url).read()\n",
+ " except Exception as e:\n",
+ " result_obj['error'] = str(e)\n",
+ " else:\n",
+ " try:\n",
+ " response = json.loads(json_response)\n",
+ " items = response.get('itemListElement', [])\n",
+ " result_obj['accessed'] = True\n",
+ " if items:\n",
+ " item = items[0]\n",
+ " item_result = item.get('result', [])\n",
+ " result_obj['description'] = item_result.get('description', '')\n",
+ " det_desc = item_result.get('detailedDescription', '')\n",
+ " if not result_obj['kg_id']:\n",
+ " result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
+ " if det_desc:\n",
+ " result_obj['description_extended'] = det_desc.get('articleBody','')\n",
+ " result_obj['description_license'] = det_desc.get('license','')\n",
+ " result_obj['description_url'] = det_desc.get('url','')\n",
+ " else:\n",
+ " result_obj['description_extended'] = ''\n",
+ " result_obj['description_license'] = ''\n",
+ " result_obj['description_url'] = ''\n",
+ " result_img = item_result.get('image', '')\n",
+ " if result_img:\n",
+ " result_obj['image_url'] = result_img.get('contentUrl', '')\n",
+ " result_obj['name'] = item_result.get('name', '')\n",
+ " result_obj['score'] = item.get('resultScore', 0.0)\n",
+ " result_obj['url'] = item_result.get('url', '')\n",
+ " except Exception as e:\n",
+ " result_obj['error'] = str(e)\n",
+ " return result_obj\n",
+ " \n",
+ "def get_kg_from_name(obj):\n",
+ " if obj['accessed']:\n",
+ " return obj\n",
+ " params = {'query': obj['query']}\n",
+ " return _get_kg_meta(obj, params)\n",
+ " \n",
+ "def get_kg_from_kg_id(obj):\n",
+ " if obj['accessed']:\n",
+ " return obj\n",
+ " params = {'ids': obj['kg_id']}\n",
+ " return _get_kg_meta(obj, params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'accessed': False,\n",
+ " 'description': '',\n",
+ " 'error': '<urlopen error [Errno -2] Name or service not known>',\n",
+ " 'kg_id': '',\n",
+ " 'query': 'Taylor Swift',\n",
+ " 'score': 0.0,\n",
+ " 'url': ''}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pprint(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'accessed': True,\n",
+ " 'description': 'American singer',\n",
+ " 'description_extended': 'Taylor Alison Swift is an American '\n",
+ " \"singer-songwriter. As one of the world's leading \"\n",
+ " 'contemporary recording artists, she is known for '\n",
+ " 'narrative songs about her personal life, which has '\n",
+ " 'received widespread media coverage.\\n',\n",
+ " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+ " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n",
+ " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n",
+ " 'kg_id': '/m/0dl567',\n",
+ " 'name': 'Taylor Swift',\n",
+ " 'query': 'Taylor Swift',\n",
+ " 'score': 1241.476318,\n",
+ " 'url': 'http://taylorswift.com/'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test get from name\n",
+ "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
+ "result = get_kg_from_name(obj)\n",
+ "pprint(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define thread mapping function\n",
+ "def pool_map_persons(obj):\n",
+ " global pbar\n",
+ " pbar.update(1)\n",
+ " kg_obj = get_kg_from_name(obj)\n",
+ " return kg_obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# build mapped_person objects\n",
+ "mapped_persons = []\n",
+ "for fn in filenames:\n",
+ " obj = {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n",
+ " mapped_persons.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3107\n",
+ "['aaron rodgers', 'aaron ruell', 'aaron staton', 'abel ferrara', 'abigail klein', 'abraham benrubi', 'abyshamble', 'adabel guerrero', 'adam ant', 'adam buxton']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(mapped_persons))\n",
+ "print(filenames[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "667\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "print(num_non_accessed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d38371156f594787ba242f451a3da650",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3/3107 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d7c35975a7ad48fba2b9a02eb8ea2277",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "num_threads = 20\n",
+ "pbar = tqdm(total=len(mapped_persons))\n",
+ "\n",
+ "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "\n",
+ "# convert to thread pool\n",
+ "while num_non_accessed > 0:\n",
+ " print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+ " pool = ThreadPool(num_threads)\n",
+ "\n",
+ " # start threading\n",
+ " with tqdm(total=len(mapped_persons)) as pbar:\n",
+ " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+ "\n",
+ " # close tqdm\n",
+ " pbar.close()\n",
+ "\n",
+ " num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ " if num_non_accessed > 0:\n",
+ " print(f'{num_non_accessed} remaining. Sleeping...')\n",
+ " time.sleep(60*10) # wait 10 minutes before retrying"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'query': 'aaron rodgers', 'kg_id': '/m/04q06_', 'score': 919.404602, 'description': 'Football quarterback', 'url': '', 'accessed': True, 'description_extended': 'Aaron Charles Rodgers is an American football quarterback for the Green Bay Packers of the National Football League. Rodgers played college football for the California Golden Bears, where he set several career passing records, including lowest single-season and career interception rates. ', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Aaron_Rodgers', 'image_url': 'http://t3.gstatic.com/images?q=tbn:ANd9GcTH_uiKmj_Y71Lc1kNCJK5HDiZsUSh3AxEBI9Jz_lp5q_89QZ9d', 'name': 'Aaron Rodgers'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test output for a person\n",
+ "print(mapped_persons[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose\n",
+ "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+ "cc_short = 'CC BY-SA 3.0'\n",
+ "nchanged = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " license = mapped_person.get('description_license', None)\n",
+ " if license == cc_long:\n",
+ " nchanged += 1\n",
+ " mapped_person['description_license'] = cc_short\n",
+ "print(nchanged)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find number not accessed\n",
+ "n_empty = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " if not mapped_person.get('accessed', False):\n",
+ " n_empty += 1\n",
+ " print(mapped_person['kg_id'])\n",
+ "print(n_empty)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create dataframe for mapped persons\n",
+ "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n",
+ "df_mapped_persons.index.name = 'index'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>accessed</th>\n",
+ " <th>description</th>\n",
+ " <th>description_extended</th>\n",
+ " <th>description_license</th>\n",
+ " <th>description_url</th>\n",
+ " <th>image_url</th>\n",
+ " <th>kg_id</th>\n",
+ " <th>name</th>\n",
+ " <th>query</th>\n",
+ " <th>score</th>\n",
+ " <th>url</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>True</td>\n",
+ " <td>Football quarterback</td>\n",
+ " <td>Aaron Charles Rodgers is an American football ...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Aaron_Rodgers</td>\n",
+ " <td>http://t3.gstatic.com/images?q=tbn:ANd9GcTH_ui...</td>\n",
+ " <td>/m/04q06_</td>\n",
+ " <td>Aaron Rodgers</td>\n",
+ " <td>aaron rodgers</td>\n",
+ " <td>919.404602</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>True</td>\n",
+ " <td>American director</td>\n",
+ " <td>Derek Aaron Ruell, is an American director and...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Aaron_Ruell</td>\n",
+ " <td>http://t3.gstatic.com/images?q=tbn:ANd9GcSzGg8...</td>\n",
+ " <td>/m/05yf80</td>\n",
+ " <td>Aaron Ruell</td>\n",
+ " <td>aaron ruell</td>\n",
+ " <td>439.912476</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>True</td>\n",
+ " <td>American actor</td>\n",
+ " <td>Aaron Staton is an American actor. He is best ...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Aaron_Staton</td>\n",
+ " <td>http://t3.gstatic.com/images?q=tbn:ANd9GcTTmBV...</td>\n",
+ " <td>/m/06_vpyq</td>\n",
+ " <td>Aaron Staton</td>\n",
+ " <td>aaron staton</td>\n",
+ " <td>500.833344</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>True</td>\n",
+ " <td>American filmmaker</td>\n",
+ " <td>Abel Ferrara is an American filmmaker, known f...</td>\n",
+ " <td>CC BY-SA 3.0</td>\n",
+ " <td>https://en.wikipedia.org/wiki/Abel_Ferrara</td>\n",
+ " <td>http://t2.gstatic.com/images?q=tbn:ANd9GcRAhy-...</td>\n",
+ " <td>/m/056ryy</td>\n",
+ " <td>Abel Ferrara</td>\n",
+ " <td>abel ferrara</td>\n",
+ " <td>522.177734</td>\n",
+ " <td>http://www.abelferrara.com/</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>True</td>\n",
+ " <td>Actress</td>\n",
+ " <td></td>\n",
+ " <td></td>\n",
+ " <td></td>\n",
+ " <td>NaN</td>\n",
+ " <td>/m/0pbm3jf</td>\n",
+ " <td>Abigail Klein</td>\n",
+ " <td>abigail klein</td>\n",
+ " <td>341.831482</td>\n",
+ " <td></td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " accessed description \\\n",
+ "index \n",
+ "0 True Football quarterback \n",
+ "1 True American director \n",
+ "2 True American actor \n",
+ "3 True American filmmaker \n",
+ "4 True Actress \n",
+ "\n",
+ " description_extended description_license \\\n",
+ "index \n",
+ "0 Aaron Charles Rodgers is an American football ... CC BY-SA 3.0 \n",
+ "1 Derek Aaron Ruell, is an American director and... CC BY-SA 3.0 \n",
+ "2 Aaron Staton is an American actor. He is best ... CC BY-SA 3.0 \n",
+ "3 Abel Ferrara is an American filmmaker, known f... CC BY-SA 3.0 \n",
+ "4 \n",
+ "\n",
+ " description_url \\\n",
+ "index \n",
+ "0 https://en.wikipedia.org/wiki/Aaron_Rodgers \n",
+ "1 https://en.wikipedia.org/wiki/Aaron_Ruell \n",
+ "2 https://en.wikipedia.org/wiki/Aaron_Staton \n",
+ "3 https://en.wikipedia.org/wiki/Abel_Ferrara \n",
+ "4 \n",
+ "\n",
+ " image_url kg_id \\\n",
+ "index \n",
+ "0 http://t3.gstatic.com/images?q=tbn:ANd9GcTH_ui... /m/04q06_ \n",
+ "1 http://t3.gstatic.com/images?q=tbn:ANd9GcSzGg8... /m/05yf80 \n",
+ "2 http://t3.gstatic.com/images?q=tbn:ANd9GcTTmBV... /m/06_vpyq \n",
+ "3 http://t2.gstatic.com/images?q=tbn:ANd9GcRAhy-... /m/056ryy \n",
+ "4 NaN /m/0pbm3jf \n",
+ "\n",
+ " name query score url \n",
+ "index \n",
+ "0 Aaron Rodgers aaron rodgers 919.404602 \n",
+ "1 Aaron Ruell aaron ruell 439.912476 \n",
+ "2 Aaron Staton aaron staton 500.833344 \n",
+ "3 Abel Ferrara abel ferrara 522.177734 http://www.abelferrara.com/ \n",
+ "4 Abigail Klein abigail klein 341.831482 "
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check output\n",
+ "df_mapped_persons.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save\n",
+ "fp_out = '/data_store_hdd/datasets/people/umd_faces/metadata/identity_kg.csv'\n",
+ "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create small version\n",
+ "limit = 1000\n",
+ "fpp_out = Path(fp_out)\n",
+ "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n",
+ "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
+ "df_mapped_persons_sm.index.name = 'index'\n",
+ "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb b/megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb
index c0051b7b..91ca1626 100644
--- a/megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb
+++ b/megapixels/notebooks/datasets/vgg_face2/clean_vgg_identity_meta_kg.ipynb
@@ -2012,7 +2012,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.7"
+ "version": "3.6.6"
}
},
"nbformat": 4,
diff --git a/megapixels/notebooks/datasets/vgg_face2/identity.ipynb b/megapixels/notebooks/datasets/vgg_face2/identity.ipynb
new file mode 100644
index 00000000..66eeeb90
--- /dev/null
+++ b/megapixels/notebooks/datasets/vgg_face2/identity.ipynb
@@ -0,0 +1,439 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# UMD Faces Knowledge Graph Identities\n",
+ "\n",
+ "- convert filename-names to names\n",
+ "- fetch Google Knowledge Graph entity IDs for each name\n",
+ "- save KG IDs to CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "import os.path as osp\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "import random\n",
+ "import math\n",
+ "from datetime import datetime\n",
+ "import requests\n",
+ "import json\n",
+ "import time\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load IMDB Metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_filenames = '/data_store_hdd/datasets/people/umd_faces/downloads/filenames.txt'\n",
+ "with open(fp_filenames, 'r') as fp:\n",
+ " filenames = fp.readlines()\n",
+ "_ = filenames.pop(0)\n",
+ "filenames = [x.replace('_', ' ').strip() for x in filenames]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "aaron rodgers\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(filenames[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Google Knowledge Graph API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read API key\n",
+ "api_key = open('/work/megapixels_dev/3rdparty/knowledge-graph-api/.api_key').read()\n",
+ "url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _get_kg_meta(result_obj, params):\n",
+ " global api_key, url_kg_api\n",
+ " \n",
+ " params['indent'] = True\n",
+ " params['key'] = api_key\n",
+ " params['limit'] = 1\n",
+ " \n",
+ " url = f'{url_kg_api}?{urllib.parse.urlencode(params)}'\n",
+ " try:\n",
+ " json_response = urllib.request.urlopen(url).read()\n",
+ " except Exception as e:\n",
+ " result_obj['error'] = str(e)\n",
+ " else:\n",
+ " try:\n",
+ " response = json.loads(json_response)\n",
+ " items = response.get('itemListElement', [])\n",
+ " result_obj['accessed'] = True\n",
+ " if items:\n",
+ " item = items[0]\n",
+ " item_result = item.get('result', [])\n",
+ " result_obj['description'] = item_result.get('description', '')\n",
+ " det_desc = item_result.get('detailedDescription', '')\n",
+ " if not result_obj['kg_id']:\n",
+ " result_obj['kg_id'] = item_result.get('@id', '').replace('kg:','')\n",
+ " if det_desc:\n",
+ " result_obj['description_extended'] = det_desc.get('articleBody','')\n",
+ " result_obj['description_license'] = det_desc.get('license','')\n",
+ " result_obj['description_url'] = det_desc.get('url','')\n",
+ " else:\n",
+ " result_obj['description_extended'] = ''\n",
+ " result_obj['description_license'] = ''\n",
+ " result_obj['description_url'] = ''\n",
+ " result_img = item_result.get('image', '')\n",
+ " if result_img:\n",
+ " result_obj['image_url'] = result_img.get('contentUrl', '')\n",
+ " result_obj['name'] = item_result.get('name', '')\n",
+ " result_obj['score'] = item.get('resultScore', 0.0)\n",
+ " result_obj['url'] = item_result.get('url', '')\n",
+ " except Exception as e:\n",
+ " result_obj['error'] = str(e)\n",
+ " return result_obj\n",
+ " \n",
+ "def get_kg_from_name(obj):\n",
+ " if obj['accessed']:\n",
+ " return obj\n",
+ " params = {'query': obj['query']}\n",
+ " return _get_kg_meta(obj, params)\n",
+ " \n",
+ "def get_kg_from_kg_id(obj):\n",
+ " if obj['accessed']:\n",
+ " return obj\n",
+ " params = {'ids': obj['kg_id']}\n",
+ " return _get_kg_meta(obj, params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'accessed': True,\n",
+ " 'description': 'American singer',\n",
+ " 'description_extended': 'Taylor Alison Swift is an American '\n",
+ " \"singer-songwriter. As one of the world's leading \"\n",
+ " 'contemporary recording artists, she is known for '\n",
+ " 'narrative songs about her personal life, which has '\n",
+ " 'received widespread media coverage.\\n',\n",
+ " 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',\n",
+ " 'description_url': 'https://en.wikipedia.org/wiki/Taylor_Swift',\n",
+ " 'image_url': 'http://t0.gstatic.com/images?q=tbn:ANd9GcST848UJ0u31E6aoQfb2nnKZFyad7rwNF0ZLOCACGpu4jnboEzV',\n",
+ " 'kg_id': '/m/0dl567',\n",
+ " 'name': 'Taylor Swift',\n",
+ " 'query': 'Taylor Swift',\n",
+ " 'score': 1241.476318,\n",
+ " 'url': 'http://taylorswift.com/'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test get from name\n",
+ "obj = {'query': 'Taylor Swift', 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False} # default\n",
+ "result = get_kg_from_name(obj)\n",
+ "pprint(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define thread mapping function\n",
+ "def pool_map_persons(obj):\n",
+ " global pbar\n",
+ " pbar.update(1)\n",
+ " kg_obj = get_kg_from_name(obj)\n",
+ " return kg_obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# build mapped_person objects\n",
+ "mapped_persons = []\n",
+ "for fn in filenames:\n",
+ " obj = {'query': fn, 'kg_id': '', 'score': 0.0, 'description': '', 'url':'', 'accessed': False}\n",
+ " mapped_persons.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3107\n",
+ "['aaron rodgers', 'aaron ruell', 'aaron staton', 'abel ferrara', 'abigail klein', 'abraham benrubi', 'abyshamble', 'adabel guerrero', 'adam ant', 'adam buxton']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(mapped_persons))\n",
+ "print(filenames[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4752a8e0280e4a58843a21401d9ed649",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1102/3107 remaining\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "882c60006b0d4a9e809297bbc1e86807",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "num_threads = 20\n",
+ "pbar = tqdm(total=len(mapped_persons))\n",
+ "\n",
+ "num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ "\n",
+ "# convert to thread pool\n",
+ "while num_non_accessed > 0:\n",
+ " print(f'{num_non_accessed}/{len(mapped_persons)} remaining')\n",
+ " pool = ThreadPool(num_threads)\n",
+ "\n",
+ " # start threading\n",
+ " with tqdm(total=len(mapped_persons)) as pbar:\n",
+ " mapped_persons = pool.map(pool_map_persons, mapped_persons)\n",
+ "\n",
+ " # close tqdm\n",
+ " pbar.close()\n",
+ "\n",
+ " num_non_accessed = sum(0 if x['accessed'] else 1 for x in mapped_persons)\n",
+ " if num_non_accessed > 0:\n",
+ " print(f'{num_non_accessed} remaining. Sleeping...')\n",
+ " time.sleep(60*20) # wait X minutes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'query': \"'Lee' George Quinones\", 'kg_id': '/m/08hvx1', 'score': 280.322754, 'description': 'Artist', 'url': 'http://www.leequinones.com/', 'accessed': True, 'description_extended': 'George Lee Quiñones is a Puerto Rican artist and actor. He is one of several artists to gain fame from the New York City Subway graffiti movement.\\n', 'description_license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License', 'description_url': 'https://en.wikipedia.org/wiki/Lee_Qui%C3%B1ones', 'name': 'Lee Quiñones'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# test output for a person\n",
+ "print(mapped_persons[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# reduce CC attribution string. the default string from Google Knowledge Graph is too verbose\n",
+ "cc_long = 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'\n",
+ "cc_short = 'CC BY-SA 3.0'\n",
+ "nchanged = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " license = mapped_person.get('description_license', None)\n",
+ " if license == cc_long:\n",
+ " nchanged += 1\n",
+ " mapped_person['description_license'] = cc_short\n",
+ "print(nchanged)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# find number not accessed\n",
+ "n_empty = 0\n",
+ "for mapped_person in mapped_persons:\n",
+ " if not mapped_person.get('accessed', False):\n",
+ " n_empty += 1\n",
+ " print(mapped_person['kg_id'])\n",
+ "print(n_empty)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create dataframe for mapped persons\n",
+ "df_mapped_persons = pd.DataFrame.from_dict(mapped_persons)\n",
+ "df_mapped_persons.index.name = 'index'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# check output\n",
+ "df_mapped_persons.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save\n",
+ "fp_out = '/data_store_hdd/datasets/people/imdb_wiki/metadata/identity_kg.csv'\n",
+ "df_mapped_persons.to_csv(fp_out, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create small version\n",
+ "limit = 1000\n",
+ "fpp_out = Path(fp_out)\n",
+ "fp_out_sm = join(fpp_out.parent, f'{fpp_out.stem}_0_{limit}.csv')\n",
+ "df_mapped_persons_sm = pd.DataFrame.from_dict(mapped_persons[0:limit])\n",
+ "df_mapped_persons_sm.index.name = 'index'\n",
+ "df_mapped_persons_sm.to_csv(fp_out_sm, encoding = 'utf-16')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# for later, check similarity score to other identity kg CSVs\n",
+ "from difflib import SequenceMatcher\n",
+ "def similar(a, b):\n",
+ " return SequenceMatcher(None, a, b).ratio()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notes/datasets/new_datasets.md b/notes/datasets/new_datasets.md
new file mode 100644
index 00000000..2704077b
--- /dev/null
+++ b/notes/datasets/new_datasets.md
@@ -0,0 +1,4 @@
+# New Datasets
+
+- EmotionNet: <http://cbcsl.ece.ohio-state.edu/dbform_emotionet.html>
+- ChaLearn: <http://chalearnlap.cvc.uab.es/dataset/18/data/26/metric/> \ No newline at end of file
diff --git a/notes/datasets/youtube_faces.html b/notes/datasets/youtube_faces.html
new file mode 100644
index 00000000..ce8422c6
--- /dev/null
+++ b/notes/datasets/youtube_faces.html
@@ -0,0 +1,1065 @@
+<!DOCTYPE html><html><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><style>body {
+ max-width: 980px;
+ border: 1px solid #ddd;
+ outline: 1300px solid #fff;
+ margin: 16px auto;
+}
+
+body .markdown-body
+{
+ padding: 45px;
+}
+
+@font-face {
+ font-family: fontawesome-mini;
+ src: url(data:font/woff;charset=utf-8;base64,d09GRgABAAAAABE0AA8AAAAAHWwAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAABHU1VCAAABWAAAADsAAABUIIslek9TLzIAAAGUAAAAQwAAAFY3d1HZY21hcAAAAdgAAACqAAACOvWLi0FjdnQgAAAChAAAABMAAAAgBtX/BGZwZ20AAAKYAAAFkAAAC3CKkZBZZ2FzcAAACCgAAAAIAAAACAAAABBnbHlmAAAIMAAABdQAAAjkYT9TNWhlYWQAAA4EAAAAMwAAADYQ6WvNaGhlYQAADjgAAAAfAAAAJAc6A1pobXR4AAAOWAAAACAAAAA0Kmz/7mxvY2EAAA54AAAAHAAAABwQPBJubWF4cAAADpQAAAAgAAAAIAEHC/NuYW1lAAAOtAAAAYQAAALxhQT4h3Bvc3QAABA4AAAAfgAAAMS3SYh9cHJlcAAAELgAAAB6AAAAhuVBK7x4nGNgZGBg4GIwYLBjYHJx8wlh4MtJLMljkGJgYYAAkDwymzEnMz2RgQPGA8qxgGkOIGaDiAIAJjsFSAB4nGNgZHZmnMDAysDAVMW0h4GBoQdCMz5gMGRkAooysDIzYAUBaa4pDA4Pwz+yMwf9z2KIYg5imAYUZgTJAQDcoQvQAHic7ZHNDYJAFIRnBXf94cDRIiyCKkCpwFCPJ092RcKNDoYKcN4+EmMPvpdvk539zQyAPYBCXEUJhBcCrJ5SQ9YLnLJe4qF5rdb+uWPDngNHTkta101pNyWa8lMhn6xx2dqUnW4q9YOIhAOOeueMSgsR/6ry+P7O5s6xVNg4chBsHUuFnWNJ8uZYwrw7chrsHXkODo7cB0dHOYCTY8kv0VE2WJKD6gOlWjsxAAB4nGNgQAMSEMgc9D8LhAESbAPdAHicrVZpd9NGFB15SZyELCULLWphxMRpsEYmbMGACUGyYyBdnK2VoIsUO+m+8Ynf4F/zZNpz6Dd+Wu8bLySQtOdwmpOjd+fN1czbZRJaktgL65GUmy/F1NYmjew8CemGTctRfCg7eyFlisnfBVEQrZbatx2HREQiULWusEQQ+x5ZmmR86FFGy7akV03KLT3pLlvjQb1V334aOsqxO6GkZjN0aD2yJVUYVaJIpj1S0qZlqPorSSu8v8LMV81QwohOImm8GcbQSN4bZ7TKaDW24yiKbLLcKFIkmuFBFHmU1RLn5IoJDMoHzZDyyqcR5cP8iKzYo5xWsEu20/y+L3mndzk/sV9vUbbkQB/Ijuzg7HQlX4RbW2HctJPtKFQRdtd3QmzZ7FT/Zo/ymkYDtysyvdCMYKl8hRArP6HM/iFZLZxP+ZJHo1qykRNB62VO7Es+gdbjiClxzRhZ0N3RCRHU/ZIzDPaYPh788d4plgsTAngcy3pHJZwIEylhczRJ2jByYCVliyqp9a6YOOV1WsRbwn7t2tGXzmjjUHdiPFsPHVs5UcnxaFKnmUyd2knNoykNopR0JnjMrwMoP6JJXm1jNYmVR9M4ZsaERCICLdxLU0EsO7GkKQTNoxm9uRumuXYtWqTJA/Xco/f05la4udNT2g70s0Z/VqdiOtgL0+lp5C/xadrlIkXp+ukZfkziQdYCMpEtNsOUgwdv/Q7Sy9eWHIXXBtju7fMrqH3WRPCkAfsb0B5P1SkJTIWYVYhWQGKta1mWydWsFqnI1HdDmla+rNMEinIcF8e+jHH9XzMzlpgSvt+J07MjLj1z7UsI0xx8m3U9mtepxXIBcWZ5TqdZlu/rNMfyA53mWZ7X6QhLW6ejLD/UaYHlRzodY3lBC5p038GQizDkAg6QMISlA0NYXoIhLBUMYbkIQ1gWYQjLJRjC8mMYwnIZhrC8rGXV1FNJ49qZWAZsQmBijh65zEXlaiq5VEK7aFRqQ54SbpVUFM+qf2WgXjzyhjmwFkiXyJpfMc6Vj0bl+NYVLW8aO1fAsepvH472OfFS1ouFPwX/1dZUJ
b1izcOTq/Abhp5sJ6o2qXh0TZfPVT26/l9UVFgL9BtIhVgoyrJscGcihI86nYZqoJVDzGzMPLTrdcuan8P9NzFCFlD9+DcUGgvcg05ZSVnt4KzV19uy3DuDcjgTLEkxN/P6VvgiI7PSfpFZyp6PfB5wBYxKZdhqA60VvNknMQ+Z3iTPBHFbUTZI2tjOBIkNHPOAefOdBCZh6qoN5E7hhg34BWFuwXknXKJ6oyyH7kXs8yik/Fun4kT2qGiMwLPZG2Gv70LKb3EMJDT5pX4MVBWhqRg1FdA0Um6oBl/G2bptQsYO9CMqdsOyrOLDxxb3lZJtGYR8pIjVo6Of1l6iTqrcfmYUl++dvgXBIDUxf3vfdHGQyrtayTJHbQNTtxqVU9eaQ+NVh+rmUfW94+wTOWuabronHnpf06rbwcVcLLD2bQ7SUiYX1PVhhQ2iy8WlUOplNEnvuAcYFhjQ71CKjf+r+th8nitVhdFxJN9O1LfR52AM/A/Yf0f1A9D3Y+hyDS7P95oTn2704WyZrqIX66foNzBrrblZugbc0HQD4iFHrY64yg18pwZxeqS5HOkh4GPdFeIBwCaAxeAT3bWM5lMAo/mMOT7A58xh0GQOgy3mMNhmzhrADnMY7DKHwR5zGHzBnHWAL5nDIGQOg4g5DJ4wJwB4yhwGXzGHwdfMYfANc+4DfMscBjFzGCTMYbCv6dYwzC1e0F2gtkFVoANTT1jcw+JQU2XI/o4Xhv29Qcz+wSCm/qjp9pD6Ey8M9WeDmPqLQUz9VdOdIfU3Xhjq7wYx9Q+DmPpMvxjLZQa/jHyXCgeUXWw+5++J9w/bxUC5AAEAAf//AA94nIVVX2hbZRQ/5/t7893s5ja9f7ouzdZ0TTqz3bRJmogbWya6bG6Cq0VbSV2ddIJjFtfIQHEig80Hda8yUN/0YQz8AyriiyD+xQd92R4HCnaCb3samnpumrpsCsLlfPf7zvedc37nL3CAtc/5W/wQZGA3tOBSY/g+TMjHmwzEoM1Q8+ZjRZY4oJhmBw5/YB6Za0yC5AkhlwA1A1yCBIBOwCII0Cj0U8BAMdUCzq05sKwkP7SlUY6fcJk4Fb/RyE79/6P5hjM/F4aZiXBoeMgzcqQ4Xi1hPqfDLG5FT+lchCVU3lYMyvuwhl1mqndQL0RsuloLywHtthLXI06OblTrhfWVnpSJ5+mwu/JdbtuN3IAnkW0LLMcRwaC7ktrlzridM6kVdyf9uO1UNBByI7JhwtG2sEwab07ORBeilWhqavJCqV0qzZTOl/7ZXQ5TbTcdcFelyGhhRDAQpdqp1FEX3w3cFTc1k9pJQkmm4ySCbSikxRP2QOfN+0tHS5MrpQuTU1Mk5nw0E5Xa0WvrOwDyGax9yB9ma6DAg82wHc43SAGTI4GjBWebOePAERFE8/AHaQpZASSTy8A4WwZiLQMQ82mFKATO0ILicRAoDm9p5P99E5b/fXG+kQYY3TYUuqmERWYoT0u/GNYL2q/4WB3LaVS+VynXsVYIcWw6DkCh3nX1D+VzlYN4LClF5yexSQos8exqZ3KVP+wtrC54u4Nznq6cq+xpMpUUnZ8FUYzE86ud0g28NOIv3Gj5/rmA3ABs7S/ywzFuQ4qyd6QxfNtiQIaEgp3w/entQg4Vcbqa16M5FfpeUB8t1+qeg7mI7cUyOe79wOk86gSxkVec4KPTX69++5x68Yubn5/F+w52z7u08sJX7fZXv8ekT/d2mILJxq6sn+SC6qEJknzLJCxyZEKwWVqYmAPBxBE/9DLeZiWHu7lcr/VytrCRuHojncNuTt9h46tmacmYisnSamdN2bZptcsmSysdVsy1PrOvOzF3xN64Rb937t/og9KHxYdcjIUqFAmIAHGHNzlns+RTPgeUYAQm9DwpNxfxbhhBHPaw3/gfTcXO2L+eJVIx5nsyGkvm9X4/f+bGkH45G0PaSjcMXTjcZyTvi3UdHoCDjQd3IDUVsgwYmUoJK/gp4JJxeRI0M
KHZIkgynyIBqBTOUs6rOVCojvjZ4mCQz49ZMlMcp8QoYk6NoBfsxnJtsBohpa8iGJS+ZH7gU7NxME6cmF+t7cO9vB8d3jTWSct0ycW9ranXmolNDwmVkNnxe+8JtoztwS5rKJ0xWS95tQ/1zMYzg69MzUZnNtl1ofNbsml/OJm6f9wjRjpnu2o4MzHzn77IQkRd+1DjwMQ2pqSjGMMhyjrgTbBAKksuUm0iU7hI0aN2wOKOq7WYBSH0HGihj/jkiPxAfmwsEbfYrjMG+j3ij932Db/LV7I/xruNrhnroxjR9HRMb2nTvO0ZXOoHPk8H2ZhDPx93qcE/53sH5np/dkIP7zzhTVKdR/BAY/9ElkkR+A6lJGsqpJ4oQcTxpvBT3Kn58VkaJjgHyPEIws57xkaHh9KuVpDEpJZeMbZ5w/zBHi5NMQ4r5VphsFqID7TyB9eR4pX216c3AHxpdAwoqU9qg0ZJ6yVLKmMSz1iG2z27ifx18NkY0LPx1W/wCc2l5LrznrIsiKsqbmB78A9wIGx4tI8rjihVHJyY9pgMirenVq0yWg7Iw7eogG7ZgYM3qR9959A/fZkg6MnD/exlkmc+jWV4SB15XUR+eqC6l6ZmgPtN9z5JMfik05OV8ljylunJ4J+wA/FUaQSSKotsYsCWqaPBidBLcxkWx7XKFRIb45TGaEhjlF9uUVPqXOtcIwsXbBvfoZXIyRYFdkfnqjExH98xpnPczqzjX/uNdO1Y17Wpi5+6Ts8BXtjVFasp9KZ1mOiNbH65c5w6HgmyF2jFCZywM8mWjRc7T5Pmt0lRy7Y71+jYbpGyvwG4sH0XeJxjYGRgYADiwBB/53h+m68M3MwvgCIM1z5N/g6j///9v5H5BbMnkMvBwAQSBQCIcA9gAHicY2BkYGAO+p8FJF/8//v/F/MLBqAICuAFALYQB5kAeJxjfsHAwLwAiCNB+P9fbJjJmoGBMRUo/wKCAfO2EnQAAAAAANoBXgGcAgICVALaA1IDvAPkBAYEPARyAAEAAAANAF0ABAAAAAAAAgAUACQAcwAAAG4LcAAAAAB4nHWRzWrCQBSFT+pPqUIXLXTTzayKUohGKIibCoLuhbrrYtTRxCYZmYyKyz5Fd32HvlDfoO/QkziIFJtw9bvnnpl7ZwLgBt/wcHieGAf2UGd24Atcou+4RH3kuEweO66QXx1XyaHjGh6ROa7jFp/cwStfMVvhy7GHO+/e8QWuvcBxifqz4zL5xXGF/Oa4Sn53XMPE+3Bcx4P3M9DrvYmWoRWNQVN02kFXTPdCU4pSGQu5saE2meiLhU6timPtz3SSs9ypTCdqrJabWJoT5QQnymSRTkXgt0/UkUqVkVbN807ZdtmxdiEWRidi6HqItdErNbN+aO2612qd9sYAGmvsYRBhyUu0EGhQbfK/gzYCdElTOgSdB1eEFBIxFYkNV4RFJWPeZyyYpVQVHTHZx4y/yVGX2LGWFZri51TccUOn5B7nPefVCSPvGhVVwUl9znveO2KkhV8Wk82PZ8qwZf8OVcu1+fSmWCMw/HMOwXvKaysqM+p+cVuWag8tvv+c+xdd+4+teJxtjUEOwiAURJla24KliQfhUA2g/Sl+CKXx+loNrpzVezOLEY34Ron/0WhwQoszOvQYIKFwwQiNSbSBeO2SZ0tBP4j3zVjKNng32ZmtD1VVXCuOiw/pJ8S3WOU6l+K5UOTaDC4+2TjKMtN9KQf1ezLx/Sg/00FCvABHhjDjAAB4nGPw3sFwIihiIyNjX+QGxp0cDBwMyQUbGVidNjEwMmiBGJu5mBg5ICw+BjCLzWkX0wGgNCeQze60i8EBwmZmcNmowtgRGLHBoSNiI3OKy0Y1EG8XRwMDI4tDR3JIBEhJJBBs5mFi5NHawfi/dQNL70YmBhcADHYj9AAA) format('woff');
+}
+
+.markdown-body {
+ font-family: sans-serif;
+ -ms-text-size-adjust: 100%;
+ -webkit-text-size-adjust: 100%;
+ color: #333333;
+ overflow: hidden;
+ font-family: "Helvetica Neue", Helvetica, "Segoe UI", Arial, freesans, sans-serif;
+ font-size: 16px;
+ line-height: 1.6;
+ word-wrap: break-word;
+}
+
+.markdown-body a {
+ background: transparent;
+}
+
+.markdown-body a:active,
+.markdown-body a:hover {
+ outline: 0;
+}
+
+.markdown-body b,
+.markdown-body strong {
+ font-weight: bold;
+}
+
+.markdown-body mark {
+ background: #ff0;
+ color: #000;
+ font-style: italic;
+ font-weight: bold;
+}
+
+.markdown-body sub,
+.markdown-body sup {
+ font-size: 75%;
+ line-height: 0;
+ position: relative;
+ vertical-align: baseline;
+}
+.markdown-body sup {
+ top: -0.5em;
+}
+.markdown-body sub {
+ bottom: -0.25em;
+}
+
+.markdown-body h1 {
+ font-size: 2em;
+ margin: 0.67em 0;
+}
+
+.markdown-body img {
+ border: 0;
+}
+
+.markdown-body hr {
+ -moz-box-sizing: content-box;
+ box-sizing: content-box;
+ height: 0;
+}
+
+.markdown-body pre {
+ overflow: auto;
+}
+
+.markdown-body code,
+.markdown-body kbd,
+.markdown-body pre,
+.markdown-body samp {
+ font-family: monospace, monospace;
+ font-size: 1em;
+}
+
+.markdown-body input {
+ color: inherit;
+ font: inherit;
+ margin: 0;
+}
+
+.markdown-body html input[disabled] {
+ cursor: default;
+}
+
+.markdown-body input {
+ line-height: normal;
+}
+
+.markdown-body input[type="checkbox"] {
+ box-sizing: border-box;
+ padding: 0;
+}
+
+.markdown-body table {
+ border-collapse: collapse;
+ border-spacing: 0;
+}
+
+.markdown-body td,
+.markdown-body th {
+ padding: 0;
+}
+
+.markdown-body .codehilitetable {
+ border: 0;
+ border-spacing: 0;
+}
+
+.markdown-body .codehilitetable tr {
+ border: 0;
+}
+
+.markdown-body .codehilitetable pre,
+.markdown-body .codehilitetable div.codehilite {
+ margin: 0;
+}
+
+.markdown-body .linenos,
+.markdown-body .code,
+.markdown-body .codehilitetable td {
+ border: 0;
+ padding: 0;
+}
+
+.markdown-body td:not(.linenos) .linenodiv {
+ padding: 0 !important;
+}
+
+.markdown-body .code {
+ width: 100%;
+}
+
+.markdown-body .linenos div pre,
+.markdown-body .linenodiv pre,
+.markdown-body .linenodiv {
+ border: 0;
+ -webkit-border-radius: 0;
+ -moz-border-radius: 0;
+ border-radius: 0;
+ -webkit-border-top-left-radius: 3px;
+ -webkit-border-bottom-left-radius: 3px;
+ -moz-border-radius-topleft: 3px;
+ -moz-border-radius-bottomleft: 3px;
+ border-top-left-radius: 3px;
+ border-bottom-left-radius: 3px;
+}
+
+.markdown-body .code div pre,
+.markdown-body .code div {
+ border: 0;
+ -webkit-border-radius: 0;
+ -moz-border-radius: 0;
+ border-radius: 0;
+ -webkit-border-top-right-radius: 3px;
+ -webkit-border-bottom-right-radius: 3px;
+ -moz-border-radius-topright: 3px;
+ -moz-border-radius-bottomright: 3px;
+ border-top-right-radius: 3px;
+ border-bottom-right-radius: 3px;
+}
+
+.markdown-body * {
+ -moz-box-sizing: border-box;
+ box-sizing: border-box;
+}
+
+.markdown-body input {
+ font: 13px Helvetica, arial, freesans, clean, sans-serif, "Segoe UI Emoji", "Segoe UI Symbol";
+ line-height: 1.4;
+}
+
+.markdown-body a {
+ color: #4183c4;
+ text-decoration: none;
+}
+
+.markdown-body a:hover,
+.markdown-body a:focus,
+.markdown-body a:active {
+ text-decoration: underline;
+}
+
+.markdown-body hr {
+ height: 0;
+ margin: 15px 0;
+ overflow: hidden;
+ background: transparent;
+ border: 0;
+ border-bottom: 1px solid #ddd;
+}
+
+.markdown-body hr:before,
+.markdown-body hr:after {
+ display: table;
+ content: " ";
+}
+
+.markdown-body hr:after {
+ clear: both;
+}
+
+.markdown-body h1,
+.markdown-body h2,
+.markdown-body h3,
+.markdown-body h4,
+.markdown-body h5,
+.markdown-body h6 {
+ margin-top: 15px;
+ margin-bottom: 15px;
+ line-height: 1.1;
+}
+
+.markdown-body h1 {
+ font-size: 30px;
+}
+
+.markdown-body h2 {
+ font-size: 21px;
+}
+
+.markdown-body h3 {
+ font-size: 16px;
+}
+
+.markdown-body h4 {
+ font-size: 14px;
+}
+
+.markdown-body h5 {
+ font-size: 12px;
+}
+
+.markdown-body h6 {
+ font-size: 11px;
+}
+
+.markdown-body blockquote {
+ margin: 0;
+}
+
+.markdown-body ul,
+.markdown-body ol {
+ padding: 0;
+ margin-top: 0;
+ margin-bottom: 0;
+}
+
+.markdown-body ol ol,
+.markdown-body ul ol {
+ list-style-type: lower-roman;
+}
+
+.markdown-body ul ul ol,
+.markdown-body ul ol ol,
+.markdown-body ol ul ol,
+.markdown-body ol ol ol {
+ list-style-type: lower-alpha;
+}
+
+.markdown-body dd {
+ margin-left: 0;
+}
+
+.markdown-body code,
+.markdown-body pre,
+.markdown-body samp {
+ font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
+ font-size: 12px;
+}
+
+.markdown-body pre {
+ margin-top: 0;
+ margin-bottom: 0;
+}
+
+.markdown-body kbd {
+ background-color: #e7e7e7;
+ background-image: -moz-linear-gradient(#fefefe, #e7e7e7);
+ background-image: -webkit-linear-gradient(#fefefe, #e7e7e7);
+ background-image: linear-gradient(#fefefe, #e7e7e7);
+ background-repeat: repeat-x;
+ border-radius: 2px;
+ border: 1px solid #cfcfcf;
+ color: #000;
+ padding: 3px 5px;
+ line-height: 10px;
+ font: 11px Consolas, "Liberation Mono", Menlo, Courier, monospace;
+ display: inline-block;
+}
+
+.markdown-body>*:first-child {
+ margin-top: 0 !important;
+}
+
+.markdown-body>*:last-child {
+ margin-bottom: 0 !important;
+}
+
+.markdown-body .headerlink {
+ font: normal 400 16px fontawesome-mini;
+ vertical-align: middle;
+ margin-left: -16px;
+ float: left;
+ display: inline-block;
+ text-decoration: none;
+ opacity: 0;
+ color: #333;
+}
+
+.markdown-body .headerlink:focus {
+ outline: none;
+}
+
+.markdown-body h1 .headerlink {
+ margin-top: 0.8rem;
+}
+
+.markdown-body h2 .headerlink,
+.markdown-body h3 .headerlink {
+ margin-top: 0.6rem;
+}
+
+.markdown-body h4 .headerlink {
+ margin-top: 0.2rem;
+}
+
+.markdown-body h5 .headerlink,
+.markdown-body h6 .headerlink {
+ margin-top: 0;
+}
+
+.markdown-body .headerlink:hover,
+.markdown-body h1:hover .headerlink,
+.markdown-body h2:hover .headerlink,
+.markdown-body h3:hover .headerlink,
+.markdown-body h4:hover .headerlink,
+.markdown-body h5:hover .headerlink,
+.markdown-body h6:hover .headerlink {
+ opacity: 1;
+ text-decoration: none;
+}
+
+.markdown-body h1 {
+ padding-bottom: 0.3em;
+ font-size: 2.25em;
+ line-height: 1.2;
+ border-bottom: 1px solid #eee;
+}
+
+.markdown-body h2 {
+ padding-bottom: 0.3em;
+ font-size: 1.75em;
+ line-height: 1.225;
+ border-bottom: 1px solid #eee;
+}
+
+.markdown-body h3 {
+ font-size: 1.5em;
+ line-height: 1.43;
+}
+
+.markdown-body h4 {
+ font-size: 1.25em;
+}
+
+.markdown-body h5 {
+ font-size: 1em;
+}
+
+.markdown-body h6 {
+ font-size: 1em;
+ color: #777;
+}
+
+.markdown-body p,
+.markdown-body blockquote,
+.markdown-body ul,
+.markdown-body ol,
+.markdown-body dl,
+.markdown-body table,
+.markdown-body pre,
+.markdown-body .admonition {
+ margin-top: 0;
+ margin-bottom: 16px;
+}
+
+.markdown-body hr {
+ height: 4px;
+ padding: 0;
+ margin: 16px 0;
+ background-color: #e7e7e7;
+ border: 0 none;
+}
+
+.markdown-body ul,
+.markdown-body ol {
+ padding-left: 2em;
+}
+
+.markdown-body ul ul,
+.markdown-body ul ol,
+.markdown-body ol ol,
+.markdown-body ol ul {
+ margin-top: 0;
+ margin-bottom: 0;
+}
+
+.markdown-body li>p {
+ margin-top: 16px;
+}
+
+.markdown-body dl {
+ padding: 0;
+}
+
+.markdown-body dl dt {
+ padding: 0;
+ margin-top: 16px;
+ font-size: 1em;
+ font-style: italic;
+ font-weight: bold;
+}
+
+.markdown-body dl dd {
+ padding: 0 16px;
+ margin-bottom: 16px;
+}
+
+.markdown-body blockquote {
+ padding: 0 15px;
+ color: #777;
+ border-left: 4px solid #ddd;
+}
+
+.markdown-body blockquote>:first-child {
+ margin-top: 0;
+}
+
+.markdown-body blockquote>:last-child {
+ margin-bottom: 0;
+}
+
+.markdown-body table {
+ display: block;
+ width: 100%;
+ overflow: auto;
+ word-break: normal;
+ word-break: keep-all;
+}
+
+.markdown-body table th {
+ font-weight: bold;
+}
+
+.markdown-body table th,
+.markdown-body table td {
+ padding: 6px 13px;
+ border: 1px solid #ddd;
+}
+
+.markdown-body table tr {
+ background-color: #fff;
+ border-top: 1px solid #ccc;
+}
+
+.markdown-body table tr:nth-child(2n) {
+ background-color: #f8f8f8;
+}
+
+.markdown-body img {
+ max-width: 100%;
+ -moz-box-sizing: border-box;
+ box-sizing: border-box;
+}
+
+.markdown-body code,
+.markdown-body samp {
+ padding: 0;
+ padding-top: 0.2em;
+ padding-bottom: 0.2em;
+ margin: 0;
+ font-size: 85%;
+ background-color: rgba(0,0,0,0.04);
+ border-radius: 3px;
+}
+
+.markdown-body code:before,
+.markdown-body code:after {
+ letter-spacing: -0.2em;
+ content: "\00a0";
+}
+
+.markdown-body pre>code {
+ padding: 0;
+ margin: 0;
+ font-size: 100%;
+ word-break: normal;
+ white-space: pre;
+ background: transparent;
+ border: 0;
+}
+
+.markdown-body .codehilite {
+ margin-bottom: 16px;
+}
+
+.markdown-body .codehilite pre,
+.markdown-body pre {
+ padding: 16px;
+ overflow: auto;
+ font-size: 85%;
+ line-height: 1.45;
+ background-color: #f7f7f7;
+ border-radius: 3px;
+}
+
+.markdown-body .codehilite pre {
+ margin-bottom: 0;
+ word-break: normal;
+}
+
+.markdown-body pre {
+ word-wrap: normal;
+}
+
+.markdown-body pre code {
+ display: inline;
+ max-width: initial;
+ padding: 0;
+ margin: 0;
+ overflow: initial;
+ line-height: inherit;
+ word-wrap: normal;
+ background-color: transparent;
+ border: 0;
+}
+
+.markdown-body pre code:before,
+.markdown-body pre code:after {
+ content: normal;
+}
+
+/* Admonition */
+.markdown-body .admonition {
+ -webkit-border-radius: 3px;
+ -moz-border-radius: 3px;
+ position: relative;
+ border-radius: 3px;
+ border: 1px solid #e0e0e0;
+ border-left: 6px solid #333;
+ padding: 10px 10px 10px 30px;
+}
+
+.markdown-body .admonition table {
+ color: #333;
+}
+
+.markdown-body .admonition p {
+ padding: 0;
+}
+
+.markdown-body .admonition-title {
+ font-weight: bold;
+ margin: 0;
+}
+
+.markdown-body .admonition>.admonition-title {
+ color: #333;
+}
+
+.markdown-body .attention>.admonition-title {
+ color: #a6d796;
+}
+
+.markdown-body .caution>.admonition-title {
+ color: #d7a796;
+}
+
+.markdown-body .hint>.admonition-title {
+ color: #96c6d7;
+}
+
+.markdown-body .danger>.admonition-title {
+ color: #c25f77;
+}
+
+.markdown-body .question>.admonition-title {
+ color: #96a6d7;
+}
+
+.markdown-body .note>.admonition-title {
+ color: #d7c896;
+}
+
+.markdown-body .admonition:before,
+.markdown-body .attention:before,
+.markdown-body .caution:before,
+.markdown-body .hint:before,
+.markdown-body .danger:before,
+.markdown-body .question:before,
+.markdown-body .note:before {
+ font: normal normal 16px fontawesome-mini;
+ -moz-osx-font-smoothing: grayscale;
+ -webkit-user-select: none;
+ -moz-user-select: none;
+ -ms-user-select: none;
+ user-select: none;
+ line-height: 1.5;
+ color: #333;
+ position: absolute;
+ left: 0;
+ top: 0;
+ padding-top: 10px;
+ padding-left: 10px;
+}
+
+.markdown-body .admonition:before {
+ content: "\f056\00a0";
+ color: #333;
+}
+
+.markdown-body .attention:before {
+ content: "\f058\00a0";
+ color: #a6d796;
+}
+
+.markdown-body .caution:before {
+ content: "\f06a\00a0";
+ color: #d7a796;
+}
+
+.markdown-body .hint:before {
+ content: "\f05a\00a0";
+ color: #96c6d7;
+}
+
+.markdown-body .danger:before {
+ content: "\f057\00a0";
+ color: #c25f77;
+}
+
+.markdown-body .question:before {
+ content: "\f059\00a0";
+ color: #96a6d7;
+}
+
+.markdown-body .note:before {
+ content: "\f040\00a0";
+ color: #d7c896;
+}
+
+.markdown-body .admonition::after {
+ content: normal;
+}
+
+.markdown-body .attention {
+ border-left: 6px solid #a6d796;
+}
+
+.markdown-body .caution {
+ border-left: 6px solid #d7a796;
+}
+
+.markdown-body .hint {
+ border-left: 6px solid #96c6d7;
+}
+
+.markdown-body .danger {
+ border-left: 6px solid #c25f77;
+}
+
+.markdown-body .question {
+ border-left: 6px solid #96a6d7;
+}
+
+.markdown-body .note {
+ border-left: 6px solid #d7c896;
+}
+
+.markdown-body .admonition>*:first-child {
+ margin-top: 0 !important;
+}
+
+.markdown-body .admonition>*:last-child {
+ margin-bottom: 0 !important;
+}
+
+/* Progress bar: .progress is the fixed 300px x 24px track; position: relative lets the label overlay it */
+.markdown-body .progress {
+ display: block;
+ width: 300px;
+ margin: 10px 0;
+ height: 24px;
+ -webkit-border-radius: 3px;
+ -moz-border-radius: 3px;
+ border-radius: 3px;
+ background-color: #ededed;
+ position: relative;
+ box-shadow: inset -1px 1px 3px rgba(0, 0, 0, .1);
+}
+/* Label: absolutely positioned across the full track width, centered, kept on one line and clipped */
+.markdown-body .progress-label {
+ position: absolute;
+ text-align: center;
+ font-weight: bold;
+ width: 100%; margin: 0;
+ line-height: 24px;
+ color: #333;
+ text-shadow: 1px 1px 0 #fefefe, -1px -1px 0 #fefefe, -1px 1px 0 #fefefe, 1px -1px 0 #fefefe, 0 1px 0 #fefefe, 0 -1px 0 #fefefe, 1px 0 0 #fefefe, -1px 0 0 #fefefe, 1px 1px 2px #000;
+ -webkit-font-smoothing: antialiased !important;
+ white-space: nowrap;
+ overflow: hidden;
+}
+/* Bar: floated fill with a 135deg striped gradient; vendor-prefixed forms come first, standard syntax last */
+.markdown-body .progress-bar {
+ height: 24px;
+ float: left;
+ -webkit-border-radius: 3px;
+ -moz-border-radius: 3px;
+ border-radius: 3px;
+ background-color: #96c6d7;
+ box-shadow: inset 0 1px 0 rgba(255, 255, 255, .5), inset 0 -1px 0 rgba(0, 0, 0, .1);
+ background-size: 30px 30px;
+ background-image: -webkit-linear-gradient(
+ 135deg, rgba(255, 255, 255, .4) 27%,
+ transparent 27%,
+ transparent 52%, rgba(255, 255, 255, .4) 52%,
+ rgba(255, 255, 255, .4) 77%,
+ transparent 77%, transparent
+ );
+ background-image: -moz-linear-gradient(
+ 135deg,
+ rgba(255, 255, 255, .4) 27%, transparent 27%,
+ transparent 52%, rgba(255, 255, 255, .4) 52%,
+ rgba(255, 255, 255, .4) 77%, transparent 77%,
+ transparent
+ );
+ background-image: -ms-linear-gradient(
+ 135deg,
+ rgba(255, 255, 255, .4) 27%, transparent 27%,
+ transparent 52%, rgba(255, 255, 255, .4) 52%,
+ rgba(255, 255, 255, .4) 77%, transparent 77%,
+ transparent
+ );
+ background-image: -o-linear-gradient(
+ 135deg,
+ rgba(255, 255, 255, .4) 27%, transparent 27%,
+ transparent 52%, rgba(255, 255, 255, .4) 52%,
+ rgba(255, 255, 255, .4) 77%, transparent 77%,
+ transparent
+ );
+ background-image: linear-gradient(
+ 135deg,
+ rgba(255, 255, 255, .4) 27%, transparent 27%,
+ transparent 52%, rgba(255, 255, 255, .4) 52%,
+ rgba(255, 255, 255, .4) 77%, transparent 77%,
+ transparent
+ );
+}
+/* Fill color overrides, selected by a progress-Nplus class on an ancestor of the bar */
+.markdown-body .progress-100plus .progress-bar {
+ background-color: #a6d796;
+}
+
+.markdown-body .progress-80plus .progress-bar {
+ background-color: #c6d796;
+}
+
+.markdown-body .progress-60plus .progress-bar {
+ background-color: #d7c896;
+}
+
+.markdown-body .progress-40plus .progress-bar {
+ background-color: #d7a796;
+}
+
+.markdown-body .progress-20plus .progress-bar {
+ background-color: #d796a6;
+}
+
+.markdown-body .progress-0plus .progress-bar {
+ background-color: #c25f77;
+}
+/* Opt-in stripe animation: runs animate-stripes over 3s, linear timing, repeating indefinitely */
+.markdown-body .candystripe-animate .progress-bar{
+ -webkit-animation: animate-stripes 3s linear infinite;
+ -moz-animation: animate-stripes 3s linear infinite;
+ animation: animate-stripes 3s linear infinite;
+}
+/* animate-stripes: moves background-position horizontally from 0 to 60px (two 30px tiles); prefixed copies first */
+@-webkit-keyframes animate-stripes {
+ 0% {
+ background-position: 0 0;
+ }
+
+ 100% {
+ background-position: 60px 0;
+ }
+}
+
+@-moz-keyframes animate-stripes {
+ 0% {
+ background-position: 0 0;
+ }
+
+ 100% {
+ background-position: 60px 0;
+ }
+}
+
+@keyframes animate-stripes {
+ 0% {
+ background-position: 0 0;
+ }
+
+ 100% {
+ background-position: 60px 0;
+ }
+}
+/* Opt-in gloss: a different inset box-shadow for bars inside a .gloss ancestor */
+.markdown-body .gloss .progress-bar {
+ box-shadow:
+ inset 0 4px 12px rgba(255, 255, 255, .7),
+ inset 0 -12px 0 rgba(0, 0, 0, .05);
+}
+
+/* MultiMarkdown Critic Blocks: yellow marks, red struck-through deletions, green underlined insertions, gray italic comments */
+.markdown-body .critic_mark {
+ background: #ff0;
+}
+
+.markdown-body .critic_delete {
+ color: #c82829;
+ text-decoration: line-through;
+}
+
+.markdown-body .critic_insert {
+ color: #718c00 ;
+ text-decoration: underline;
+}
+
+.markdown-body .critic_comment {
+ color: #8e908c;
+ font-style: italic;
+}
+/* Header anchor: 16px glyph set in the fontawesome-mini font family, undecorated and not text-selectable */
+.markdown-body .headeranchor {
+ font: normal normal 16px fontawesome-mini;
+ line-height: 1;
+ display: inline-block;
+ text-decoration: none;
+ -webkit-font-smoothing: antialiased;
+ -moz-osx-font-smoothing: grayscale;
+ -webkit-user-select: none;
+ -moz-user-select: none;
+ -ms-user-select: none;
+ user-select: none;
+}
+/* Inserts glyph \e157 before every .headeranchor; note this selector is not scoped to .markdown-body */
+.headeranchor:before {
+ content: '\e157';
+}
+/* Task lists: no list marker, 3px gap between adjacent items, input shifted 20px left by a negative margin */
+.markdown-body .task-list-item {
+ list-style-type: none;
+}
+
+.markdown-body .task-list-item+.task-list-item {
+ margin-top: 3px;
+}
+
+.markdown-body .task-list-item input {
+ margin: 0 4px 0.25em -20px;
+ vertical-align: middle;
+}
+
+/* Media queries: font size is 14px from 480px viewport width and 16px from 768px; print overrides follow */
+@media only screen and (min-width: 480px) {
+ .markdown-body {
+ font-size:14px;
+ }
+}
+
+@media only screen and (min-width: 768px) {
+ .markdown-body {
+ font-size:16px;
+ }
+}
+/* Print: transparent backgrounds, black text, no filters, 12pt type, and page-break control */
+@media print {
+ .markdown-body * {
+ background: transparent !important;
+ color: black !important;
+ filter:none !important;
+ -ms-filter: none !important;
+ }
+
+ .markdown-body {
+ font-size:12pt;
+ max-width:100%;
+ outline:none;
+ border: 0;
+ }
+
+ .markdown-body a,
+ .markdown-body a:visited {
+ text-decoration: underline;
+ }
+
+ .markdown-body .headeranchor-link {
+ display: none;
+ }
+ /* Append the link target / abbreviation title in parentheses after the element's text */
+ .markdown-body a[href]:after {
+ content: " (" attr(href) ")";
+ }
+
+ .markdown-body abbr[title]:after {
+ content: " (" attr(title) ")";
+ }
+ /* ...but append nothing for links inside .ir, javascript: hrefs, or fragment (#) hrefs */
+ .markdown-body .ir a:after,
+ .markdown-body a[href^="javascript:"]:after,
+ .markdown-body a[href^="#"]:after {
+ content: "";
+ }
+ /* white-space is declared twice: the later pre-wrap wins wherever it is supported */
+ .markdown-body pre {
+ white-space: pre;
+ white-space: pre-wrap;
+ word-wrap: break-word;
+ }
+
+ .markdown-body pre,
+ .markdown-body blockquote {
+ border: 1px solid #999;
+ padding-right: 1em;
+ page-break-inside: avoid;
+ }
+ /* Progress bars in print: shadows removed, 1px #ddd borders added, bar height reduced to 22px */
+ .markdown-body .progress,
+ .markdown-body .progress-bar {
+ -moz-box-shadow: none;
+ -webkit-box-shadow: none;
+ box-shadow: none;
+ }
+
+ .markdown-body .progress {
+ border: 1px solid #ddd;
+ }
+
+ .markdown-body .progress-bar {
+ height: 22px;
+ border-right: 1px solid #ddd;
+ }
+
+ .markdown-body tr,
+ .markdown-body img {
+ page-break-inside: avoid;
+ }
+
+ .markdown-body img {
+ max-width: 100% !important;
+ }
+
+ .markdown-body p,
+ .markdown-body h2,
+ .markdown-body h3 {
+ orphans: 3;
+ widows: 3;
+ }
+
+ .markdown-body h2,
+ .markdown-body h3 {
+ page-break-after: avoid;
+ }
+}
+</style><style>/* GitHub theme: colors and font styles for .codehilite and its descendant classes */
+.codehilite {background-color:#fff;color:#333333;}
+.codehilite .hll {background-color:#ffffcc;}
+.codehilite .c{color:#999988;font-style:italic}
+.codehilite .err{color:#a61717;background-color:#e3d2d2}
+.codehilite .k{font-weight:bold}
+.codehilite .o{font-weight:bold}
+.codehilite .cm{color:#999988;font-style:italic}
+.codehilite .cp{color:#999999;font-weight:bold}
+.codehilite .c1{color:#999988;font-style:italic}
+.codehilite .cs{color:#999999;font-weight:bold;font-style:italic}
+.codehilite .gd{color:#000000;background-color:#ffdddd}
+.codehilite .ge{font-style:italic}
+.codehilite .gr{color:#aa0000}
+.codehilite .gh{color:#999999}
+.codehilite .gi{color:#000000;background-color:#ddffdd}
+.codehilite .go{color:#888888}
+.codehilite .gp{color:#555555}
+.codehilite .gs{font-weight:bold}
+.codehilite .gu{color:#800080;font-weight:bold}
+.codehilite .gt{color:#aa0000}
+.codehilite .kc{font-weight:bold}
+.codehilite .kd{font-weight:bold}
+.codehilite .kn{font-weight:bold}
+.codehilite .kp{font-weight:bold}
+.codehilite .kr{font-weight:bold}
+.codehilite .kt{color:#445588;font-weight:bold}
+.codehilite .m{color:#009999}
+.codehilite .s{color:#dd1144}
+.codehilite .n{color:#333333}
+.codehilite .na{color:teal}
+.codehilite .nb{color:#0086b3}
+.codehilite .nc{color:#445588;font-weight:bold}
+.codehilite .no{color:teal}
+.codehilite .ni{color:purple}
+.codehilite .ne{color:#990000;font-weight:bold}
+.codehilite .nf{color:#990000;font-weight:bold}
+.codehilite .nn{color:#555555}
+.codehilite .nt{color:navy}
+.codehilite .nv{color:teal}
+.codehilite .ow{font-weight:bold}
+.codehilite .w{color:#bbbbbb}
+.codehilite .mf{color:#009999}
+.codehilite .mh{color:#009999}
+.codehilite .mi{color:#009999}
+.codehilite .mo{color:#009999}
+.codehilite .sb{color:#dd1144}
+.codehilite .sc{color:#dd1144}
+.codehilite .sd{color:#dd1144}
+.codehilite .s2{color:#dd1144}
+.codehilite .se{color:#dd1144}
+.codehilite .sh{color:#dd1144}
+.codehilite .si{color:#dd1144}
+.codehilite .sx{color:#dd1144}
+.codehilite .sr{color:#009926}
+.codehilite .s1{color:#dd1144}
+.codehilite .ss{color:#990073}
+.codehilite .bp{color:#999999}
+.codehilite .vc{color:teal}
+.codehilite .vg{color:teal}
+.codehilite .vi{color:teal}
+.codehilite .il{color:#009999}
+.codehilite .gc{color:#999;background-color:#EAF2F5}
+</style><title>youtube_faces</title></head><body><article class="markdown-body"><h1 id="youtube-faces">YouTube Faces<a class="headerlink" href="#youtube-faces" title="Permanent link"></a></h1>
+<ul>
+<li>Ellen Saracini was captured into the LFW dataset because her husband was killed in the 9/11 attack<ul>
+<li><a href="https://chicago.cbslocal.com/2013/09/11/911-widow-recalls-last-words-with-pilot-husband-i-love-you/">https://chicago.cbslocal.com/2013/09/11/911-widow-recalls-last-words-with-pilot-husband-i-love-you/</a></li>
+<li><a href="https://www.pennmedicine.org/updates/blogs/giving-blog/2014/may/breast-cancer-ellen">https://www.pennmedicine.org/updates/blogs/giving-blog/2014/may/breast-cancer-ellen</a></li>
+</ul>
+</li>
+</ul></article></body></html> \ No newline at end of file
diff --git a/notes/datasets/youtube_faces.md b/notes/datasets/youtube_faces.md
new file mode 100644
index 00000000..3b73a69b
--- /dev/null
+++ b/notes/datasets/youtube_faces.md
@@ -0,0 +1,6 @@
+# YouTube Faces
+
+- Ellen Saracini was captured into the LFW dataset because her husband was killed in the 9/11 attack
+ - http://vis-www.cs.umass.edu/lfw/person/Ellen_Saracini.html
+ - https://chicago.cbslocal.com/2013/09/11/911-widow-recalls-last-words-with-pilot-husband-i-love-you/
+ - https://www.pennmedicine.org/updates/blogs/giving-blog/2014/may/breast-cancer-ellen \ No newline at end of file
diff --git a/notes/utils/bash_utils.md b/notes/utils/bash_utils.md
new file mode 100644
index 00000000..00623280
--- /dev/null
+++ b/notes/utils/bash_utils.md
@@ -0,0 +1,6 @@
+
+#### Grep
+
+Grep UTF-16 encoded files by converting them to UTF-8 first
+
+`iconv -f utf-16 -t utf-8 file.txt |grep yourtext` \ No newline at end of file