author    adamhrv <adam@ahprojects.com>  2019-02-14 14:45:18 +0100
committer adamhrv <adam@ahprojects.com>  2019-02-14 14:45:18 +0100
commit    3a3a89f2c58eceee07b2cfcfb1700a61b34619e5 (patch)
tree      436347b8466422a1019209f9f04937ea1ce0e4eb
parent    41247c08ea359d0a72a247992d2019ae2120536c (diff)
updates
-rw-r--r--  megapixels/app/models/data_store.py                             46
-rw-r--r--  megapixels/app/settings/app_cfg.py                               7
-rw-r--r--  megapixels/app/settings/types.py                                 5
-rw-r--r--  megapixels/app/utils/api_utils.py                               25
-rw-r--r--  megapixels/app/utils/identity_utils.py                          78
-rw-r--r--  megapixels/commands/datasets/citations_to_csv.py                53
-rw-r--r--  megapixels/notebooks/datasets/identity/identity_master.ipynb   633
-rw-r--r--  megapixels/notebooks/datasets/identity/identity_testing.ipynb   50
8 files changed, 813 insertions, 84 deletions
diff --git a/megapixels/app/models/data_store.py b/megapixels/app/models/data_store.py
index a8d6916f..b4260b9c 100644
--- a/megapixels/app/models/data_store.py
+++ b/megapixels/app/models/data_store.py
@@ -2,6 +2,7 @@ import os
from os.path import join
import logging
+from app.utils.logger_utils import Logger
from app.settings import app_cfg as cfg
from app.settings import types
@@ -11,41 +12,62 @@ from app.settings import types
# -------------------------------------------------------------------------
class DataStore:
+
    # local data store
+    log = Logger.getLogger()
+
    def __init__(self, opt_data_store, opt_dataset):
        self.data_store = join(f'/data_store_{opt_data_store.name.lower()}')
-        self.dir_dataset = join(self.data_store, 'datasets', cfg.DIR_PEOPLE, opt_dataset.name.lower())
-        self.dir_media = join(self.dir_dataset, 'media')
-        self.dir_metadata = join(self.dir_dataset, 'metadata')
+        self._dir_dataset = join(self.data_store, 'datasets', cfg.DIR_PEOPLE, opt_dataset.name.lower())
+        self._dir_media = join(self._dir_dataset, 'media')
+        self._dir_metadata = join(self._dir_dataset, 'metadata')

    def metadata(self, enum_type):
-        return join(self.dir_metadata, f'{enum_type.name.lower()}.csv')
+        return join(self._dir_metadata, f'{enum_type.name.lower()}.csv')
+
+    @property
+    def dir_dataset(self):
+        return self._dir_dataset
+
+    @property
+    def dir_media(self):
+        return self._dir_media
+
+    @property
+    def dir_media_original(self):
+        return join(self._dir_media, 'original')
+
+    @property
+    def dir_metadata(self):
+        return self._dir_metadata

    def metadata_dir(self):
-        return join(self.dir_metadata)
+        self.log.warn('deprecated. use dir_metadata')
+        return self._dir_metadata

    def media_dir(self):
-        return join(self.dir_media)
+        self.log.warn('deprecated. use dir_media')
+        return self._dir_media

    def media_images_original(self):
-        return join(self.dir_media, 'original')
+        return join(self._dir_media, 'original')

    def face(self, subdir, fn, ext):
        if subdir == '' or subdir is None:
            subdir = '.'
-        return join(self.dir_media, 'original', subdir, f'{fn}.{ext}')
+        return join(self._dir_media, 'original', subdir, f'{fn}.{ext}')

    def face_crop(self, subdir, fn, ext):
-        return join(self.dir_media, 'cropped', subdir, f'{fn}.{ext}')
+        return join(self._dir_media, 'cropped', subdir, f'{fn}.{ext}')

    def face_uuid(self, uuid, ext):
-        return join(self.dir_media, 'uuid', f'{uuid}.{ext}')
+        return join(self._dir_media, 'uuid', f'{uuid}.{ext}')

    def face_crop_uuid(self, uuid, ext):
-        return join(self.dir_media, 'uuid', f'{uuid}.{ext}')
+        return join(self._dir_media, 'uuid', f'{uuid}.{ext}')

    def uuid_dir(self):
-        return join(self.dir_media, 'uuid')
+        return join(self._dir_media, 'uuid')
class DataStoreS3:
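
The change above converts the public path attributes of DataStore to read-only properties and keeps the old accessor methods as deprecated shims that log a warning. A minimal usage sketch (the enum members exist in app.settings.types; the resolved path is illustrative and assumes a /data_store_hdd mount):

    from app.models.data_store import DataStore
    from app.settings import types

    ds = DataStore(types.DataStore.HDD, types.Dataset.LFW)
    print(ds.dir_media_original)  # e.g. /data_store_hdd/datasets/people/lfw/media/original
    print(ds.metadata_dir())      # deprecated shim: warns, returns the same path as ds.dir_metadata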
diff --git a/megapixels/app/settings/app_cfg.py b/megapixels/app/settings/app_cfg.py
index 2b10f9f0..0b1fb69d 100644
--- a/megapixels/app/settings/app_cfg.py
+++ b/megapixels/app/settings/app_cfg.py
@@ -6,6 +6,7 @@ from dotenv import load_dotenv
from app.settings import types
from app.utils import click_utils
+from pathlib import Path
import codecs
codecs.register(lambda name: codecs.lookup('utf8') if name == 'utf8mb4' else None)
@@ -26,6 +27,10 @@ FaceLandmark2D_5Var = click_utils.ParamVar(types.FaceLandmark2D_5)
FaceLandmark2D_68Var = click_utils.ParamVar(types.FaceLandmark2D_68)
FaceLandmark3D_68Var = click_utils.ParamVar(types.FaceLandmark3D_68)
+# base path
+DIR_SELF = os.path.dirname(os.path.realpath(__file__))
+DIR_ROOT = Path(DIR_SELF).parent.parent.parent
+
# # data_store
DATA_STORE = '/data_store_hdd/'
DATA_STORE_NAS = '/data_store_nas/'
@@ -64,7 +69,7 @@ DIR_TEST_IMAGES = join(DIR_APP, 'test', 'images')
# -----------------------------------------------------------------------------
# .env config for keys
# -----------------------------------------------------------------------------
-
+FP_KNOWLEDGE_GRAPH_ENV = join(DIR_ROOT, 'env/google_knowledge_graph_api.env')
# DIR_DOTENV = join(DIR_APP, '.env')
load_dotenv() # dotenv_path=DIR_DOTENV)
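
Note that DIR_ROOT is a pathlib.Path while the neighboring config values are plain strings; os.path.join() accepts path-like objects on Python 3.6+, so FP_KNOWLEDGE_GRAPH_ENV still ends up a str. A small sketch of that behavior (the repo root path here is hypothetical):

    from os.path import join
    from pathlib import Path

    root = Path('/work/megapixels_dev')  # hypothetical repo root
    fp = join(root, 'env/google_knowledge_graph_api.env')
    assert isinstance(fp, str)  # join() calls os.fspath() on Path arguments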
diff --git a/megapixels/app/settings/types.py b/megapixels/app/settings/types.py
index 933d1932..3d7e96c0 100644
--- a/megapixels/app/settings/types.py
+++ b/megapixels/app/settings/types.py
@@ -47,8 +47,9 @@ class Metadata(Enum):
    FACE_ATTRIBUTES, IMAGE_COUNT = range(10)

class Dataset(Enum):
-    LFW, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
-    CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI = range(16)
+    LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
+    CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI, \
+    LARGE_AGE_GAP = range(18)
# ---------------------------------------------------------------------
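
Caution: inserting VGG_FACE ahead of VGG_FACE2 shifts the auto-assigned integer value of every member after LFW, and LARGE_AGE_GAP extends the range to 18. Any dataset ids persisted as raw integers before this commit are now stale; name-based lookup is stable across such insertions:

    from app.settings import types

    # comparing and storing members by name survives re-numbering; raw ints do not
    assert types.Dataset['VGG_FACE2'] is types.Dataset.VGG_FACE2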
diff --git a/megapixels/app/utils/api_utils.py b/megapixels/app/utils/api_utils.py
index ec00113e..d9d67425 100644
--- a/megapixels/app/utils/api_utils.py
+++ b/megapixels/app/utils/api_utils.py
@@ -2,15 +2,21 @@ import json
import urllib
import urllib.request

+from app.settings import app_cfg
+from app.utils import file_utils, im_utils, logger_utils
+

class WikipediaAPI:

    url_base = 'https://en.wikipedia.org/w/api.php'
-
+    log = logger_utils.Logger.getLogger()
+    # https://en.wikipedia.org/w/api.php?redirects=&
+    # ppprop=displaytitle&prop=pageprops|pageimages|description&generator=prefixsearch
+    # &action=query&format=json&piprop=thumbnail&pithumbsize=160&pilimit=6&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=6
+
    def _url_builder(self, q):
        # https://www.mediawiki.org/wiki/API%3aProperties#Info%3a_Parameters
-
        params = {
            'redirects': '',
            'ppprop': 'displaytitle',
@@ -56,12 +62,16 @@ class WikipediaAPI:
            obj['wp_accessed'] = False
        return obj

-    def get_meta(self, query_obj):
+    def get_meta(self, query_obj, verbose=False):
        '''Searches Wikipedia API for query string'''
+
        if query_obj.get('wp_accessed', False):
            return query_obj
        else:
            url = self._url_builder(query_obj['query'])
+            if verbose:
+                self.log.debug(f'querying: {url}')
            return self._api_search(url)

    def search(self, q):
@@ -73,9 +83,14 @@ class WikipediaAPI:

class GoogleKnowledgeGraph:

    url_kg_api = 'https://kgsearch.googleapis.com/v1/entities:search'
+    log = logger_utils.Logger.getLogger()
+    fp_api_key = app_cfg.FP_KNOWLEDGE_GRAPH_ENV

-    def __init__(self, key):
-        self._api_key = key
+    def __init__(self, api_key=None):
+        if api_key is not None:
+            self._api_key = api_key
+        else:
+            # strip the trailing newline so the key is usable as a query param
+            self._api_key = open(self.fp_api_key).read().strip()

    def _get_kg_meta(self, result_obj, params):
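
With the key file path wired through app_cfg.FP_KNOWLEDGE_GRAPH_ENV, callers no longer have to read the key themselves. A usage sketch (the explicit key string is a placeholder):

    from app.utils import api_utils

    kg_api = api_utils.GoogleKnowledgeGraph()                   # reads the key from the .env file
    kg_api = api_utils.GoogleKnowledgeGraph(api_key='AIza...')  # or pass a key directly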
diff --git a/megapixels/app/utils/identity_utils.py b/megapixels/app/utils/identity_utils.py
index e090d16e..f9ed009e 100644
--- a/megapixels/app/utils/identity_utils.py
+++ b/megapixels/app/utils/identity_utils.py
@@ -5,22 +5,82 @@ import unidecode
import difflib
+from pathlib import Path

from app.settings import types
+from app.models.data_store import DataStore
from app.utils import logger_utils

log = logger_utils.Logger.getLogger()

+'''
+Reference copy of app.settings.types.Dataset:
+class Dataset(Enum):
+    LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \
+    CASIA_WEBFACE, AFW, PUBFIG83, HELEN, PIPA, MEGAFACE, BRAINWASH, IMDB_WIKI, \
+    LARGE_AGE_GAP = range(18)
+'''
+

# Get list of names based on Dataset type
-def get_names(enum_dataset):
-    if enum_dataset == types.Dataset.LFW:
-        dir_lfw = '/data_store_hdd/datasets/people/lfw/media/original/'
-        names_orig = [x for x in os.listdir(dir_lfw)]
+def get_names(opt_dataset, opt_data_store=types.DataStore.HDD):
+    data_store = DataStore(opt_data_store, opt_dataset)
+    dir_dataset = data_store.dir_dataset  # path to dataset root
+    dir_media_orig = data_store.dir_media_original
+    # default to empty lists so the stub branches below still return a valid result
+    names_orig = []
+    names_query = []
+    if opt_dataset == types.Dataset.AFW:
+        # Annotated Faces in the Wild
+        pass
+    elif opt_dataset == types.Dataset.BRAINWASH:
+        # Brainwash IP camera dataset
+        pass
+    elif opt_dataset == types.Dataset.CASIA_WEBFACE:
+        # CASIA WebFace
+        pass
+    elif opt_dataset == types.Dataset.HELEN:
+        # Helen
+        pass
+    elif opt_dataset == types.Dataset.IMDB_WIKI:
+        # IMDb-Wiki
+        pass
+    elif opt_dataset == types.Dataset.LARGE_AGE_GAP:
+        # Large Age Gap
+        pass
+    elif opt_dataset == types.Dataset.LFW:
+        # Labeled Faces in the Wild
+        names_orig = [x for x in os.listdir(dir_media_orig)]
        names_query = [x.replace('_', ' ') for x in names_orig]
-        result = {'names_orig': names_orig, 'names_query': names_query}
-    elif enum_dataset == types.Dataset.YOUTUBE_FACES:
-        names = [x for x in names if 'labeled faces.txt' not in x]
+    elif opt_dataset == types.Dataset.MEGAFACE:
+        # MegaFace
+        pass
+    elif opt_dataset == types.Dataset.MSCELEB:
+        # MS-Celeb-1M
+        pass
+    elif opt_dataset == types.Dataset.PIPA:
+        # People in Photo Albums
+        pass
+    elif opt_dataset == types.Dataset.PUBFIG83:
+        # PubFig83
+        names_orig = [x for x in os.listdir(dir_media_orig) if Path(x).suffix != '.txt']
+        names_query = [x.replace('_', ' ') for x in names_orig]
+    elif opt_dataset == types.Dataset.SCUT_FBP:
+        # SCUT Facial Beauty Perception
+        pass
+    elif opt_dataset == types.Dataset.UCCS:
+        # Unconstrained College Students
+        pass
+    elif opt_dataset == types.Dataset.UMD_FACES:
+        # University of Maryland Faces
+        pass
+    elif opt_dataset == types.Dataset.UTK:
+        # University of Tennessee Knoxville
+        pass
+    elif opt_dataset == types.Dataset.UCF_SELFIE:
+        # University of Central Florida Selfie
+        pass
+    elif opt_dataset == types.Dataset.VGG_FACE:
+        # Visual Geometry Group Face 1
+        pass
+    elif opt_dataset == types.Dataset.VGG_FACE2:
+        # Visual Geometry Group Face 2
+        pass
    else:
-        log.warn(f'{enum_dataset} not yet implemented')
-        result = {}
+        log.warn(f'{opt_dataset} not yet implemented')
+    result = {'names_orig': names_orig, 'names_query': names_query}
    return result
def similarity(a, b):
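
The rewritten get_names() resolves dataset paths through DataStore instead of hard-coding them and always returns both name lists, empty for the datasets that are still stubs. A usage sketch:

    from app.settings import types
    from app.utils import identity_utils

    names = identity_utils.get_names(types.Dataset.LFW)  # HDD data store is the default
    print(names['names_orig'][:5])   # directory names, e.g. 'Vicente_Fox'
    print(names['names_query'][:5])  # query form with underscores replaced, e.g. 'Vicente Fox'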
diff --git a/megapixels/commands/datasets/citations_to_csv.py b/megapixels/commands/datasets/citations_to_csv.py
index d96748e5..e54d0dac 100644
--- a/megapixels/commands/datasets/citations_to_csv.py
+++ b/megapixels/commands/datasets/citations_to_csv.py
@@ -8,11 +8,11 @@ log = Logger.getLogger()
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
-    help='Input license data CSV')
-@click.option('-o', '--output', 'opt_fp_out',
+    help='Input citation data file or folder')
+@click.option('-o', '--output', 'opt_dir_out',
    help='Output directory')
@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out):
+def cli(ctx, opt_fp_in, opt_dir_out):
    """Convert JSON to CSV"""

    import sys
@@ -30,27 +30,38 @@ def cli(ctx, opt_fp_in, opt_fp_out):
    log.info('Convert JSON to CSV')

    # load
-    with open(opt_fp_in, 'r') as fp:
-        json_data = json.load(fp)
+    if Path(opt_fp_in).is_dir():
+        fps_in = glob(join(opt_fp_in, '*.json'))
+    else:
+        fps_in = [opt_fp_in]

-    # parse
-    papers = []
-    dataset_key = json_data['paper']['key']
-    dataset_name = json_data['paper']['name']
-    papers_main = get_orig_paper(json_data)
-    papers += papers_main
-    papers_citations = get_citations(dataset_key, dataset_name, json_data)
-    papers += papers_citations
-    papers = [p.to_dict() for p in papers]
+    log.info(f'{fps_in}')
+
+    for fp_in in fps_in:
+        with open(fp_in, 'r') as fp:
+            json_data = json.load(fp)

-    # save
-    if not opt_fp_out:
-        fp_out = opt_fp_in.replace('.json','.csv')
-        log.info(fp_out)
+        # parse
+        papers = []
+        dataset_key = json_data['paper']['key']
+        dataset_name = json_data['paper']['name']
+        papers_main = get_orig_paper(json_data)
+        papers += papers_main
+        papers_citations = get_citations(dataset_key, dataset_name, json_data)
+        papers += papers_citations
+        papers = [p.to_dict() for p in papers]
+
+        # save
+        if not opt_dir_out:
+            # save to the same directory, replacing the extension
+            fp_out = fp_in.replace('.json', '.csv')
+        else:
+            # swap the extension here too so the CSV is not written to a .json filename
+            fp_out = join(opt_dir_out, Path(fp_in).stem + '.csv')

-    df_papers = pd.DataFrame.from_dict(papers)
-    df_papers.index.name = 'index'
-    df_papers.to_csv(fp_out)
+        df_papers = pd.DataFrame.from_dict(papers)
+        df_papers.index.name = 'id'
+        df_papers.to_csv(fp_out)
+        log.info(f'Wrote {len(df_papers):,} lines to {fp_out}')
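
The command now accepts either a single JSON file or a folder of them. Hypothetical invocations (the CLI entry point name is assumed):

    # single file; the CSV is written next to the input
    python megapixels.py datasets citations_to_csv -i citations/lfw.json
    # whole folder; CSVs are written to the output directory
    python megapixels.py datasets citations_to_csv -i citations/ -o /data_store_hdd/out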
diff --git a/megapixels/notebooks/datasets/identity/identity_master.ipynb b/megapixels/notebooks/datasets/identity/identity_master.ipynb
new file mode 100644
index 00000000..a48a7ba1
--- /dev/null
+++ b/megapixels/notebooks/datasets/identity/identity_master.ipynb
@@ -0,0 +1,633 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Identity Master List\n",
+ "\n",
+ "- start with MS Celeb Top1M\n",
+ "- then progressively add smaller datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%reload_ext autoreload\n",
+ "%autoreload 2\n",
+ "\n",
+ "import os\n",
+ "from os.path import join\n",
+ "from glob import glob\n",
+ "from pathlib import Path\n",
+ "import requests\n",
+ "import json\n",
+ "from pprint import pprint\n",
+ "from multiprocessing.pool import ThreadPool\n",
+ "import threading\n",
+ "import urllib.request\n",
+ "import difflib\n",
+ "import unidecode\n",
+ "\n",
+ "import slugify\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd\n",
+ "from scipy.io import loadmat\n",
+ "import numpy as np\n",
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('/work/megapixels_dev/megapixels')\n",
+ "from app.utils import api_utils, identity_utils\n",
+ "from app.settings import app_cfg\n",
+ "from app.settings import types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master.csv'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## MS Celeb Top 1M\n",
+ "\n",
+ "- add column for each spelling of name\n",
+ "- convert kg id to standard google format"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'\n",
+ "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name'])\n",
+ "df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')\n",
+ "n_groups = df_msceleb_top1m_groups.ngroups\n",
+ "print(f'{n_groups} groups')\n",
+ "df_msceleb_top1m.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df_msceleb_top1m.head(100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "abbrev_mappings = {\n",
+ " 'en-US': 'en',\n",
+ " 'en-GB': 'en',\n",
+ " 'es-419': 'es-419',\n",
+ " 'es'\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "msceleb_identities = {}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 120,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split_name_lang(name_lang):\n",
+ " '''Split name into name and language'''\n",
+ " if '@' in name_lang:\n",
+ " indexes = [i for i,x in enumerate(name_lang) if x == '@']\n",
+ " idx_max = (max(indexes))\n",
+ " lang = name_lang[(idx_max + 1):]\n",
+ " name = name_lang[:(idx_max)]\n",
+ " else:\n",
+ " name = name_lang\n",
+ " lang = ''\n",
+ " return {'name': name, 'lang': lang}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 122,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'name': 'r@destiny', 'lang': 'en-417'}"
+ ]
+ },
+ "execution_count": 122,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "split_name_lang('r@destiny@en')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0120e006a7564f5c82729a7050ef0386",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "msceleb_identities = {}\n",
+ "for mseleb_top1m_record in tqdm(mseleb_top1m_records):\n",
+ " id_kg = mseleb_top1m_record['id_kg']\n",
+ " if not id_kg in msceleb_identities.keys():\n",
+ " msceleb_identities[id_kg] = {}\n",
+ " name_lang = split_name_lang(mseleb_top1m_record['name'])\n",
+ " name = name_lang['name']\n",
+ " lang = name_lang['lang']\n",
+ " msceleb_identities[id_kg][lang] = name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 142,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import itertools\n",
+ "msceleb_identities_sm = dict(itertools.islice(msceleb_identities.items(), 0, 10))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 145,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Patrick Cummins en\n",
+ "Patrick Cummins pt\n",
+ "Mohamed Guessous en\n",
+ "Mohamed Guessous fr\n",
+ "محمد جسوس ar\n",
+ "Tsvetta Kaleynska en\n",
+ "Tsvetta Kaleynska es\n",
+ "Tsvetta Kaleynska fr\n",
+ "Цвета Калейнска bg\n",
+ "Цвета Калейнска ru\n",
+ "Caio Henrique Siqueira Sanchez en\n",
+ "Кајо Санчез sr\n",
+ "Julio Ríos Gallego ca\n",
+ "Julio Ríos Gallego en\n",
+ "Julio Ríos Gallego es\n",
+ "Nilson Ricardo da Silva Júnior en\n",
+ "ニルソン・リカルド・ダ・シルバ・ジュニオール ja\n",
+ "니우송 히카르두 다 시우바 주니오르 ko\n",
+ "Aleksej Aleksandrovič Starobinski sl\n",
+ "Alexei Alexandrowitsch Starobinski de\n",
+ "Alexei Starobinski pt\n",
+ "Alexei Starobinsky en\n",
+ "Alexeï Starobinski fr\n",
+ "Алексей Александрович Старобинский ru\n",
+ "Старобінський Олексій Олександрович uk\n",
+ "アレクセイ・スタロビンスキー ja\n",
+ "Hilda Rix Nicholas en\n",
+ "هیلدا ریکس نیکولاس fa\n",
+ "Behrouz Makvandi en\n",
+ "Бехруз Макванди ru\n",
+ "بهروز مکوندی fa\n",
+ "Borislav Terzić en\n",
+ "Борислав Терзић sr\n"
+ ]
+ }
+ ],
+ "source": [
+ "# de-duplicate names that use same spelling for multiple languages\n",
+ "for id_kg, name_langs in msceleb_identities_sm.items():\n",
+ " if 'en' in name_langs.keys():\n",
+ " name_en = name_langs['en']\n",
+ " for lang, name in name_langs.items():\n",
+ " print(name, lang)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "374a55f504084f14bd4d77fed0e2f4e4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n2 split is long: zh-Hant\n",
+ "n2 split is long: es-419\n",
+ "n2 split is long: fil\n",
+ "n2 split is long: en-GB\n",
+ "n2 split is long: en-US\n",
+ "n2 split is long: zh-HK\n",
+ "n2 split is long: fr-CA\n",
+ "n2 split is long: pt-PT\n",
+ "n2 split is long: ceb\n",
+ "n2 split is long: zorbla.de\n",
+ "n2 split is long: N\n",
+ "n2 split is long: hu\n",
+ "m.03zytg\tΑστέριος\"\n",
+ "n2 split is long: destiny\n",
+ "n2 split is long: Teng Boon Soon\n",
+ "n2 split is long: Yong Khoon Seng\n",
+ "n2 split is long: Tiki Anak Lafe\n",
+ "n2 split is long: Marcus Mojigoh\n",
+ "n2 split is long: Nyallau Anak Badak\n",
+ "n2 split is long: Bousou P\n",
+ "n2 split is long: evleaks\n"
+ ]
+ }
+ ],
+ "source": [
+ "messages = []\n",
+ "\n",
+ "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n",
+ " id_kg = id_kg.replace('m.', '/m/')\n",
+ " for df_row in msceleb_group.itertuples():\n",
+ " if '@' in df_row.name:\n",
+ " splits = df_row.name.split('@')\n",
+ " if not len(splits) > 1:\n",
+ " msg = f'only one split: {df_row.name}'\n",
+ " if not msg in messages:\n",
+ " print(msg)\n",
+ " messages.append(msg)\n",
+ " elif len(splits) > 1:\n",
+ " if len(splits[1]) != 2:\n",
+ " msg = f'n2 split is long: {splits[1]}'\n",
+ " if not msg in messages:\n",
+ " print(msg)\n",
+ " messages.append(msg)\n",
+ " else:\n",
+ " print(df_row.name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "475871ac6d08484cbec44d5ccf099bd8",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# iterate groups and flatten language variations into named columns\n",
+ "identities = []\n",
+ "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n",
+ " id_kg = id_kg.replace('m.', '/m/')\n",
+ " for df_row in msceleb_group.itertuples():\n",
+ " if '@' in df_row.name:\n",
+ " splits = df_row.name.split('@')\n",
+ " name = splits[0]\n",
+ " lang = splits[1] if len(splits) > 0 else 'en'\n",
+ " else:\n",
+ " # default to 'en'\n",
+ " lang = 'en'\n",
+ " name = df_row.name\n",
+ " col_name = f'ms_name_{lang}'\n",
+ " identities.append({'id_kg': id_kg, col_name: name})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[{'id_kg': 'm/01008l47', 'ms_name_en': 'Patrick Cummins'}, {'id_kg': 'm/01008l47', 'ms_name_pt': 'Patrick Cummins'}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(identities[0:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# temp save DataFrame to CSV\n",
+ "def save_identity_master(identities, fp_out=fp_master_identities):\n",
+ " df_identities_master = pd.DataFrame.from_dict(identities)\n",
+ " df_identities_master.index.name = 'id'\n",
+ " df_identities_master.to_csv(fp_master_identities)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Add image count data for MS Celeb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load lines\n",
+ "fp_msceleb_clean = '/data_store_hdd/datasets/people/msceleb/downloads/MS-Celeb-1M_clean_list.txt'\n",
+ "with open(fp_msceleb_clean,'r') as fp:\n",
+ " msceleb_lines = fp.readlines()\n",
+ "msceleb_files = {}\n",
+ "\n",
+ "# iterate lines and append all files\n",
+ "for filepath in msceleb_lines:\n",
+ " id_kg, fname = filepath.split('/')\n",
+ " id_kg = id_kg.replace('m.', '/m/')\n",
+ " if not id_kg in msceleb_files.keys():\n",
+ " msceleb_files[id_kg] = []\n",
+ " msceleb_files[id_kg].append(fname)\n",
+ "\n",
+ " # add count\n",
+ "for identity in identities:\n",
+ " id_kg = identity['id_kg']\n",
+ " if id_kg in msceleb_files.keys():\n",
+ " identity['msceleb_count'] = len(msceleb_files[identity['id_kg']])\n",
+ " else:\n",
+ " identity['msceleb_count'] = 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save (takes 30 seconds)\n",
+ "save_identity_master(identities) # encoding='utf-16' ??"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['id_kg',\n",
+ " 'ms_name_ Marcus Mojigoh',\n",
+ " 'ms_name_ Nyallau Anak Badak',\n",
+ " 'ms_name_ Teng Boon Soon',\n",
+ " 'ms_name_ Tiki Anak Lafe',\n",
+ " 'ms_name_ Yong Khoon Seng',\n",
+ " 'ms_name_Bousou P',\n",
+ " 'ms_name_N',\n",
+ " 'ms_name_af',\n",
+ " 'ms_name_am',\n",
+ " 'ms_name_ar',\n",
+ " 'ms_name_az',\n",
+ " 'ms_name_be',\n",
+ " 'ms_name_bg',\n",
+ " 'ms_name_bm',\n",
+ " 'ms_name_bn',\n",
+ " 'ms_name_bo',\n",
+ " 'ms_name_br',\n",
+ " 'ms_name_bs',\n",
+ " 'ms_name_ca',\n",
+ " 'ms_name_ceb',\n",
+ " 'ms_name_ck',\n",
+ " 'ms_name_co',\n",
+ " 'ms_name_cr',\n",
+ " 'ms_name_cs',\n",
+ " 'ms_name_cy',\n",
+ " 'ms_name_da',\n",
+ " 'ms_name_de',\n",
+ " 'ms_name_destiny',\n",
+ " 'ms_name_dz',\n",
+ " 'ms_name_el',\n",
+ " 'ms_name_en',\n",
+ " 'ms_name_en-GB',\n",
+ " 'ms_name_en-US',\n",
+ " 'ms_name_eo',\n",
+ " 'ms_name_es',\n",
+ " 'ms_name_es-419',\n",
+ " 'ms_name_et',\n",
+ " 'ms_name_eu',\n",
+ " 'ms_name_evleaks',\n",
+ " 'ms_name_fa',\n",
+ " 'ms_name_fi',\n",
+ " 'ms_name_fil',\n",
+ " 'ms_name_fo',\n",
+ " 'ms_name_fr',\n",
+ " 'ms_name_fr-CA',\n",
+ " 'ms_name_fy',\n",
+ " 'ms_name_ga',\n",
+ " 'ms_name_gd',\n",
+ " 'ms_name_gl',\n",
+ " 'ms_name_gn',\n",
+ " 'ms_name_gu',\n",
+ " 'ms_name_ha',\n",
+ " 'ms_name_hi',\n",
+ " 'ms_name_hr',\n",
+ " 'ms_name_ht',\n",
+ " 'ms_name_hu',\n",
+ " 'ms_name_hu\\r\\nm.03zytg\\tΑστέριος\"',\n",
+ " 'ms_name_hy',\n",
+ " 'ms_name_id',\n",
+ " 'ms_name_ig',\n",
+ " 'ms_name_is',\n",
+ " 'ms_name_it',\n",
+ " 'ms_name_iw',\n",
+ " 'ms_name_ja',\n",
+ " 'ms_name_ka',\n",
+ " 'ms_name_kk',\n",
+ " 'ms_name_kl',\n",
+ " 'ms_name_km',\n",
+ " 'ms_name_kn',\n",
+ " 'ms_name_ko',\n",
+ " 'ms_name_ku',\n",
+ " 'ms_name_ky',\n",
+ " 'ms_name_la',\n",
+ " 'ms_name_lb',\n",
+ " 'ms_name_lo',\n",
+ " 'ms_name_lt',\n",
+ " 'ms_name_lv',\n",
+ " 'ms_name_mg',\n",
+ " 'ms_name_mi',\n",
+ " 'ms_name_mk',\n",
+ " 'ms_name_ml',\n",
+ " 'ms_name_mn',\n",
+ " 'ms_name_mr',\n",
+ " 'ms_name_ms',\n",
+ " 'ms_name_mt',\n",
+ " 'ms_name_my',\n",
+ " 'ms_name_ne',\n",
+ " 'ms_name_nl',\n",
+ " 'ms_name_nn',\n",
+ " 'ms_name_no',\n",
+ " 'ms_name_nv',\n",
+ " 'ms_name_ny',\n",
+ " 'ms_name_oc',\n",
+ " 'ms_name_or',\n",
+ " 'ms_name_pa',\n",
+ " 'ms_name_pl',\n",
+ " 'ms_name_ps',\n",
+ " 'ms_name_pt',\n",
+ " 'ms_name_pt-PT',\n",
+ " 'ms_name_ro',\n",
+ " 'ms_name_ru',\n",
+ " 'ms_name_rw',\n",
+ " 'ms_name_sa',\n",
+ " 'ms_name_sc',\n",
+ " 'ms_name_se',\n",
+ " 'ms_name_si',\n",
+ " 'ms_name_sk',\n",
+ " 'ms_name_sl',\n",
+ " 'ms_name_sn',\n",
+ " 'ms_name_so',\n",
+ " 'ms_name_sq',\n",
+ " 'ms_name_sr',\n",
+ " 'ms_name_st',\n",
+ " 'ms_name_su',\n",
+ " 'ms_name_sv',\n",
+ " 'ms_name_sw',\n",
+ " 'ms_name_ta',\n",
+ " 'ms_name_te',\n",
+ " 'ms_name_tg',\n",
+ " 'ms_name_th',\n",
+ " 'ms_name_tr',\n",
+ " 'ms_name_ug',\n",
+ " 'ms_name_uk',\n",
+ " 'ms_name_ur',\n",
+ " 'ms_name_uz',\n",
+ " 'ms_name_vi',\n",
+ " 'ms_name_xh',\n",
+ " 'ms_name_yi',\n",
+ " 'ms_name_yo',\n",
+ " 'ms_name_zh',\n",
+ " 'ms_name_zh-HK',\n",
+ " 'ms_name_zh-Hant',\n",
+ " 'ms_name_zorbla.de',\n",
+ " 'ms_name_zu']"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "list(df_identities_master.keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "megapixels",
+ "language": "python",
+ "name": "megapixels"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
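
The notebook leaves identities as one record per (id_kg, language) pair, so a final consolidation step is still needed. A sketch of how the records could be collapsed to one row per identity with pandas (assumes the identities list built above):

    import pandas as pd

    df = pd.DataFrame.from_dict(identities)
    # first() keeps the first non-null value per column within each group,
    # merging the per-language ms_name_* columns into a single row per id_kg
    df_master = df.groupby('id_kg').first()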
diff --git a/megapixels/notebooks/datasets/identity/identity_testing.ipynb b/megapixels/notebooks/datasets/identity/identity_testing.ipynb
index 384cca93..3975d0c6 100644
--- a/megapixels/notebooks/datasets/identity/identity_testing.ipynb
+++ b/megapixels/notebooks/datasets/identity/identity_testing.ipynb
@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
- "execution_count": 155,
+ "execution_count": 186,
"metadata": {},
"outputs": [],
"source": [
@@ -54,23 +54,6 @@
]
},
{
- "cell_type": "code",
- "execution_count": 159,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/media/adam/ah8tb/work/megapixels_dev/env/google_knowledge_graph_api.env\n"
- ]
- }
- ],
- "source": [
- "print(app_cfg.FP_KNOWLEDGE_GRAPH_ENV)"
- ]
- },
- {
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -79,7 +62,7 @@
},
{
"cell_type": "code",
- "execution_count": 160,
+ "execution_count": 188,
"metadata": {},
"outputs": [
{
@@ -92,7 +75,8 @@
}
],
"source": [
- "names = identity_utils.get_names(types.Dataset.LFW)\n",
+ "names = identity_utils.get_names(types.Dataset.\n",
+ " )\n",
"print(names['names_query'][0:10])\n",
"print(names['names_orig'][0:10])"
]
@@ -108,14 +92,12 @@
},
{
"cell_type": "code",
- "execution_count": 161,
+ "execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"# read API key\n",
- "\n",
- "api_key = open(app_cfg.FP_KNOWLEDGE_GRAPH_ENV).read()\n",
- "kg_api = api_utils.GoogleKnowledgeGraph(api_key)\n",
+ "kg_api = api_utils.GoogleKnowledgeGraph()\n",
"wp_api = api_utils.WikipediaAPI()"
]
},
@@ -128,25 +110,23 @@
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": 165,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "wp\n",
+ "wp----\n",
"https://en.wikipedia.org/w/api.php?redirects=&ppprop=displaytitle&prop=pageprops%7Cpageimages%7Cdescription&generator=prefixsearch&action=query&format=json&piprop=thumbnail&pilimit=1&gpssearch=Vicente+Fox&gpsnamespace=0&gpslimit=1\n",
"{'wp_accessed': True,\n",
" 'wp_description': 'President of Mexico',\n",
" 'wp_name': 'Vicente Fox',\n",
" 'wp_page_id': '32836'}\n",
- "kg\n",
+ "kg----\n",
"{'kg_accessed': True,\n",
- " 'kg_bio': 'Vicente Fox Quesada, RSerafO is a Mexican businessman and '\n",
- " 'politician who served as the 55th President of Mexico from 1 '\n",
- " 'December 2000 to 30 November 2006.\\n',\n",
- " 'kg_bio_url': 'https://en.wikipedia.org/wiki/Vicente_Fox',\n",
+ " 'kg_bio': '',\n",
+ " 'kg_bio_url': '',\n",
" 'kg_description': 'Former President of Mexico',\n",
" 'kg_error': '',\n",
" 'kg_id': '/m/081f4',\n",
@@ -174,14 +154,15 @@
},
{
"cell_type": "code",
- "execution_count": 162,
+ "execution_count": 168,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.7714285714285716\n"
+ "0.7714285714285716\n",
+ "0.7142857142857143\n"
]
}
],
@@ -189,7 +170,8 @@
"#print(identity_utils.names_match('Andréss Iniestas', 'Andres Iniestalossas Jr.', as_float=True))\n",
"#print(identity_utils.names_match('Adoor Gopalakrishnan', 'Adoors Gopalakarishnan', as_float=True))\n",
"#print(identity_utils.names_match('Dave Letterman', 'David Letterman', as_float=True))\n",
- "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True))\n",
+ "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True, compound_score=True))\n",
+ "print(identity_utils.names_match('Charles Dickens', 'Charles Boooker', as_float=True, compound_score=False))\n",
"#print(identity_utils.names_match('Donald Trump', 'Donald J. Trump', as_float=True))\n",
"#print(identity_utils.names_match('Wang Fei', 'Fei Wang III', as_float=True))"
]