# Identity Master List

- [x] MS Celeb 1M
- UMD Faces
- FaceScrub
- LFW
- PubFig
- PubFig83
- VGG Face
- VGG Face2
- IJB-C
- CASIA Webface
- IMDB-Face
- IMDB-Wiki

In [6]:
%reload_ext autoreload
%autoreload 2

import os
from os.path import join
from glob import glob
from pathlib import Path
import requests
import json
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
import urllib.request
import difflib
import unidecode

import slugify
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import sys
sys.path.append('/work/megapixels_dev/megapixels')
from app.utils import api_utils, identity_utils
from app.settings import app_cfg
from app.settings import types

## MS Celeb Top 1M

- Add a column for each language-specific spelling of a name
- Convert knowledge-graph IDs from the `m.xxxx` form to the standard Google `/m/xxxx` format

In [7]:
# Output CSV that receives the identity master list
fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master_02.csv'

# MS-Celeb download directory and the "clean list" text file inside it
dir_msceleb_dloads = '/data_store_hdd/datasets/people/msceleb/downloads/'
fp_msceleb_clean_txt = os.path.join(dir_msceleb_dloads, 'MS-Celeb-1M_clean_list.txt')

In [8]:
# Load the Top1M name table: headerless TSV with one (id_kg, "name@lang") pair per row
fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'
df_msceleb_top1m = pd.read_csv(
  fp_msceleb_top1m,
  sep='\t',
  header=None,
  names=['id_kg', 'name_lang'],
  encoding='utf-8',
)

# group the name variations by identity; the group count is the number of unique ids
df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')
n_groups = df_msceleb_top1m_groups.ngroups

In [38]:
# Bucket every name variation under its lowercased first character, so a lookup
# only has to scan the names that share that initial (the dict itself is not sorted).
# NOTE(review): uses split_name_lang, which is defined in a later cell (In [14]);
# on a fresh kernel that cell has to run before this one.
msceleb_top1m_az = {}
for msceleb_row in tqdm(df_msceleb_top1m.itertuples(), total=len(df_msceleb_top1m)):
  name_lang = split_name_lang(msceleb_row.name_lang)
  name = name_lang['name']
  c = name[0].lower()
  msceleb_top1m_az.setdefault(c, []).append({'name': name, 'id_kg': msceleb_row.id_kg})

HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))

In [39]:
# preview the first rows of the (id_kg, name_lang) table
df_msceleb_top1m.head()

Unnamed: 0,id_kg,name_lang
0,m.01008l47,Patrick Cummins@en
1,m.01008l47,Patrick Cummins@pt
2,m.01008l96,Mohamed Guessous@en
3,m.01008l96,Mohamed Guessous@fr
4,m.01008l96,محمد جسوس@ar


In [40]:
# table size vs. number of distinct id_kg groups (n_groups comes from the groupby on 'id_kg')
print(f'There are {len(df_msceleb_top1m):,} total name variations')
print(f'There are {n_groups:,} unique identities')

There are 3,481,186 total name variations
There are 1,000,000 unique identities


In [12]:
# convert DataFrame to a list of row dicts ({'id_kg': ..., 'name_lang': ...}),
# one per name variation, for plain-Python iteration
mseleb_top1m_records = df_msceleb_top1m.to_dict('records')

In [13]:
# store all identity info here (keyed by id_kg), until creating dataframe
# NOTE(review): the conversion cell further down re-initialises this dict itself
msceleb_identities = {}

In [14]:
# utility functions
def split_name_lang(name_lang):
  '''Separate a "name@lang" string at its last "@".

  Returns a dict with keys 'name' and 'lang'. When the input has no "@",
  the whole string is the name and 'lang' is an empty string.
  '''
  if '@' not in name_lang:
    return {'name': name_lang, 'lang': ''}
  # rpartition cuts at the right-most "@", so names containing "@" stay intact
  name, _, lang = name_lang.rpartition('@')
  return {'name': name, 'lang': lang}

# temp save DataFrame to CSV
def save_identity_master(identities, fp_out=fp_master_identities):
  '''Write the identity records to a CSV checkpoint.

  :param identities: data accepted by pd.DataFrame.from_dict (the notebook passes a list of flat dicts)
  :param fp_out: destination CSV path; defaults to fp_master_identities
  '''
  df_identities_master = pd.DataFrame.from_dict(identities)
  # the row index is written as the first CSV column, labelled 'id'
  df_identities_master.index.name = 'id'
  # honour the caller-supplied path (previously fp_out was ignored)
  df_identities_master.to_csv(fp_out)

In [15]:
# regroup the flat "name@lang" rows into {id_kg: {'names': {lang: name}}}
msceleb_identities = {}
for mseleb_top1m_record in tqdm(mseleb_top1m_records):
  # ids arrive as 'm.xxxx' and are stored as '/m/xxxx'
  id_kg = mseleb_top1m_record['id_kg'].replace('m.','/m/')
  names_by_lang = msceleb_identities.setdefault(id_kg, {'names': {}})['names']
  name_lang = split_name_lang(mseleb_top1m_record['name_lang'])
  name, lang = name_lang['name'], name_lang['lang']
  # the english spelling doubles as the canonical name
  if lang == 'en':
    names_by_lang['canonical'] = name
  names_by_lang[lang] = name

HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))




### Patch @en names

In [16]:
# report every identity that has no 'en' entry among its names
for id_kg, attrs in tqdm(msceleb_identities.items()):
  lang_attrs = attrs['names']
  name_en = lang_attrs.get('en')
  if name_en:
    continue
  print(f'no english name for {id_kg}')

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

no english name for /m/017vbn
no english name for /m/026q0k_
no english name for /m/02k2kw
no english name for /m/0bwhrg1



In [17]:
# the 4 identities without an english name: use the listed language's
# spelling as both the 'en' and the 'canonical' name
en_exceptions = {
  '/m/017vbn': 'de',
  '/m/026q0k_': 'nl',
  '/m/02k2kw': 'de',
  '/m/0bwhrg1': 'it',
}
for id_kg, lang in en_exceptions.items():
  patched_names = msceleb_identities[id_kg]['names']
  # targets are assigned left to right: 'en' first, then 'canonical'
  patched_names['en'] = patched_names['canonical'] = patched_names[lang]
  print(f'patched {id_kg} {lang} to en')

patched /m/017vbn de to en
patched /m/026q0k_ nl to en
patched /m/02k2kw de to en
patched /m/0bwhrg1 it to en


### Remove duplicate names

In [18]:
# de-duplicate names that use same spelling for multiple languages:
# any language entry whose spelling equals the canonical name is dropped,
# keeping only the 'en' and 'canonical' entries for that spelling
items_removed = []

# only the inner per-identity name dicts are mutated, so the outer dict can be iterated directly
for id_kg, attrs in tqdm(msceleb_identities.items()):
  lang_attrs = attrs['names']
  name_main = lang_attrs.get('canonical', None)
  # guard on this identity's own canonical name (previously tested a stale
  # variable left over from an earlier cell, so the check never looked at this row)
  if not name_main:
    print('error. all names need "en"')
    break
  # iterate over a snapshot because entries are popped while looping
  for lang, name in list(lang_attrs.items()):
    if name == name_main and lang not in ('en', 'canonical'):
      # remove it
      items_removed.append(lang_attrs.pop(lang))

print(f'removed {len(items_removed):,} duplicate names')
del items_removed

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))


removed 1,485,336 duplicate names


### Count images per person for ms celeb

In [19]:
# calculate total images per id: map '/m/xxxx' -> list of file names
msceleb_files = {}
# load text file
with open(fp_msceleb_clean_txt,'r') as fp:
  msceleb_lines = fp.readlines()

# iterate lines and append all files
for filepath in tqdm(msceleb_lines):
  # readlines() keeps the trailing newline; strip it so it does not end up in fname
  filepath = filepath.strip()
  if not filepath:
    # tolerate blank lines (e.g. at end of file) instead of failing on the unpack below
    continue
  id_kg, fname = filepath.split('/')
  id_kg = id_kg.replace('m.', '/m/')
  msceleb_files.setdefault(id_kg, []).append(fname)

HBox(children=(IntProgress(value=0, max=5049824), HTML(value='')))




In [20]:
# add the image count to each identity (0 when the id has no entry in msceleb_files)
for id_kg, attrs in tqdm(msceleb_identities.items()):
  count = len(msceleb_files.get(id_kg, []))
  msceleb_identities[id_kg]['count_msceleb'] = count

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))




In [21]:
# parallel lists in dict order: image count per identity and the matching id_kg
im_counts_idxs = [attrs['count_msceleb'] for attrs in msceleb_identities.values()]
im_counts_id_kg = list(msceleb_identities.keys())

In [22]:
# print stats: identity with the most images
idx_max = np.argmax(im_counts_idxs)
id_kg_max = im_counts_id_kg[idx_max]
count_max = im_counts_idxs[idx_max]
name_max = msceleb_identities[id_kg_max]['names']['canonical']
print(f'Most images {count_max:,} for {name_max}')
# distribution: how many identities exceed each image-count threshold
im_counts_idxs = np.array(im_counts_idxs)
for threshold in (10, 20, 50, 100):
  print(f'{len(im_counts_idxs[im_counts_idxs > threshold]):,} more than {threshold}')

Most images 130 for Leelee Sobieski
88,244 more than 10
78,027 more than 20
49,042 more than 50
5,025 more than 100


In [23]:
# flatten msceleb_identities into a list of dicts, one per identity:
# 'canonical' becomes 'name_msceleb', every other language 'name_msceleb_<lang>'
identities_flat = []
for id_kg, attrs in tqdm(msceleb_identities.items()):
  obj = {'id_kg': id_kg}
  for lang, name in attrs['names'].items():
    col_name = 'name_msceleb' if lang == 'canonical' else f'name_msceleb_{lang}'
    obj[col_name] = name
  obj['count_msceleb'] = attrs['count_msceleb']
  identities_flat.append(obj)

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))




In [24]:
# convert the list of flat identity dicts to a dataframe (one row per identity)
df_identities = pd.DataFrame.from_dict(identities_flat)

In [25]:
# save checkpoint CSV (no path given, so the default master-identities path is used)
save_identity_master(identities_flat)  # TODO: decide on output encoding (utf-16?)

In [26]:
# carry the msceleb data forward as the master dict, then drop the old name
# (shallow copy: the per-identity dicts are the same objects)
identities = dict(msceleb_identities)
del msceleb_identities

## LFW

In [28]:
# add LFW data: each line of the names file is "<Name_With_Underscores>\t<count>"
fp_lfw = '/data_store_hdd/datasets/people/lfw/downloads/lfw-names.txt'
with open(fp_lfw,'r') as fp:
  lfw_lines = [x.strip() for x in fp.readlines()]

lfw_meta = []
for lfw_line in lfw_lines:
  name_orig, count = lfw_line.split('\t')
  # underscores in the LFW name become spaces; count is kept as the string read from the file
  name_clean = name_orig.replace('_',' ')
  obj = dict(name_orig=name_orig, name=name_clean, count=count)
  lfw_meta.append(obj)

In [29]:
# working copy for the matching cells; shallow, so the per-identity dicts are shared with `identities`
identities_tmp = identities.copy()

In [46]:
# make exact (case-insensitive) name matches, scanning only the msceleb
# names filed under the LFW name's first letter; the first hit wins
# NOTE(review): ids stored here keep the raw 'm.xxxx' form from the TSV, while
# `identities` is keyed by '/m/xxxx' -- confirm downstream code handles both
lfw_name_matches_exact = {}
for lfw_item in tqdm(lfw_meta):
  lfw_name = lfw_item['name']  # name is transformed original name
  lfwnl = lfw_name.lower()
  c = lfwnl[0]
  for name_id_kg in msceleb_top1m_az[c]:
    name = name_id_kg['name']
    id_kg = name_id_kg['id_kg']
    if lfwnl != name.lower():
      continue
    lfw_name_matches_exact[lfw_name] = id_kg
    print(f"matched: {lfw_name} to {name} with id: {id_kg}")
    break

HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))

matched: Aaron Eckhart to Aaron Eckhart with id: m.03t4cz
matched: Aaron Guiel to Aaron Guiel with id: m.0bsy4r
matched: Aaron Peirsol to Aaron Peirsol with id: m.03p4zn
matched: Aaron Sorkin to Aaron Sorkin with id: m.01d8yn
matched: Aaron Tippin to Aaron Tippin with id: m.01k8mzv
matched: Abba Eban to Abba Eban with id: m.01341q
matched: Abbas Kiarostami to Abbas Kiarostami with id: m.023t0q
matched: Abdoulaye Wade to Abdoulaye Wade with id: m.023066
matched: Abdul Rahman to Abdul Rahman with id: m.01_t7m
matched: Abdullah to Abdullah with id: m.081x04
matched: Abdullah Ahmad Badawi to Abdullah Ahmad Badawi with id: m.01yynm
matched: Abel Aguilar to Abel Aguilar with id: m.08q415
matched: Abel Pacheco to Abel Pacheco with id: m.022vvl
matched: Abid Hamid Mahmud Al-Tikriti to Abid Hamid Mahmud al-Tikriti with id: m.03h33y
matched: Abraham Foxman to Abraham Foxman with id: m.03rtwj
matched: Adam Ant to Adam Ant with id: m.01wgjj5
matched: Adam Freier to Adam Freier with id: m.0f3mns
ma

matched: Ana Palacio to Ana Palacio with id: m.0606x3
matched: Anastasia Kelesidou to Anastasia Kelesidou with id: m.07ztqs
matched: Anastasia Myskina to Anastasia Myskina with id: m.031h9y
matched: Anatoliy Kinakh to Anatoliy Kinakh with id: m.04hl61
matched: Anders Fogh Rasmussen to Anders Fogh Rasmussen with id: m.01p9wn
matched: Andre Agassi to Andre Agassi with id: m.0hdr
matched: Andre Lange to Andre Lange with id: m.0bsp5t
matched: Andrea Bocelli to Andrea Bocelli with id: m.02b25y
matched: Andrea De Cruz to Andrea De Cruz with id: m.0j7z2dh
matched: Andrea Yates to Andrea Yates with id: m.01tr5l
matched: Andreas Vinciguerra to Andreas Vinciguerra with id: m.0bdr9r
matched: Andrei Konchalovsky to Andrei Konchalovsky with id: m.03lhhq
matched: Andrei Mikhnevich to Andrei Mikhnevich with id: m.09dlrc
matched: Andrei Nikolishin to Andrei Nikolishin with id: m.0927tv
matched: Andrew Bernard to Andrew Bernard with id: m.02r6722
matched: Andrew Caldecott to Andrew Caldecott with id: m

matched: Bernardo Segura to Bernardo Segura with id: m.0bnr7m
matched: Bertie Ahern to Bertie Ahern with id: m.09_9nl
matched: Bertrand Bonello to Bertrand Bonello with id: m.02rzksd
matched: Beth Jones to Beth Jones with id: m.05v_jh0
matched: Bettina Rheims to Bettina Rheims with id: m.03pfyf
matched: Betty Williams to Betty Williams with id: m.02848d
matched: Bianca Jagger to Bianca Jagger with id: m.04l_7q
matched: Bijan Namdar Zangeneh to Bijan Namdar Zangeneh with id: m.07wh4x
matched: Bill Belichick to Bill Belichick with id: m.02_fs7
matched: Bill Butler to Bill Butler with id: m.025v92r
matched: Bill Callahan to Bill Callahan with id: m.05v5sz
matched: Bill Cartwright to Bill Cartwright with id: m.05zr5r1
matched: Bill Clancy to Bill Clancy with id: m.0fqq2b9
matched: Bill Clinton to Bill Clinton with id: m.0157m
matched: Bill Curry to Bill Curry with id: m.09zdxg
matched: Bill Doba to Bill Doba with id: m.07jd41
matched: Bill Elliott to Bill Elliott with id: m.04dzh1r
matched

matched: Candice Bergen to Candice Bergen with id: m.04p_yxz
matched: Candie Kung to Candie Kung with id: m.075hcr
matched: Carey Lowell to Carey Lowell with id: m.041l94
matched: Carin Koch to Carin Koch with id: m.0djjc7
matched: Carl Levin to Carl Levin with id: m.01xh6j
matched: Carl Pope to Carl Pope with id: m.0dnqf0
matched: Carl Reiner to Carl Reiner with id: m.0pnf3
matched: Carla Del Ponte to Carla Del Ponte with id: m.04y3tl
matched: Carla Gay Balingit to Carla Gay Balingit with id: m.0bz3cc
matched: Carla Gugino to Carla Gugino with id: m.06qgvf
matched: Carla Moreno to Carla Moreno with id: m.05tg6p
matched: Carla Sullivan to Carla Sullivan with id: m.0h7mj1v
matched: Carlo Ancelotti to Carlo Ancelotti with id: m.049_yr
matched: Carlo Azeglio Ciampi to Carlo Azeglio Ciampi with id: m.01l__3
matched: Carlos Alberto Parreira to Carlos Alberto Parreira with id: m.037n_q
matched: Carlos Arroyo to Carlos Arroyo with id: m.076x8t8
matched: Carlos Barra to Carlos Barra with id: m

matched: Cindy Klassen to Cindy Klassen with id: m.08z9xg
matched: Cindy Margolis to Cindy Margolis with id: m.05vxct
matched: Cindy Taylor to Cindy Taylor with id: m.07dn76
matched: Ciro Gomes to Ciro Gomes with id: m.0b4fnb
matched: Claire Danes to Claire Danes with id: m.01gq0b
matched: Claire Tomalin to Claire Tomalin with id: m.0253pn
matched: Clare Short to Clare Short with id: m.01bl0h
matched: Claude Jorda to Claude Jorda with id: m.0gjcvjj
matched: Claudia Cardinale to Claudia Cardinale with id: m.01j5sv
matched: Claudia Coslovich to Claudia Coslovich with id: m.0269h89
matched: Claudia Pechstein to Claudia Pechstein with id: m.07hxkb
matched: Claudia Schiffer to Claudia Schiffer with id: m.0m2wm
matched: Claudine Farrell to Claudine Farrell with id: m.0b67b9d
matched: Claudio Abbado to Claudio Abbado with id: m.01l9pz
matched: Claudio Ranieri to Claudio Ranieri with id: m.02_jy3
matched: Clay Aiken to Clay Aiken with id: m.01y3qy
matched: Cliff Ellis to Cliff Ellis with id: m

matched: David Modell to David Modell with id: m.064p_85
matched: David Montoya to David Montoya with id: m.05bzzst
matched: David Myers to David Myers with id: m.03cndkb
matched: David Nalbandian to David Nalbandian with id: m.031gr8
matched: David Obey to David Obey with id: m.024tlf
matched: David Oh to David Oh with id: m.0j25c_3
matched: David Provost to David Provost with id: m.02rtdpx
matched: David Shayler to David Shayler with id: m.02cgqp
matched: David Siegel to David Siegel with id: m.0k78yt9
matched: David Spade to David Spade with id: m.02dlfh
matched: David Suazo to David Suazo with id: m.043ftz
matched: David Wells to David Wells with id: m.05x3yv6
matched: David Wolf to David Wolf with id: m.02d2pt
matched: Davis Love III to Davis Love III with id: m.04ymst
matched: Dawn Staley to Dawn Staley with id: m.03mq61
matched: Dean Barker to Dean Barker with id: m.05b4zh7
matched: Dean Barkley to Dean Barkley with id: m.0126_6
matched: Dean Sheremet to Dean Sheremet with id: m

matched: Elgin Baylor to Elgin Baylor with id: m.0267d_
matched: Eli Broad to Eli Broad with id: m.04sj1_
matched: Eli Rosenbaum to Eli Rosenbaum with id: m.0f3zzx
matched: Eliane Karp to Eliane Karp with id: m.04ps8y
matched: Elijah Wood to Elijah Wood with id: m.015t56
matched: Elin Nordegren to Elin Nordegren with id: m.03sxrx
matched: Elinor Caplan to Elinor Caplan with id: m.02g5nw
matched: Elisabeth Schumacher to Elisabeth Schumacher with id: m.0887_w
matched: Elisabeth Welch to Elisabeth Welch with id: m.05b0k_2
matched: Elisha Cuthbert to Elisha Cuthbert with id: m.01yhvv
matched: Eliza Dushku to Eliza Dushku with id: m.01jb26
matched: Eliza Manningham-Buller to Eliza Manningham-Buller with id: m.02yqds
matched: Elizabeth Dole to Elizabeth Dole with id: m.01f0yh
matched: Elizabeth Hill to Elizabeth Hill with id: m.03wgyj9
matched: Elizabeth Hurley to Elizabeth Hurley with id: m.01pnn3
matched: Elizabeth Smart to Elizabeth Smart with id: m.01ddh7
matched: Ellen Barkin to Ellen B

matched: Frederique van der Wal to Frederique van der Wal with id: m.048qvh
matched: Fruit Chan to Fruit Chan with id: m.042l27
matched: Fujio Cho to Fujio Cho with id: m.09z2gr
matched: Fujio Mitarai to Fujio Mitarai with id: m.05c4qxx
matched: Gabi Zimmer to Gabi Zimmer with id: m.0jl1v
matched: Gabriel Batistuta to Gabriel Batistuta with id: m.02pt11
matched: Gabrielle Rose to Gabrielle Rose with id: m.02qptzd
matched: Gabrielle Union to Gabrielle Union with id: m.02xbw2
matched: Galen Rowell to Galen Rowell with id: m.01jgqg
matched: Gao Qiang to Gao Qiang with id: m.04160s4
matched: Garry Kasparov to Garry Kasparov with id: m.03c5y
matched: Garry McCoy to Garry McCoy with id: m.07tmq9
matched: Garry Trudeau to Garry Trudeau with id: m.037w1
matched: Garth Drabinsky to Garth Drabinsky with id: m.0499yh
matched: Gary Barnett to Gary Barnett with id: m.0chjvp
matched: Gary Bauer to Gary Bauer with id: m.03kn29
matched: Gary Bettman to Gary Bettman with id: m.02d2q4
matched: Gary Cart

matched: Habib Rizieq to Habib Rizieq with id: m.0v_sx80
matched: Hal McCoy to Hal McCoy with id: m.0dc05b
matched: Hal Sutton to Hal Sutton with id: m.02_yd9
matched: Halle Berry to Halle Berry with id: m.03knl
matched: Ham Pong-sil to Ham Pong-sil with id: m.027dn8v
matched: Hama Arba Diallo to Hama Arba Diallo with id: m.03d2bph
matched: Hamad Bin Isa al-Khalifa to Hamad bin Isa al-Khalifa with id: m.022vq5
matched: Hamid Karzai to Hamid Karzai with id: m.0kxrb
matched: Hamzah Haz to Hamzah Haz with id: m.04c9qm
matched: Hana Makhmalbaf to Hana Makhmalbaf with id: m.06f4jm
matched: Hana Sadiq to Hana Sadiq with id: m.02w752_
matched: Hanan Ashrawi to Hanan Ashrawi with id: m.036df9
matched: Hank Aaron to Hank Aaron with id: m.03q8y
matched: Hank Azaria to Hank Azaria with id: m.0sw6g
matched: Hank Stram to Hank Stram with id: m.01wtqh
matched: Hannah Stockbauer to Hannah Stockbauer with id: m.072jm3
matched: Hans-Christian Schmid to Hans-Christian Schmid with id: m.03k5zw
matched: H

matched: Jackie Chan to Jackie Chan with id: m.0v39r93
matched: Jackie Dennis to Jackie Dennis with id: m.065z8gk
matched: Jackie Sherrill to Jackie Sherrill with id: m.0bgjc8
matched: Jacky Cheung to Jacky Cheung with id: m.01qgbxt
matched: Jacqueline Gold to Jacqueline Gold with id: m.027brng
matched: Jacqueline Obradors to Jacqueline Obradors with id: m.04jlpk
matched: Jacques Chirac to Jacques Chirac with id: m.09xg8
matched: Jacques Kallis to Jacques Kallis with id: m.02r8yc
matched: Jacques Rogge to Jacques Rogge with id: m.0n15h
matched: Jacques Villeneuve to Jacques Villeneuve with id: m.0h633
matched: Jada Pinkett Smith to Jada Pinkett Smith with id: m.01j7z7
matched: Jade Jagger to Jade Jagger with id: m.066xc8
matched: Jafar Umar Thalib to Jafar Umar Thalib with id: m.010qk38s
matched: Jaime Pressly to Jaime Pressly with id: m.03c5bz
matched: Jake Gyllenhaal to Jake Gyllenhaal with id: m.02js6_
matched: Jake Plummer to Jake Plummer with id: m.03v3j_
matched: Jakob Kellenberg

matched: Jeremy Greenstock to Jeremy Greenstock with id: m.0387qf
matched: Jeremy Shockey to Jeremy Shockey with id: m.076ltd
matched: Jeremy Wotherspoon to Jeremy Wotherspoon with id: m.04zcwx
matched: Jeri Ryan to Jeri Ryan with id: m.023pzh
matched: Jerome Jenkins to Jerome Jenkins with id: m.02qxw5z
matched: Jerry Angelo to Jerry Angelo with id: m.0ccyjp
matched: Jerry Bruckheimer to Jerry Bruckheimer with id: m.01t6b4
matched: Jerry Colangelo to Jerry Colangelo with id: m.01cj88
matched: Jerry Falwell to Jerry Falwell with id: m.046l2
matched: Jerry Hall to Jerry Hall with id: m.01ycq7
matched: Jerry Jones to Jerry Jones with id: m.011lt204
matched: Jerry Lewis to Jerry Lewis with id: m.0427y
matched: Jerry Regier to Jerry Regier with id: m.0bh7n9p
matched: Jerry Rice to Jerry Rice with id: m.0240vt
matched: Jerry Seinfeld to Jerry Seinfeld with id: m.0q5hw
matched: Jerry Sloan to Jerry Sloan with id: m.02h73f
matched: Jerry Springer to Jerry Springer with id: m.01v0432
matched: J

matched: John Petty to John Petty with id: m.07k5c03
matched: John Prescott to John Prescott with id: m.010qg14c
matched: John Reid to John Reid with id: m.01hps7
matched: John Rigas to John Rigas with id: m.03ms0t
matched: John Rowe to John Rowe with id: m.03d4g86
matched: John Rowland to John Rowland with id: m.05mxxpn
matched: John Ruiz to John Ruiz with id: m.0hq_m4_
matched: John Rusnak to John Rusnak with id: m.02jvb3
matched: John Salazar to John Salazar with id: m.04cv8k
matched: John Scarlett to John Scarlett with id: m.02r514r
matched: John Stallworth to John Stallworth with id: m.04s_9g
matched: John Starks to John Starks with id: m.09q0hm
matched: John Stockton to John Stockton with id: m.01r6lw
matched: John Sweeney to John Sweeney with id: m.02r3jft
matched: John Swofford to John Swofford with id: m.025x140
matched: John Taylor to John Taylor with id: m.018yvh
matched: John Thune to John Thune with id: m.03ybyn
matched: John Timoney to John Timoney with id: m.04ctvcy
matc

matched: Keith Bogans to Keith Bogans with id: m.0674p6
matched: Keith Foulke to Keith Foulke with id: m.037721
matched: Keith Lockhart to Keith Lockhart with id: m.08l0r7
matched: Keith Olbermann to Keith Olbermann with id: m.01p0m6
matched: Keith Osik to Keith Osik with id: m.02vrcpv
matched: Keith Snyder to Keith Snyder with id: m.0dm2_ws
matched: Keith Tyson to Keith Tyson with id: m.01c23p
matched: Keith Urban to Keith Urban with id: m.05cljf
matched: Keith Van Horn to Keith Van Horn with id: m.02_bdt
matched: Keizo Yamada to Keizo Yamada with id: m.04ydptq
matched: Kelli White to Kelli White with id: m.06br7z
matched: Kellie Coffey to Kellie Coffey with id: m.01pfh3w
matched: Kelly Clarkson to Kelly Clarkson with id: m.025ldg
matched: Kelly Osbourne to Kelly Osbourne with id: m.0p3r8
matched: Kelly Ripa to Kelly Ripa with id: m.0164c4
matched: Kelly Santos to Kelly Santos with id: m.04gtwm5
matched: Kelsey Grammer to Kelsey Grammer with id: m.04cl1
matched: Kelvin Sampson to Kelv

matched: Lee Ann Womack to Lee Ann Womack with id: m.01tfj0
matched: Lee Baca to Lee Baca with id: m.0d9vhb
matched: Lee Chang-dong to Lee Chang-dong with id: m.01pxrx
matched: Lee Hoi-chang to Lee Hoi-chang with id: m.08cd2r
matched: Lee Hong-ki to Lee Hong-ki with id: m.0bmhgrg
matched: Lee Hyung-taik to Lee Hyung-taik with id: m.03sb3n
matched: Lee Tae-sik to Lee Tae-sik with id: m.0ftc93
matched: Leisel Jones to Leisel Jones with id: m.03rn7c
matched: Lela Rochon to Lela Rochon with id: m.07jtyq
matched: Leland Chapman to Leland Chapman with id: m.05yqrl
matched: Lena Katina to Lena Katina with id: m.02q40x
matched: Lena Olin to Lena Olin with id: m.01y64_
matched: Lene Espersen to Lene Espersen with id: m.05g4sr
matched: Lennart Johansson to Lennart Johansson with id: m.067r7h
matched: Lennox Lewis to Lennox Lewis with id: m.0fjbs
matched: Lenny Kravitz to Lenny Kravitz with id: m.0161sp
matched: Lenny Wilkens to Lenny Wilkens with id: m.03h_x0
matched: Leon Barmore to Leon Barmor

matched: Maria Wetterstrand to Maria Wetterstrand with id: m.03mx19
matched: Mariah Carey to Mariah Carey with id: m.04xrx
matched: Mariana Ohata to Mariana Ohata with id: m.05tg6b
matched: Mariano Zabaleta to Mariano Zabaleta with id: m.081b7r
matched: Marie-Reine Le Gougne to Marie-Reine Le Gougne with id: m.025xvg2
matched: Marilyn Monroe to Marilyn Monroe with id: m.04wqr
matched: Marina Anissina to Marina Anissina with id: m.092sw2
matched: Marina Canetti to Marina Canetti with id: m.03ykpgg
matched: Marina Hands to Marina Hands with id: m.02rxd04
matched: Marina Kuptsova to Marina Kuptsova with id: m.09j3ql
matched: Marina Silva to Marina Silva with id: m.047dbx9
matched: Mario Austin to Mario Austin with id: m.0cm_lw
matched: Mario Cipollini to Mario Cipollini with id: m.02fwy3
matched: Mario Dominguez to Mario Dominguez with id: m.076334
matched: Mario Dumont to Mario Dumont with id: m.0230y5
matched: Mario Lemieux to Mario Lemieux with id: m.013339
matched: Mario Puzo to Mario

matched: Michael Boyce to Michael Boyce with id: m.02l9xz
matched: Michael Brandon to Michael Brandon with id: m.0ffl5q
matched: Michael Broad to Michael Broad with id: m.04lh0yb
matched: Michael Caine to Michael Caine with id: m.0h6485t
matched: Michael Capellas to Michael Capellas with id: m.06zfsk
matched: Michael Chertoff to Michael Chertoff with id: m.04yfpm
matched: Michael Chiklis to Michael Chiklis with id: m.03kt5c
matched: Michael Clarke Duncan to Michael Clarke Duncan with id: m.02lkcc
matched: Michael Dell to Michael Dell with id: m.013q3z
matched: Michael Diekmann to Michael Diekmann with id: m.04hdj5r
matched: Michael Doleac to Michael Doleac with id: m.05p2m2
matched: Michael Douglas to Michael Douglas with id: m.04g1950
matched: Michael Fitzgerald to Michael FitzGerald with id: m.0bx457v
matched: Michael Frayn to Michael Frayn with id: m.027d88
matched: Michael Friedman to Michael Friedman with id: m.03hn9_4
matched: Michael Hagee to Michael Hagee with id: m.0213zq
matc

matched: Nan Wang to Nan Wang with id: m.0_sknfh
matched: Nancy Kerrigan to Nancy Kerrigan with id: m.01x3mg
matched: Nancy Pelosi to Nancy Pelosi with id: m.012v1t
matched: Nancy Reagan to Nancy Reagan with id: m.059rv
matched: Nancy Sinatra to Nancy Sinatra with id: m.05ft3
matched: Nanni Moretti to Nanni Moretti with id: m.06dx0w
matched: Naomi Campbell to Naomi Campbell with id: m.01pcrw
matched: Naomi Watts to Naomi Watts with id: m.01xcfy
matched: Naoto Kan to Naoto Kan with id: m.01_x4x
matched: Narayan Singh Pun to Narayan Singh Pun with id: m.027sbq7
matched: Narendra Modi to Narendra Modi with id: m.0296q2
matched: Nasser al-Kidwa to Nasser al-Kidwa with id: m.03gxflt
matched: Nastassia Kinski to Nastassia Kinski with id: m.05hdf
matched: Nastia Liukin to Nastia Liukin with id: m.0bfy48
matched: Natalia Verbeke to Natalia Verbeke with id: m.03c16vy
matched: Natalie Cole to Natalie Cole with id: m.026spg
matched: Natalie Coughlin to Natalie Coughlin with id: m.03nw4q
matched: 

matched: Paul Crake to Paul Crake with id: m.027z30y
matched: Paul Desmarais to Paul Desmarais with id: m.01xkz2
matched: Paul Farley to Paul Farley with id: m.0fkq64
matched: Paul Gascoigne to Paul Gascoigne with id: m.025csj
matched: Paul Greengrass to Paul Greengrass with id: m.06cxyj
matched: Paul Henderson to Paul Henderson with id: m.08jc18
matched: Paul Kagame to Paul Kagame with id: m.02301x
matched: Paul Kariya to Paul Kariya with id: m.01kb6l
matched: Paul LeClerc to Paul LeClerc with id: m.0fqb2d
matched: Paul Lo Duca to Paul Lo Duca with id: m.0530sr
matched: Paul Lockhart to Paul Lockhart with id: m.02d1ks
matched: Paul Martin to Paul Martin with id: m.05w0sd6
matched: Paul McCartney to Paul McCartney with id: m.03j24kf
matched: Paul McNulty to Paul McNulty with id: m.08hy3y
matched: Paul Newman to Paul Newman with id: m.0d6d2
matched: Paul Otellini to Paul Otellini with id: m.05kx4n
matched: Paul Reiser to Paul Reiser with id: m.01y0y6
matched: Paul Sarbanes to Paul Sarba

matched: Ray Bradbury to Ray Bradbury with id: m.06jcc
matched: Ray Evernham to Ray Evernham with id: m.06_74n
matched: Ray Halbritter to Ray Halbritter with id: m.047mjzp
matched: Ray Liotta to Ray Liotta with id: m.02j490
matched: Ray Lucas to Ray Lucas with id: m.03ydd_3
matched: Ray Nagin to Ray Nagin with id: m.06jp4q
matched: Ray Price to Ray Price with id: m.0hzqg_d
matched: Ray Romano to Ray Romano with id: m.01h910
matched: Ray Sherman to Ray Sherman with id: m.05mznbs
matched: Ray Young to Ray Young with id: m.05mxz8z
matched: Raymond Odierno to Raymond Odierno with id: m.0268g3w
matched: Raza Rabbani to Raza Rabbani with id: m.03yj5dv
matched: Razali Ismail to Razali Ismail with id: m.044x82
matched: Red Auerbach to Red Auerbach with id: m.01t2df
matched: Reese Witherspoon to Reese Witherspoon with id: m.0n6f8
matched: Reggie Lewis to Reggie Lewis with id: m.03hk1tz
matched: Reggie Miller to Reggie Miller with id: m.02c5ls
matched: Reggie Sanders to Reggie Sanders with id: m

matched: Rohinton Mistry to Rohinton Mistry with id: m.0gr21
matched: Roland Koch to Roland Koch with id: m.0404bdc
matched: Rolandas Paksas to Rolandas Paksas with id: m.01xvhg
matched: Rolf Eckrodt to Rolf Eckrodt with id: m.0gjs5y
matched: Rollie Massimino to Rollie Massimino with id: m.05k185
matched: Romain Duris to Romain Duris with id: m.0615j_
matched: Roman Abramovich to Roman Abramovich with id: m.01zl71
matched: Roman Coppola to Roman Coppola with id: m.074tyf
matched: Roman Polanski to Roman Polanski with id: m.06b_0
matched: Roman Tam to Roman Tam with id: m.0vgmx
matched: Romano Prodi to Romano Prodi with id: m.01c3z5
matched: Romeo Gigli to Romeo Gigli with id: m.0dsbldr
matched: Ron Dittemore to Ron Dittemore with id: m.01jh6j
matched: Ron Gonzales to Ron Gonzales with id: m.05fvnl
matched: Ron Howard to Ron Howard with id: m.0g2lq
matched: Ron Kirk to Ron Kirk with id: m.02fhhn
matched: Ron Zook to Ron Zook with id: m.07072y
matched: Ronald Harwood to Ronald Harwood wi

matched: Sheila Taormina to Sheila Taormina with id: m.05tf7q
matched: Sheila Wellstone to Sheila Wellstone with id: m.04gkt6
matched: Sheldon Silver to Sheldon Silver with id: m.06dn4y
matched: Sherri Coale to Sherri Coale with id: m.0d5bxd
matched: Sheryl Crow to Sheryl Crow with id: m.06rgq
matched: Shi Guangsheng to Shi Guangsheng with id: m.04lf22t
matched: Shia LaBeouf to Shia LaBeouf with id: m.04w391
matched: Shigeo Nagashima to Shigeo Nagashima with id: m.01zv51
matched: Shigeru Ishiba to Shigeru Ishiba with id: m.03f5mf
matched: Shimon Peres to Shimon Peres with id: m.0c_8s
matched: Shingo Katayama to Shingo Katayama with id: m.06nxp4
matched: Shingo Suetsugu to Shingo Suetsugu with id: m.046x06
matched: Shinya Taniguchi to Shinya Taniguchi with id: m.0j_4brn
matched: Shinzo Abe to Shinzo Abe with id: m.07t7hy
matched: Shobha De to Shobha De with id: m.05hsn2
matched: Shoshana Johnson to Shoshana Johnson with id: m.05nltd
matched: Shoshannah Stern to Shoshannah Stern with id:

matched: Tang Jiaxuan to Tang Jiaxuan with id: m.05df3p
matched: Tara Kirk to Tara Kirk with id: m.0fvffw
matched: Tara Reid to Tara Reid with id: m.032wdd
matched: Tara VanDerveer to Tara VanDerveer with id: m.08q7kb
matched: Tassos Papadopoulos to Tassos Papadopoulos with id: m.022dpx
matched: Tatiana Panova to Tatiana Panova with id: m.02ryjj_
matched: Tatiana Shchegoleva to Tatiana Shchegoleva with id: m.04jbc_k
matched: Tatsuya Fuji to Tatsuya Fuji with id: m.05zlbz8
matched: Tatyana Tomashova to Tatyana Tomashova with id: m.09j36c
matched: Taufik Hidayat to Taufik Hidayat with id: m.03nrss
matched: Taylor Twellman to Taylor Twellman with id: m.03zyxy
matched: Tayshaun Prince to Tayshaun Prince with id: m.03qdw5
matched: Ted Christopher to Ted Christopher with id: m.03mdpqd
matched: Ted Maher to Ted Maher with id: m.04zzv3c
matched: Ted Nolan to Ted Nolan with id: m.02yjf8
matched: Ted Turner to Ted Turner with id: m.07hkd
matched: Ted Washington to Ted Washington with id: m.05ymz

matched: Trista Rehn to Trista Rehn with id: m.05dcbl
matched: Tristan Gretzky to Tristan Gretzky with id: m.0j2h3yn
matched: Troy Aikman to Troy Aikman with id: m.0dr5g9
matched: Troy Garity to Troy Garity with id: m.06zttt
matched: Troy Polamalu to Troy Polamalu with id: m.04m0nc
matched: Trudi Lacey to Trudi Lacey with id: m.0ds8xbl
matched: Tsutomu Takebe to Tsutomu Takebe with id: m.07w76b
matched: Tubby Smith to Tubby Smith with id: m.05hkc6
matched: Tung Chee-hwa to Tung Chee-Hwa with id: m.0mbs_
matched: Turner Gill to Turner Gill with id: m.08pg9y
matched: Turner Stevenson to Turner Stevenson with id: m.09xvn5
matched: Ty Votaw to Ty Votaw with id: m.0brtq2
matched: Tyler Hamilton to Tyler Hamilton with id: m.0fvcsw
matched: Tyra Banks to Tyra Banks with id: m.01jbx1
matched: Tzipora Obziler to Tzipora Obziler with id: m.08w04b
matched: Uday Hussein to Uday Hussein with id: m.01c3y8
matched: Uma Thurman to Uma Thurman with id: m.0kjrx
matched: Urmila Matondkar to Urmila Matond

In [51]:
# Report how many names ended up in lfw_name_matches_exact relative to the
# total number of entries in lfw_meta.
print(f'found {len(lfw_name_matches_exact)} of {len(lfw_meta)} names using exact matches')

found 3616 of 5749 names using exact matches


In [56]:
# Spot check: was this particular name resolved by the exact-match pass?
'Adrien Brody' in lfw_name_matches_exact

True

In [64]:
# Second ("clean") matching pass: for every LFW name that the exact pass did
# not resolve, scan every identity in identities_tmp and each of its
# per-language name variants with identity_utils.names_match_strict, and keep
# the first identity that matches.
# Result: lfw_name_matches_clean maps LFW name -> matched knowledge-graph id.
# NOTE(review): the recorded output of this cell contains questionable pairs
# (e.g. "Gary Sayler" -> "Sally Greene"), so these matches should be reviewed
# before being merged into the master list.
lfw_name_matches_clean = {}
for lfw_item in tqdm(lfw_meta):
  lfw_name = lfw_item['name']  # name is transformed original name
  if lfw_name in lfw_name_matches_exact:
    # already resolved by the exact-match pass
    continue
  print(f'clean matching: {lfw_name} ...')
  matched_id_kg = None
  for id_kg, identity in identities_tmp.items():
    # for each msceleb identity, look for match
    for lang, name in identity['names'].items():
      # for each name's language variation, look for match
      if identity_utils.names_match_strict(lfw_name, name):
        matched_id_kg = id_kg
        matched_lang = lang
        matched_name = name
        break
    if matched_id_kg:
      # first hit wins: stop scanning the remaining identities
      print(f'matched {lfw_name} to {matched_name} in {matched_lang}. Add to matched ids')
      lfw_name_matches_clean[lfw_name] = matched_id_kg
      break
  if not matched_id_kg:
    print(f'could not find: {lfw_name}')
# This summary covers the clean pass only (the exact pass has its own summary).
print(f'found {len(lfw_name_matches_clean)} of {len(lfw_meta)} names using clean matches')

HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))

clean matching: AJ Cook ...
matched AJ Cook to A. J. Cook in canonical. Add to matched ids
clean matching: AJ Lamas ...
matched AJ Lamas to A.J. Lamas in canonical. Add to matched ids
clean matching: Aaron Patterson ...
could not find: Aaron Patterson
clean matching: Aaron Pena ...
matched Aaron Pena to Aaron Peña in canonical. Add to matched ids
clean matching: Abdel Aziz Al-Hakim ...
could not find: Abdel Aziz Al-Hakim
clean matching: Abdel Madi Shabneh ...
could not find: Abdel Madi Shabneh
clean matching: Abdel Nasser Assidi ...
could not find: Abdel Nasser Assidi
clean matching: Abdul Majeed Shobokshi ...
could not find: Abdul Majeed Shobokshi
clean matching: Abdulaziz Kamilov ...
matched Abdulaziz Kamilov to Abdulaziz Komilov in canonical. Add to matched ids
clean matching: Abdullah Nasseef ...
could not find: Abdullah Nasseef
clean matching: Abdullah al-Attiyah ...
could not find: Abdullah al-Attiyah
clean matching: Abdullatif Sener ...
could not find: Abdullatif Sener
clean mat

could not find: Andy Perez
clean matching: Andy Wisecarver ...
could not find: Andy Wisecarver
clean matching: Anette Hosoi ...
could not find: Anette Hosoi
clean matching: Angel Lockward ...
could not find: Angel Lockward
clean matching: Angel Maza ...
could not find: Angel Maza
clean matching: Angela Alvarado Rosa ...
could not find: Angela Alvarado Rosa
clean matching: Angela Mascia-Frye ...
could not find: Angela Mascia-Frye
clean matching: Angelica Romero ...
could not find: Angelica Romero
clean matching: Angelo Genova ...
could not find: Angelo Genova
clean matching: Angelo Reyes ...
could not find: Angelo Reyes
clean matching: Angie Arzola ...
could not find: Angie Arzola
clean matching: Anibal Ibarra ...
could not find: Anibal Ibarra
clean matching: Anil Ramsook ...
could not find: Anil Ramsook
clean matching: Anja Paerson ...
could not find: Anja Paerson
clean matching: Anjum Hussain ...
could not find: Anjum Hussain
clean matching: Ann Godbehere ...
could not find: Ann Godbe

could not find: Billy Rork
clean matching: Billy Sollie ...
could not find: Billy Sollie
clean matching: Billy Tibbets ...
could not find: Billy Tibbets
clean matching: Bob Cantrell ...
could not find: Bob Cantrell
clean matching: Bob Colvin ...
could not find: Bob Colvin
clean matching: Bob Crippen ...
could not find: Bob Crippen
clean matching: Bob Curtis ...
could not find: Bob Curtis
clean matching: Bob Eskridge ...
could not find: Bob Eskridge
clean matching: Bob Goldman ...
could not find: Bob Goldman
clean matching: Bob Hartley ...
could not find: Bob Hartley
clean matching: Bob Herz ...
could not find: Bob Herz
clean matching: Bob Melvin ...
could not find: Bob Melvin
clean matching: Bob Petrino ...
matched Bob Petrino to Bob Petrie in canonical. Add to matched ids
clean matching: Bob Riley ...
could not find: Bob Riley
clean matching: Bob Sulkin ...
could not find: Bob Sulkin
clean matching: Bob Wright ...
could not find: Bob Wright
clean matching: Bobby Goldwater ...
could no

could not find: Charles Holzner
clean matching: Charles Ingram ...
could not find: Charles Ingram
clean matching: Charles Kartman ...
matched Charles Kartman to Chester Kallman in canonical. Add to matched ids
clean matching: Charles Lebois ...
could not find: Charles Lebois
clean matching: Charles Mathews ...
matched Charles Mathews to Matthew Charles in canonical. Add to matched ids
clean matching: Charles Pickering ...
could not find: Charles Pickering
clean matching: Charles Pouty ...
could not find: Charles Pouty
clean matching: Charles Tannok ...
could not find: Charles Tannok
clean matching: Charley Armey ...
matched Charley Armey to Aryeh Carmell in canonical. Add to matched ids
clean matching: Charlie Deane ...
matched Charlie Deane to Charlie Deal in canonical. Add to matched ids
clean matching: Charlotte Chambers ...
could not find: Charlotte Chambers
clean matching: Chawki Armali ...
could not find: Chawki Armali
clean matching: Chea Sophara ...
matched Chea Sophara to Sara

could not find: Daniela Hantuchova
clean matching: Daniele Bergamin ...
could not find: Daniele Bergamin
clean matching: Daniele Hypolito ...
could not find: Daniele Hypolito
clean matching: Daniell Sunjata ...
could not find: Daniell Sunjata
clean matching: Danis Tanovic ...
could not find: Danis Tanovic
clean matching: Danny Avalon ...
could not find: Danny Avalon
clean matching: Danny Morgan ...
could not find: Danny Morgan
clean matching: Dario Camuffo ...
could not find: Dario Camuffo
clean matching: Darko Milicic ...
could not find: Darko Milicic
clean matching: Darlene Garrettson ...
could not find: Darlene Garrettson
clean matching: Darren Campel ...
could not find: Darren Campel
clean matching: Daryl Parks ...
could not find: Daryl Parks
clean matching: Dave Johnson ...
could not find: Dave Johnson
clean matching: Dave Lewis ...
could not find: Dave Lewis
clean matching: Dave McNealey ...
could not find: Dave McNealey
clean matching: Dave Potter ...
matched Dave Potter to Dave

could not find: Edward Arsenault
clean matching: Edward Belvin ...
could not find: Edward Belvin
clean matching: Edward Burns ...
could not find: Edward Burns
clean matching: Edward Lohn ...
could not find: Edward Lohn
clean matching: Edwin Edwards ...
could not find: Edwin Edwards
clean matching: Efrain Rios Montt ...
could not find: Efrain Rios Montt
clean matching: Eileen Spina ...
could not find: Eileen Spina
clean matching: Einars Repse ...
could not find: Einars Repse
clean matching: Ekaterina Dmitriev ...
could not find: Ekaterina Dmitriev
clean matching: Ekke Hard Forberg ...
could not find: Ekke Hard Forberg
clean matching: Eladio Larez ...
could not find: Eladio Larez
clean matching: Elena Bereznaya ...
could not find: Elena Bereznaya
clean matching: Elena Tihomirova ...
could not find: Elena Tihomirova
clean matching: Elena de Chavez ...
could not find: Elena de Chavez
clean matching: Eli Stutsman ...
could not find: Eli Stutsman
clean matching: Elias Attallah ...
could not 

could not find: Garry Witherall
clean matching: Gary Bergeron ...
matched Gary Bergeron to Bryan Gregory in canonical. Add to matched ids
clean matching: Gary Dellaverson ...
could not find: Gary Dellaverson
clean matching: Gary Forsee ...
could not find: Gary Forsee
clean matching: Gary Gitnick ...
could not find: Gary Gitnick
clean matching: Gary Leon Ridgway ...
could not find: Gary Leon Ridgway
clean matching: Gary Marshall ...
could not find: Gary Marshall
clean matching: Gary Paer ...
matched Gary Paer to Garyn Preen in canonical. Add to matched ids
clean matching: Gary Sayler ...
matched Gary Sayler to Sally Greene in canonical. Add to matched ids
clean matching: Gary Williams ...
could not find: Gary Williams
clean matching: Gaston Gaudio ...
could not find: Gaston Gaudio
clean matching: Gavyn Arthur ...
could not find: Gavyn Arthur
clean matching: Gen Meredith ...
could not find: Gen Meredith
clean matching: Gene Orza ...
could not find: Gene Orza
clean matching: Geoff Dixon .

could not find: Hestrie Cloette
clean matching: Hichiro Naemura ...
could not find: Hichiro Naemura
clean matching: Hideki Sato ...
could not find: Hideki Sato
clean matching: Hikmat al-Azzawi ...
could not find: Hikmat al-Azzawi
clean matching: Hilda Fortune ...
could not find: Hilda Fortune
clean matching: Hilmi Akin Zorlu ...
could not find: Hilmi Akin Zorlu
clean matching: Hilmi Ozkok ...
could not find: Hilmi Ozkok
clean matching: Himmler Rebu ...
could not find: Himmler Rebu
clean matching: Hipolito Mejia ...
could not find: Hipolito Mejia
clean matching: Hiroki Gomi ...
could not find: Hiroki Gomi
clean matching: Hisham Halawi ...
could not find: Hisham Halawi
clean matching: Hitoshi Oshitani ...
could not find: Hitoshi Oshitani
clean matching: Hitoshi Tanaka ...
could not find: Hitoshi Tanaka
clean matching: Hoda Asfor ...
could not find: Hoda Asfor
clean matching: Hootie Johnson ...
could not find: Hootie Johnson
clean matching: Horace Donovan Reid ...
could not find: Horace D

could not find: Janez Drnovsek
clean matching: Janica Kostelic ...
matched Janica Kostelic to Janica Kosteliča in lv. Add to matched ids
clean matching: Janice Abreu ...
could not find: Janice Abreu
clean matching: Janis Ruth Coulter ...
could not find: Janis Ruth Coulter
clean matching: Jaqueline Godoy ...
could not find: Jaqueline Godoy
clean matching: Jaromir Jagr ...
could not find: Jaromir Jagr
clean matching: Jason Alexander ...
matched Jason Alexander to Alexander Jones in canonical. Add to matched ids
clean matching: Jason Campbell ...
could not find: Jason Campbell
clean matching: Jason Gardner ...
matched Jason Gardner to Gerard Jones in canonical. Add to matched ids
clean matching: Jason Sorens ...
matched Jason Sorens to José Soares in pt. Add to matched ids
clean matching: Javier Camara ...
could not find: Javier Camara
clean matching: Jawad Boulus ...
could not find: Jawad Boulus
clean matching: Jayne Yarris ...
could not find: Jayne Yarris
clean matching: Jean-Claude Bra

could not find: John Philip Elkann
clean matching: John Reilly ...
matched John Reilly to John J. Riley in canonical. Add to matched ids
clean matching: John Robbins ...
matched John Robbins to John Robinson in de. Add to matched ids
clean matching: John Sidgmore ...
could not find: John Sidgmore
clean matching: John Sununu ...
could not find: John Sununu
clean matching: John Velazquez ...
could not find: John Velazquez
clean matching: Johnny Htu ...
matched Johnny Htu to Johnny Hunt in canonical. Add to matched ids
clean matching: Johnson Panjaitan ...
could not find: Johnson Panjaitan
clean matching: Jolanta Kwasniewski ...
could not find: Jolanta Kwasniewski
clean matching: Jon Constance ...
could not find: Jon Constance
clean matching: Jonathan Arden ...
could not find: Jonathan Arden
clean matching: Jonathan Edwards ...
matched Jonathan Edwards to Edward E. Johnston in canonical. Add to matched ids
clean matching: Jonathan Fine ...
could not find: Jonathan Fine
clean matching: Jon

could not find: Keith Rodriguez
clean matching: Kellie Greene ...
could not find: Kellie Greene
clean matching: Kelly Leigh ...
could not find: Kelly Leigh
clean matching: Kemal Dervis ...
could not find: Kemal Dervis
clean matching: Ken Balk ...
could not find: Ken Balk
clean matching: Ken Watanabe ...
could not find: Ken Watanabe
clean matching: Ken Wharfe ...
could not find: Ken Wharfe
clean matching: Kenneth Brill ...
could not find: Kenneth Brill
clean matching: Kenneth Dam ...
could not find: Kenneth Dam
clean matching: Kenneth Evans ...
could not find: Kenneth Evans
clean matching: Kenneth Reichert ...
could not find: Kenneth Reichert
clean matching: Kenny Brack ...
could not find: Kenny Brack
clean matching: Kent Robinson ...
matched Kent Robinson to Kent Robbins in canonical. Add to matched ids
clean matching: Kevin Crane ...
could not find: Kevin Crane
clean matching: Kevin Keegan ...
could not find: Kevin Keegan
clean matching: Kevin Satterfield ...
could not find: Kevin Sat

could not find: Lord Hutton
clean matching: Loretta Lynn Harper ...
could not find: Loretta Lynn Harper
clean matching: Louisa Baileche ...
could not find: Louisa Baileche
clean matching: Lubomir Zaoralek ...
could not find: Lubomir Zaoralek
clean matching: Lucas Wysocki ...
could not find: Lucas Wysocki
clean matching: Lucia Kenny Anthony ...
could not find: Lucia Kenny Anthony
clean matching: Luciano Bovicelli ...
could not find: Luciano Bovicelli
clean matching: Lucie Lapovsky ...
could not find: Lucie Lapovsky
clean matching: Lucio Gutierrez ...
could not find: Lucio Gutierrez
clean matching: Lucio Stanca ...
could not find: Lucio Stanca
clean matching: Lucrecia Orozco ...
could not find: Lucrecia Orozco
clean matching: Ludwig Ovalle ...
could not find: Ludwig Ovalle
clean matching: Luis Berrondo ...
matched Luis Berrondo to Louise Borden in canonical. Add to matched ids
clean matching: Luis Ernesto Derbez Bautista ...
could not find: Luis Ernesto Derbez Bautista
clean matching: Lu

could not find: Martin Burnham
clean matching: Martin Gecht ...
matched Martin Gecht to Martin Grech in canonical. Add to matched ids
clean matching: Martin Hoellwarth ...
could not find: Martin Hoellwarth
clean matching: Martin Howard ...
could not find: Martin Howard
clean matching: Martin Kristof ...
could not find: Martin Kristof
clean matching: Martin ONeill ...
matched Martin ONeill to Oriel Malet in canonical. Add to matched ids
clean matching: Martin Rodriguez ...
could not find: Martin Rodriguez
clean matching: Martin Torrijos ...
could not find: Martin Torrijos
clean matching: Marwan Barghouthi ...
could not find: Marwan Barghouthi
clean matching: Marwan Muasher ...
could not find: Marwan Muasher
clean matching: Mary Anne Souza ...
could not find: Mary Anne Souza
clean matching: Mary Blige ...
could not find: Mary Blige
clean matching: Mary Catherine Correll ...
could not find: Mary Catherine Correll
clean matching: Mary Descenza ...
could not find: Mary Descenza
clean matchi

could not find: Mireya Elisa Moscoso Rodriguez
clean matching: Misty Dawn Clymer ...
could not find: Misty Dawn Clymer
clean matching: Mitar Rasevic ...
could not find: Mitar Rasevic
clean matching: Mitchell Crooks ...
could not find: Mitchell Crooks
clean matching: Mitchell Daniels ...
matched Mitchell Daniels to Michel Decastel in canonical. Add to matched ids
clean matching: Mitchell Garabedian ...
could not find: Mitchell Garabedian
clean matching: Mitchell McLaughlin ...
could not find: Mitchell McLaughlin
clean matching: Mitchell Potter ...
matched Mitchell Potter to Pierre Michelot in canonical. Add to matched ids
clean matching: Mitchell Swartz ...
could not find: Mitchell Swartz
clean matching: Mitsou Gelinas ...
could not find: Mitsou Gelinas
clean matching: Mladen Naletilic ...
could not find: Mladen Naletilic
clean matching: Mo Elleithee ...
could not find: Mo Elleithee
clean matching: Mohamed Hammam ...
matched Mohamed Hammam to Hamada Mohamed in no. Add to matched ids
cle

could not find: Odilia Collazo
clean matching: Olesya Bonabarenko ...
could not find: Olesya Bonabarenko
clean matching: Oliver Phelps ...
could not find: Oliver Phelps
clean matching: Olivera Labus ...
could not find: Olivera Labus
clean matching: Omar Khan Sharif ...
could not find: Omar Khan Sharif
clean matching: Omar el-Heib ...
could not find: Omar el-Heib
clean matching: Ontario Lett ...
could not find: Ontario Lett
clean matching: Oracene Williams ...
could not find: Oracene Williams
clean matching: Osama Al Baz ...
could not find: Osama Al Baz
clean matching: Oscar Bolanos ...
could not find: Oscar Bolanos
clean matching: Oscar DLeon ...
could not find: Oscar DLeon
clean matching: Oscar Elias Biscet ...
could not find: Oscar Elias Biscet
clean matching: Osmond Smith ...
could not find: Osmond Smith
clean matching: Osrat Iosef ...
could not find: Osrat Iosef
clean matching: Oswald Gruebel ...
could not find: Oswald Gruebel
clean matching: Oswaldo Paya ...
could not find: Oswald

could not find: Raaf Schefter
clean matching: Raag Singhal ...
could not find: Raag Singhal
clean matching: Rachel Leigh Cook ...
could not find: Rachel Leigh Cook
clean matching: Rachel Wadsworth ...
could not find: Rachel Wadsworth
clean matching: Rachel Wheatley ...
could not find: Rachel Wheatley
clean matching: Radovan Karadzic ...
could not find: Radovan Karadzic
clean matching: Rafael Ramirez ...
could not find: Rafael Ramirez
clean matching: Rafael Vinoly ...
matched Rafael Vinoly to Rafael Viñoly in canonical. Add to matched ids
clean matching: Raghad Saddam Hussein ...
could not find: Raghad Saddam Hussein
clean matching: Rainer Geulen ...
could not find: Rainer Geulen
clean matching: Rainer Gut ...
could not find: Rainer Gut
clean matching: Rainer Schuettler ...
could not find: Rainer Schuettler
clean matching: Raja Ibrahim ...
could not find: Raja Ibrahim
clean matching: Raja Qureshi ...
could not find: Raja Qureshi
clean matching: Raja Ramani ...
could not find: Raja Raman

could not find: Ronaldo Luis Nazario de Lima
clean matching: Rosa Haywa de Condori ...
could not find: Rosa Haywa de Condori
clean matching: Rosalie Perkov ...
could not find: Rosalie Perkov
clean matching: Rosalyn Carter ...
matched Rosalyn Carter to Rosalynn Carter in canonical. Add to matched ids
clean matching: Rose Linkins ...
matched Rose Linkins to Rose Likins in it. Add to matched ids
clean matching: Rosny Desroches ...
could not find: Rosny Desroches
clean matching: Roy Moore ...
could not find: Roy Moore
clean matching: Ruano Pascual ...
could not find: Ruano Pascual
clean matching: Ruben Sierra ...
could not find: Ruben Sierra
clean matching: Ruben Wolkowyski ...
could not find: Ruben Wolkowyski
clean matching: Rubens Barrichello ...
could not find: Rubens Barrichello
clean matching: Rudi Voeller ...
could not find: Rudi Voeller
clean matching: Rudolph Holton ...
could not find: Rudolph Holton
clean matching: Rustu Recber ...
could not find: Rustu Recber
clean matching: Ruth

could not find: Stephen Ebberharter
clean matching: Stephen Glassroth ...
could not find: Stephen Glassroth
clean matching: Stephen Keener ...
could not find: Stephen Keener
clean matching: Stephen Oake ...
could not find: Stephen Oake
clean matching: Stephen Push ...
could not find: Stephen Push
clean matching: Stephen Swindal ...
could not find: Stephen Swindal
clean matching: Steve Allan ...
could not find: Steve Allan
clean matching: Steve Blankenship ...
could not find: Steve Blankenship
clean matching: Steve Coterill ...
could not find: Steve Coterill
clean matching: Steve Fehr ...
could not find: Steve Fehr
clean matching: Steve Lenard ...
could not find: Steve Lenard
clean matching: Steve Nash ...
could not find: Steve Nash
clean matching: Steve Nesbitt ...
matched Steve Nesbitt to Steve Nisbett in canonical. Add to matched ids
clean matching: Steve Pagliuca ...
could not find: Steve Pagliuca
clean matching: Steve Patterson ...
matched Steve Patterson to Steven Patterson in can

could not find: Tommy Tubberville
clean matching: Tono Suratman ...
could not find: Tono Suratman
clean matching: Tony Bennett ...
could not find: Tony Bennett
clean matching: Tony Cummo ...
could not find: Tony Cummo
clean matching: Tony LaRussa ...
matched Tony LaRussa to Tony La Russa in canonical. Add to matched ids
clean matching: Tonya Payne ...
could not find: Tonya Payne
clean matching: Tora Takagi ...
could not find: Tora Takagi
clean matching: Toshi Izawa ...
could not find: Toshi Izawa
clean matching: Tracee Treadwell ...
could not find: Tracee Treadwell
clean matching: Tracy Wyle ...
could not find: Tracy Wyle
clean matching: Travis Rudolph ...
could not find: Travis Rudolph
clean matching: Trevor Watson ...
matched Trevor Watson to Trevor Watts in canonical. Add to matched ids
clean matching: Trisha Meili ...
could not find: Trisha Meili
clean matching: Troy Hudson ...
could not find: Troy Hudson
clean matching: Troy Jenkins ...
could not find: Troy Jenkins
clean matching:

In [141]:
# Compare the LFW names to the master identity list: for every LFW entry,
# scan every identity in identities_tmp and each of its per-language name
# variants, and report the first strict match (or an error if none is found).
# The cell only prints its findings; it does not store the matches.
for lfw_item in tqdm(lfw_meta, desc='1st loop'):

  # for each LFW name, look for match
  lfw_name = lfw_item['name']
  matched_id = None

  for id_kg, identity in identities_tmp.items():
    # for each msceleb identity, look for match
    for lang, name in identity['names'].items():
      # for each name's language variation, look for match
      if len(name) == 0:
        # empty variant: nothing to compare against
        print('no name')
        continue
      if identity_utils.names_match_strict(lfw_name, name):
        matched_id = id_kg
        matched_lang = lang
        matched_name = name
        break
    if matched_id:
      # first hit wins: stop scanning the remaining identities.
      # (No inner progress bar exists in this cell, so there is nothing to
      # clear/close here; the outer tqdm bar finishes on its own.)
      print(f'OK. Found match: {lfw_name} == {matched_name} in lang: {matched_lang}')
      break
  if not matched_id:
    print(f'ERROR: could not find {lfw_name}')
      

HBox(children=(IntProgress(value=0, description='1st loop', max=5749, style=ProgressStyle(description_width='i…

Found: Aaron Eckhart@ca
Found: Aaron Guiel@en
Found: Aaron Peirsol@ca
Found: Aaron Sorkin@ca
Found: Aaron Tippin@de
Found: Abba Eban@cs
Found: Abbas Kiarostami@ca
Found: Abdoulaye Wade@ca
Found: Abdul Rahman Lestaluhu@id
Found: Abdullah Cabir@tr
Found: Abdullah Ahmad Badawi@da
Found: Abdullah Gulam Rasoul@en
Found: Abel Aguilar@cs
Found: Abel Pacheco de la Espriella@es
Found: Abid Hamid Mahmud al-Tikriti@nl
Found: Abraham Foxman@cs
Found: Adam Ant@cs
Found: Adam Freier@en
Found: Adam Herbert@en
Found: Adam Mair@de
Found: Adam Richards@en
Found: Adam Sandler@ca
Found: George Adam Scott@en
Found: Adel Al-Jubeir@fr
Found: Adolfo Rodriguez Saa@id
Found: Adrian McPherson@en
Found: Adrian Murrell@en
Found: Adriana Lima@ca
Found: Adrien Brody@ca
Found: Afton Smith@cs
Found: Agbani Darego@de
Found: Agnelo Queiroz@en
Found: Agnes Bruckner@de
Found: Ahmed Ahmedou@de
Found: Ahmed Chalabi@en
Found: Mahmood Ahmed Ghazi@en
Found: Ahmet Necdet Sezer@ca
Found: Ai Sugiyama@da
Found: Aidan Quinn@ca
Foun

Found: Bill Callahan@en
Found: Bill Cartwright@en
Found: Bill Clancy@en
Found: Bill Clinton@ca
Found: Bill Curry@en
Found: Bill Doba@en
Found: Bill Elliott@pt
Found: Bill Fennelly@en
Found: Bill Frist@de
Found: Bill Gates@ca
Found: Bill Grahame@en
Found: Bill Guerin@de
Found: Bill Herrion@en
Found: Bill Hughes@en
Found: Bill Kollar@en
Found: Bill Kong@es
Found: Bill Mauldin@de
Found: Bill McBride@en
Found: Bill Nelson@da
Found: Bill Parcells@de
Found: Bill Parsons@en
Found: Bill Paxton@ca
Found: Bill Self@de
Found: Bill Sizemore@en
Found: Bill Stapleton@en
Found: Bill Steinke@en
Found: Bill Walton@de
Found: Billy Andrade@da
Found: Billy Beane@de
Found: Billy Bob Thornton@ca
Found: Billy Boyd@en
Found: Billy Crawford@de
Found: Billy Crystal@ca
Found: Billy Donovan@en
Found: Billy Gilman@en
Found: Billy Joel@ca
Found: Bing Crosby@ca
Found: Binyamin Ben-Eliezer@en
Found: Bison Dele@de
Found: Bixente Lizarazu@ca
Found: Blas Ople@de
Found: Blythe Danner@ca
Found: Blythe Hartley@de
Found: Bo

KeyboardInterrupt: 

In [103]:
# Sanity check of the strict matcher on two punctuation variants of one name
# (the recorded output for this cell is True).
identity_utils.names_match_strict('AJ Cook', 'A.J. Cook')

True

In [105]:
# Sanity check of the float-scored matcher on a reordered, differently cased
# spelling of the same name (the recorded output for this cell is 1.0).
# NOTE(review): names_match is called unqualified here, while the strict
# matcher is reached via identity_utils elsewhere in this notebook; presumably
# names_match was defined/imported in an earlier cell -- confirm it still
# resolves on a fresh kernel.
names_match('A.J. Cook', 'cook Aj', as_float=True, compound_score=True)

1.0

## PubFig

In [None]:
# add pubfig data

## Face Scrub

In [None]:
# add facescrub

## UMD Faces

In [None]:
# add umd

## CASIA Webface

In [None]:
# add CASIA Webface

## IMDB-Wiki

In [None]:
# add imdb-wiki

## IMDB-Face

In [None]:
# add imdb face