# Identity Master List

- [x] MS Celeb 1M
- [ ] UMD Faces
- [ ] FaceScrub
- [ ] LFW
- [ ] PubFig
- [ ] PubFig83
- [ ] VGG Face
- [ ] VGG Face2
- [ ] IJB-C
- [ ] CASIA Webface
- [ ] IMDB-Face
- [ ] IMDB-Wiki

In [156]:
%reload_ext autoreload
%autoreload 2

import os
from os.path import join
from glob import glob
from pathlib import Path
import requests
import json
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
import urllib.request
import difflib
import unidecode

import slugify
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import sys
sys.path.append('/work/megapixels_dev/megapixels')
from app.utils import api_utils, identity_utils
from app.settings import app_cfg
from app.settings import types

## MS Celeb Top 1M

- add a column for each spelling (language variation) of a name
- convert the knowledge graph id (`m.xxx`) to the standard Google format (`/m/xxx`)

In [157]:
# MS-Celeb download folder and the cleaned image list stored inside it,
# followed by the CSV that receives the merged identity list.
dir_msceleb_dloads = '/data_store_hdd/datasets/people/msceleb/downloads/'
fp_msceleb_clean_txt = join(dir_msceleb_dloads, 'MS-Celeb-1M_clean_list.txt')
fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master_02.csv'

In [158]:
# Load the Top-1M name list: tab-separated, no header row, one
# (knowledge-graph id, "name@lang") pair per line.
fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'
df_msceleb_top1m = pd.read_csv(
  fp_msceleb_top1m,
  sep='\t',
  header=None,
  names=['id_kg', 'name_lang'],
  encoding='utf-8',
)
# group the rows by id so the number of distinct identities can be read off
df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')
n_groups = df_msceleb_top1m_groups.ngroups

In [200]:
# Bucket every name row by its lower-cased first letter (a-z only) so later
# lookups can scan a single bucket instead of every row.
a2z = 'abcdefghijklmnopqrstuvwxyz'
msceleb_top1m_az = {c: [] for c in a2z}
n_skipped = 0
for msceleb_row in tqdm(df_msceleb_top1m.itertuples(), total=len(df_msceleb_top1m)):
  name = msceleb_row.name_lang
  # a non-string value (e.g. a missing field) or an empty string has no first letter
  first_letter = name[0].lower() if isinstance(name, str) and name else None
  bucket = msceleb_top1m_az.get(first_letter)
  if bucket is None:
    # no bucket for this first character (anything outside a-z): count the
    # row as skipped instead of swallowing an exception silently
    n_skipped += 1
    continue
  bucket.append({'name': name, 'id_kg': msceleb_row.id_kg})
print(f'skipped {n_skipped:,} names that do not start with a-z')

HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))

In [159]:
# preview the first rows of the raw (id_kg, name_lang) table
df_msceleb_top1m.head()

Unnamed: 0,id_kg,name_lang
0,m.01008l47,Patrick Cummins@en
1,m.01008l47,Patrick Cummins@pt
2,m.01008l96,Mohamed Guessous@en
3,m.01008l96,Mohamed Guessous@fr
4,m.01008l96,محمد جسوس@ar


In [160]:
# summary: total name rows vs. number of distinct knowledge-graph ids
print(
  f'There are {len(df_msceleb_top1m):,} total name variations',
  f'There are {n_groups:,} unique identities',
  sep='\n',
)

There are 3,481,186 total name variations
There are 1,000,000 unique identities


In [161]:
# turn the table into a list with one {'id_kg': ..., 'name_lang': ...} dict per row
mseleb_top1m_records = df_msceleb_top1m.to_dict(orient='records')

In [162]:
# accumulator for all identity info; converted to a DataFrame further down
msceleb_identities = dict()

In [163]:
# utility functions
def split_name_lang(name_lang):
  '''Separate a "name@lang" string at its last "@".

  Returns a dict with the keys 'name' and 'lang'. When the input holds no "@"
  the whole string is the name and 'lang' is an empty string.
  '''
  if '@' not in name_lang:
    return {'name': name_lang, 'lang': ''}
  # rpartition splits on the right-most "@", so earlier "@" stay in the name
  name, _, lang = name_lang.rpartition('@')
  return {'name': name, 'lang': lang}

# temp save DataFrame to CSV
def save_identity_master(identities, fp_out=fp_master_identities):
  '''Write the identity records to a CSV checkpoint.

  :param identities: data accepted by pd.DataFrame.from_dict
  :param fp_out: destination CSV path; defaults to fp_master_identities
  '''
  df_identities_master = pd.DataFrame.from_dict(identities)
  df_identities_master.index.name = 'id'
  # honour the caller-supplied path rather than always using the module-level default
  df_identities_master.to_csv(fp_out)

In [164]:
# build {id_kg: {'names': {lang: name}}} from the flat "name@lang" records
msceleb_identities = {}
for record in tqdm(mseleb_top1m_records):
  # 'm.xxxx' -> '/m/xxxx'
  id_kg = record['id_kg'].replace('m.','/m/')
  names = msceleb_identities.setdefault(id_kg, {'names': {}})['names']
  parts = split_name_lang(record['name_lang'])
  name, lang = parts['name'], parts['lang']
  if lang == 'en':
    # the english spelling is also stored as the canonical name
    names['canonical'] = name
  names[lang] = name

HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))

### Patch @en names

In [165]:
# report every identity that has no (non-empty) "en" spelling
for id_kg, attrs in tqdm(msceleb_identities.items()):
  name_en = attrs['names'].get('en')
  if name_en:
    continue
  print(f'no english name for {id_kg}')

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

no english name for /m/017vbn
no english name for /m/026q0k_
no english name for /m/02k2kw
no english name for /m/0bwhrg1


In [166]:
# Four identities have no "en" spelling: reuse one of their other language
# spellings as both the "en" name and the canonical name.
en_exceptions = {
  '/m/017vbn': 'de',
  '/m/026q0k_': 'nl',
  '/m/02k2kw': 'de',
  '/m/0bwhrg1': 'it'
}
for id_kg, lang in en_exceptions.items():
  names = msceleb_identities[id_kg]['names']
  fallback_name = names[lang]
  names['en'] = fallback_name
  names['canonical'] = fallback_name
  print(f'patched {id_kg} {lang} to en')

patched /m/017vbn de to en
patched /m/026q0k_ nl to en
patched /m/02k2kw de to en
patched /m/0bwhrg1 it to en


### Remove duplicate names

In [167]:
# de-duplicate names that use same spelling for multiple languages
n_removed = 0
for id_kg, attrs in tqdm(msceleb_identities.items()):
  names = attrs['names']
  name_main = names.get('canonical', None)
  # validate this identity's own canonical name (set from its "en" spelling),
  # not a variable left over from another cell
  if not name_main:
    print('error. all names need "en"')
    break
  # iterate over a snapshot so entries can be removed from the dict while looping;
  # only the inner dict is mutated, so the outer dict needs no copy
  for lang, name in list(names.items()):
    if name == name_main and lang not in ('en', 'canonical'):
      names.pop(lang)
      n_removed += 1

print(f'removed {n_removed:,} duplicate names')

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

removed 1,485,336 duplicate names


### Count images per person for ms celeb

In [168]:
# calculate total images per id
msceleb_files = {}
# load text file
with open(fp_msceleb_clean_txt,'r') as fp:
  msceleb_lines = fp.readlines()

# iterate lines ("<id>/<filename>") and collect the filenames per id
for filepath in tqdm(msceleb_lines):
  # drop the trailing newline so it does not end up in the stored filename
  filepath = filepath.strip()
  if not filepath:
    # a blank line would otherwise fail the two-way unpack below
    continue
  id_kg, fname = filepath.split('/')
  # 'm.xxxx' -> '/m/xxxx', the same key format used for the identities
  id_kg = id_kg.replace('m.', '/m/')
  msceleb_files.setdefault(id_kg, []).append(fname)

HBox(children=(IntProgress(value=0, max=5049824), HTML(value='')))

In [171]:
# attach the image count to each identity (0 when the id has no listed files)
for id_kg, attrs in tqdm(msceleb_identities.items()):
  attrs['count_msceleb'] = len(msceleb_files.get(id_kg, []))

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

In [172]:
# parallel lists in dict order: image count per identity, and its id
im_counts_idxs = [attrs['count_msceleb'] for attrs in msceleb_identities.values()]
im_counts_id_kg = list(msceleb_identities.keys())

In [173]:
# identity with the largest image count
idx_max = np.argmax(im_counts_idxs)
count_max = im_counts_idxs[idx_max]
id_kg_max = im_counts_id_kg[idx_max]
name_max = msceleb_identities[id_kg_max]['names']['canonical']
print(f'Most images {count_max:,} for {name_max}')

# switch to an array so the counts can be filtered with boolean masks
im_counts_idxs = np.array(im_counts_idxs)
print(f'{len(im_counts_idxs[im_counts_idxs > 10]):,} more than 10')
print(f'{len(im_counts_idxs[im_counts_idxs > 20]):,} more than 20')
print(f'{len(im_counts_idxs[im_counts_idxs > 50]):,} more than 50')
print(f'{len(im_counts_idxs[im_counts_idxs > 100]):,} more than 100')

Most images 130 for Leelee Sobieski
88,244 more than 10
78,027 more than 20
49,042 more than 50
5,025 more than 100


In [174]:
# flatten each identity into one row dict: id, one column per name spelling, image count
identities_flat = []
for id_kg, attrs in tqdm(msceleb_identities.items()):
  row = {'id_kg': id_kg}
  for lang, name in attrs['names'].items():
    # the canonical name gets the bare column; every language gets a suffixed one
    col_name = 'name_msceleb' if lang == 'canonical' else f'name_msceleb_{lang}'
    row[col_name] = name
  row['count_msceleb'] = attrs['count_msceleb']
  identities_flat.append(row)

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

In [175]:
# one DataFrame row per flattened identity dict
df_identities = pd.DataFrame(identities_flat)

In [176]:
# save checkpoint CSV (no fp_out is passed, so the function's default path is used)
save_identity_master(identities_flat)  # encoding='utf-16' ??

In [177]:
# carry the MS-Celeb identities forward as the master dict (shallow copy),
# then drop the old name
identities = dict(msceleb_identities)
del msceleb_identities

## LFW

In [193]:
# add LFW data: each line is "<Name_With_Underscores>\t<count>"
fp_lfw = '/data_store_hdd/datasets/people/lfw/downloads/lfw-names.txt'
with open(fp_lfw,'r') as fp:
  lfw_lines = [line.strip() for line in fp]

lfw_meta = []
for lfw_line in lfw_lines:
  if not lfw_line:
    # a blank line (e.g. at the end of the file) would fail the unpack below
    continue
  name_orig, count = lfw_line.split('\t')
  lfw_meta.append({
    'name_orig': name_orig,
    # underscores in the original name become spaces
    'name': name_orig.replace('_',' '),
    # store the count as a number rather than the raw text field
    'count': int(count),
  })

In [179]:
# shallow working copy of the master identities for the matching cells below
identities_tmp = dict(identities)

In [None]:
# Per-word variant of the exact matcher: for each word of an LFW name, scan the
# a-z bucket of that word's first letter and keep the first entry whose name
# contains the full LFW name (case-insensitive substring test).
lfw_name_matches_tmp = {}
for lfw_item in tqdm(lfw_meta):
  lfw_name = lfw_item['name']  # name is transformed original name
  lfwnl = lfw_name.lower()
  for word in lfw_name.split(' '):
    if not word:
      # consecutive spaces produce empty words, which have no first letter
      continue
    # buckets only exist for a-z; any other first letter has nothing to scan
    for name_id_kg in msceleb_top1m_az.get(word[0].lower(), []):
      if lfwnl in name_id_kg['name'].lower():
        lfw_name_matches_tmp[lfw_name] = name_id_kg['id_kg']
        break
    if lfw_name in lfw_name_matches_tmp:
      # one match per LFW name is enough; do not let later words overwrite it
      break
# report the dict filled by this cell
print(f'found {len(lfw_name_matches_tmp)} of {len(lfw_meta)} names using exact matches')

In [212]:
# make exact name matches: an LFW name matches the first entry, in the a-z
# bucket of its first letter, whose name contains it (case-insensitive substring)
lfw_name_matches_exact = {}
# lower-cased names per bucket, built once on first use instead of once per
# LFW name; trades extra memory for far fewer .lower() calls
bucket_names_lower = {}
for lfw_item in tqdm(lfw_meta):
  lfw_name = lfw_item['name']  # name is transformed original name
  if not lfw_name:
    # an empty name has no first letter to pick a bucket with
    continue
  c = lfw_name[0].lower()
  lfwnl = lfw_name.lower()
  # buckets only exist for a-z; any other first letter has nothing to scan
  bucket = msceleb_top1m_az.get(c, [])
  if c not in bucket_names_lower:
    bucket_names_lower[c] = [entry['name'].lower() for entry in bucket]
  for name_lower, name_id_kg in zip(bucket_names_lower[c], bucket):
    if lfwnl in name_lower:
      lfw_name_matches_exact[lfw_name] = name_id_kg['id_kg']
      break
print(f'found {len(lfw_name_matches_exact)} of {len(lfw_meta)} names using exact matches')

HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))

KeyboardInterrupt: 

In [217]:
# make strict name-letter matches for the LFW names the exact pass did not resolve
lfw_name_matches_strict = {}
for lfw_item in tqdm(lfw_meta):
  lfw_name = lfw_item['name']  # name is transformed original name
  if lfw_name in lfw_name_matches_exact:
    # already resolved by the exact-match pass
    continue

  matched_id_kg = None
  for id_kg, identity in identities_tmp.items():
    # for each msceleb identity, try every language variation of its name
    for lang, name in identity['names'].items():
      if identity_utils.names_match_strict(lfw_name, name):
        matched_id_kg, matched_lang, matched_name = id_kg, lang, name
        break
    if matched_id_kg:
      print(f'matched {lfw_name} to {matched_name} in {matched_lang}. Add to matched ids')
      lfw_name_matches_strict[lfw_name] = matched_id_kg
      break
  if not matched_id_kg:
    print(f'could not find: {lfw_name}')
# this cell counts strict matches, so label them as such
print(f'found {len(lfw_name_matches_strict)} of {len(lfw_meta)} names using strict matches')

HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))

matched AJ Cook to A. J. Cook in canonical. Add to matched ids
matched AJ Lamas to A.J. Lamas in canonical. Add to matched ids
could not find: Aaron Patterson
matched Aaron Pena to Aaron Peña in canonical. Add to matched ids
could not find: Abdel Aziz Al-Hakim
could not find: Abdel Madi Shabneh
could not find: Abdel Nasser Assidi
could not find: Abdul Majeed Shobokshi
matched Abdulaziz Kamilov to Abdulaziz Komilov in canonical. Add to matched ids
could not find: Abdullah Nasseef
could not find: Abdullah al-Attiyah
could not find: Abdullatif Sener
could not find: Abner Martinez
could not find: Aby Har-Even
could not find: Adam Kennedy
could not find: Adelina Avila
could not find: Adisai Bodharamik
could not find: Adolfo Aguilar Zinser
could not find: Adoor Gopalakarishnan
could not find: Adrian Annus
matched Adrian Fernandez to Adriana Fernández in canonical. Add to matched ids
could not find: Adrian Nastase
could not find: Adriana Perez Navarro


KeyboardInterrupt: 

In [None]:
# make fuzzy name matches


In [141]:
# compare each LFW name to the master identities using the strict matcher
for lfw_item in tqdm(lfw_meta, desc='1st loop'):

  # for each LFW name, look for match
  lfw_name = lfw_item['name']
  matched_id = None

  for id_kg, identity in identities_tmp.items():
    # for each msceleb identity, look for match
    for lang, name in identity['names'].items():
      # for each name's language variation, look for match
      if not len(name) > 0:
        print('no name')
        continue
      if identity_utils.names_match_strict(lfw_name, name):
        matched_id, matched_lang, matched_name = id_kg, lang, name
        break
    if matched_id:
      # no progress-bar cleanup here: the only bar is tqdm's own outer one
      print(f'OK. Found match: {lfw_name} == {matched_name} in lang: {matched_lang}')
      break
  if not matched_id:
    print(f'ERROR: could not find {lfw_name}')
      

HBox(children=(IntProgress(value=0, description='1st loop', max=5749, style=ProgressStyle(description_width='i…

Found: Aaron Eckhart@ca
Found: Aaron Guiel@en
Found: Aaron Peirsol@ca
Found: Aaron Sorkin@ca
Found: Aaron Tippin@de
Found: Abba Eban@cs
Found: Abbas Kiarostami@ca
Found: Abdoulaye Wade@ca
Found: Abdul Rahman Lestaluhu@id
Found: Abdullah Cabir@tr
Found: Abdullah Ahmad Badawi@da
Found: Abdullah Gulam Rasoul@en
Found: Abel Aguilar@cs
Found: Abel Pacheco de la Espriella@es
Found: Abid Hamid Mahmud al-Tikriti@nl
Found: Abraham Foxman@cs
Found: Adam Ant@cs
Found: Adam Freier@en
Found: Adam Herbert@en
Found: Adam Mair@de
Found: Adam Richards@en
Found: Adam Sandler@ca
Found: George Adam Scott@en
Found: Adel Al-Jubeir@fr
Found: Adolfo Rodriguez Saa@id
Found: Adrian McPherson@en
Found: Adrian Murrell@en
Found: Adriana Lima@ca
Found: Adrien Brody@ca
Found: Afton Smith@cs
Found: Agbani Darego@de
Found: Agnelo Queiroz@en
Found: Agnes Bruckner@de
Found: Ahmed Ahmedou@de
Found: Ahmed Chalabi@en
Found: Mahmood Ahmed Ghazi@en
Found: Ahmet Necdet Sezer@ca
Found: Ai Sugiyama@da
Found: Aidan Quinn@ca
Foun

Found: Bill Callahan@en
Found: Bill Cartwright@en
Found: Bill Clancy@en
Found: Bill Clinton@ca
Found: Bill Curry@en
Found: Bill Doba@en
Found: Bill Elliott@pt
Found: Bill Fennelly@en
Found: Bill Frist@de
Found: Bill Gates@ca
Found: Bill Grahame@en
Found: Bill Guerin@de
Found: Bill Herrion@en
Found: Bill Hughes@en
Found: Bill Kollar@en
Found: Bill Kong@es
Found: Bill Mauldin@de
Found: Bill McBride@en
Found: Bill Nelson@da
Found: Bill Parcells@de
Found: Bill Parsons@en
Found: Bill Paxton@ca
Found: Bill Self@de
Found: Bill Sizemore@en
Found: Bill Stapleton@en
Found: Bill Steinke@en
Found: Bill Walton@de
Found: Billy Andrade@da
Found: Billy Beane@de
Found: Billy Bob Thornton@ca
Found: Billy Boyd@en
Found: Billy Crawford@de
Found: Billy Crystal@ca
Found: Billy Donovan@en
Found: Billy Gilman@en
Found: Billy Joel@ca
Found: Bing Crosby@ca
Found: Binyamin Ben-Eliezer@en
Found: Bison Dele@de
Found: Bixente Lizarazu@ca
Found: Blas Ople@de
Found: Blythe Danner@ca
Found: Blythe Hartley@de
Found: Bo

KeyboardInterrupt: 

In [103]:
# spot check: strict matcher on an initials-vs-dotted-initials pair
identity_utils.names_match_strict('AJ Cook', 'A.J. Cook')

True

In [105]:
# spot check: fuzzy matcher with reordered words and different casing
# NOTE(review): names_match is not defined or imported in any cell shown in this
# notebook — presumably it lives in identity_utils; confirm and qualify the call
names_match('A.J. Cook', 'cook Aj', as_float=True, compound_score=True)

1.0

## PubFig

In [None]:
# add pubfig data

## Face Scrub

In [None]:
# add facescrub

## UMD Faces

In [None]:
# add umd

## CASIA Webface

In [None]:
# add CASIA Webface

## IMDB-Wiki

In [None]:
# add imdb-wiki

## IMDB-Face

In [None]:
# add imdb face