# Identity Master List

- start with MS Celeb Top1M
- then progressively add smaller datasets

In [1]:
%reload_ext autoreload
%autoreload 2

import os
from os.path import join
from glob import glob
from pathlib import Path
import requests
import json
from pprint import pprint
from multiprocessing.pool import ThreadPool
import threading
import urllib.request
import difflib
import unidecode

import slugify
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from scipy.io import loadmat
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import sys
sys.path.append('/work/megapixels_dev/megapixels')
from app.utils import api_utils, identity_utils
from app.settings import app_cfg
from app.settings import types

In [58]:
fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master.csv'

## MS Celeb Top 1M

- add column for each spelling of name
- convert kg id to standard google format

In [16]:
fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'
df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\t', header=None, encoding='utf-8', names=['id_kg', 'name'])
df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')
n_groups = df_msceleb_top1m_groups.ngroups
print(f'{n_groups} groups')
df_msceleb_top1m.head(2)

In [110]:
mseleb_top1m_records = df_msceleb_top1m.to_dict('records')

In [106]:
#df_msceleb_top1m.head(100)

In [None]:
abbrev_mappings = {
 'en-US': 'en',
 'en-GB': 'en',
 'es-419': 'es-419',
 'es'
}

In [None]:
msceleb_identities = {}

In [120]:
def split_name_lang(name_lang):
 '''Split name into name and language'''
 if '@' in name_lang:
 indexes = [i for i,x in enumerate(name_lang) if x == '@']
 idx_max = (max(indexes))
 lang = name_lang[(idx_max + 1):]
 name = name_lang[:(idx_max)]
 else:
 name = name_lang
 lang = ''
 return {'name': name, 'lang': lang}

In [122]:
split_name_lang('r@destiny@en')

{'name': 'r@destiny', 'lang': 'en-417'}

In [141]:
msceleb_identities = {}
for mseleb_top1m_record in tqdm(mseleb_top1m_records):
 id_kg = mseleb_top1m_record['id_kg']
 if not id_kg in msceleb_identities.keys():
 msceleb_identities[id_kg] = {}
 name_lang = split_name_lang(mseleb_top1m_record['name'])
 name = name_lang['name']
 lang = name_lang['lang']
 msceleb_identities[id_kg][lang] = name

HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))

In [142]:
import itertools
msceleb_identities_sm = dict(itertools.islice(msceleb_identities.items(), 0, 10))

In [145]:
# de-duplicate names that use same spelling for multiple languages
for id_kg, name_langs in msceleb_identities_sm.items():
 if 'en' in name_langs.keys():
 name_en = name_langs['en']
 for lang, name in name_langs.items():
 print(name, lang)

Patrick Cummins en
Patrick Cummins pt
Mohamed Guessous en
Mohamed Guessous fr
محمد جسوس ar
Tsvetta Kaleynska en
Tsvetta Kaleynska es
Tsvetta Kaleynska fr
Цвета Калейнска bg
Цвета Калейнска ru
Caio Henrique Siqueira Sanchez en
Кајо Санчез sr
Julio Ríos Gallego ca
Julio Ríos Gallego en
Julio Ríos Gallego es
Nilson Ricardo da Silva Júnior en
ニルソン・リカルド・ダ・シルバ・ジュニオール ja
니우송 히카르두 다 시우바 주니오르 ko
Aleksej Aleksandrovič Starobinski sl
Alexei Alexandrowitsch Starobinski de
Alexei Starobinski pt
Alexei Starobinsky en
Alexeï Starobinski fr
Алексей Александрович Старобинский ru
Старобінський Олексій Олександрович uk
アレクセイ・スタロビンスキー ja
Hilda Rix Nicholas en
هیلدا ریکس نیکولاس fa
Behrouz Makvandi en
Бехруз Макванди ru
بهروز مکوندی fa
Borislav Terzić en
Борислав Терзић sr


In [103]:
messages = []

for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):
 id_kg = id_kg.replace('m.', '/m/')
 for df_row in msceleb_group.itertuples():
 if '@' in df_row.name:
 splits = df_row.name.split('@')
 if not len(splits) > 1:
 msg = f'only one split: {df_row.name}'
 if not msg in messages:
 print(msg)
 messages.append(msg)
 elif len(splits) > 1:
 if len(splits[1]) != 2:
 msg = f'n2 split is long: {splits[1]}'
 if not msg in messages:
 print(msg)
 messages.append(msg)
 else:
 print(df_row.name)

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

n2 split is long: zh-Hant
n2 split is long: es-419
n2 split is long: fil
n2 split is long: en-GB
n2 split is long: en-US
n2 split is long: zh-HK
n2 split is long: fr-CA
n2 split is long: pt-PT
n2 split is long: ceb
n2 split is long: zorbla.de
n2 split is long: N
n2 split is long: hu
m.03zytg	Αστέριος"
n2 split is long: destiny
n2 split is long: Teng Boon Soon
n2 split is long: Yong Khoon Seng
n2 split is long: Tiki Anak Lafe
n2 split is long: Marcus Mojigoh
n2 split is long: Nyallau Anak Badak
n2 split is long: Bousou P
n2 split is long: evleaks


In [55]:
# iterate groups and flatten language variations into named columns
identities = []
for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):
 id_kg = id_kg.replace('m.', '/m/')
 for df_row in msceleb_group.itertuples():
 if '@' in df_row.name:
 splits = df_row.name.split('@')
 name = splits[0]
 lang = splits[1] if len(splits) > 0 else 'en'
 else:
 # default to 'en'
 lang = 'en'
 name = df_row.name
 col_name = f'ms_name_{lang}'
 identities.append({'id_kg': id_kg, col_name: name})

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))

In [50]:
print(identities[0:10])

[{'id_kg': 'm/01008l47', 'ms_name_en': 'Patrick Cummins'}, {'id_kg': 'm/01008l47', 'ms_name_pt': 'Patrick Cummins'}]


In [91]:
# temp save DataFrame to CSV
def save_identity_master(identities, fp_out=fp_master_identities):
 df_identities_master = pd.DataFrame.from_dict(identities)
 df_identities_master.index.name = 'id'
 df_identities_master.to_csv(fp_master_identities)

### Add image count data for MS Celeb

In [70]:
# load lines
fp_msceleb_clean = '/data_store_hdd/datasets/people/msceleb/downloads/MS-Celeb-1M_clean_list.txt'
with open(fp_msceleb_clean,'r') as fp:
 msceleb_lines = fp.readlines()
msceleb_files = {}

# iterate lines and append all files
for filepath in msceleb_lines:
 id_kg, fname = filepath.split('/')
 id_kg = id_kg.replace('m.', '/m/')
 if not id_kg in msceleb_files.keys():
 msceleb_files[id_kg] = []
 msceleb_files[id_kg].append(fname)

 # add count
for identity in identities:
 id_kg = identity['id_kg']
 if id_kg in msceleb_files.keys():
 identity['msceleb_count'] = len(msceleb_files[identity['id_kg']])
 else:
 identity['msceleb_count'] = 0

In [92]:
# save (takes 30 seconds)
save_identity_master(identities) # encoding='utf-16' ??

In [95]:
list(df_identities_master.keys())

['id_kg',
 'ms_name_ Marcus Mojigoh',
 'ms_name_ Nyallau Anak Badak',
 'ms_name_ Teng Boon Soon',
 'ms_name_ Tiki Anak Lafe',
 'ms_name_ Yong Khoon Seng',
 'ms_name_Bousou P',
 'ms_name_N',
 'ms_name_af',
 'ms_name_am',
 'ms_name_ar',
 'ms_name_az',
 'ms_name_be',
 'ms_name_bg',
 'ms_name_bm',
 'ms_name_bn',
 'ms_name_bo',
 'ms_name_br',
 'ms_name_bs',
 'ms_name_ca',
 'ms_name_ceb',
 'ms_name_ck',
 'ms_name_co',
 'ms_name_cr',
 'ms_name_cs',
 'ms_name_cy',
 'ms_name_da',
 'ms_name_de',
 'ms_name_destiny',
 'ms_name_dz',
 'ms_name_el',
 'ms_name_en',
 'ms_name_en-GB',
 'ms_name_en-US',
 'ms_name_eo',
 'ms_name_es',
 'ms_name_es-419',
 'ms_name_et',
 'ms_name_eu',
 'ms_name_evleaks',
 'ms_name_fa',
 'ms_name_fi',
 'ms_name_fil',
 'ms_name_fo',
 'ms_name_fr',
 'ms_name_fr-CA',
 'ms_name_fy',
 'ms_name_ga',
 'ms_name_gd',
 'ms_name_gl',
 'ms_name_gn',
 'ms_name_gu',
 'ms_name_ha',
 'ms_name_hi',
 'ms_name_hr',
 'ms_name_ht',
 'ms_name_hu',
 'ms_name_hu\r\nm.03zytg\tΑστέριος"',
 'ms_name_hy