summaryrefslogtreecommitdiff
path: root/megapixels/notebooks
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks')
-rw-r--r--megapixels/notebooks/datasets/identity/identity_master.ipynb1561
1 files changed, 1226 insertions, 335 deletions
diff --git a/megapixels/notebooks/datasets/identity/identity_master.ipynb b/megapixels/notebooks/datasets/identity/identity_master.ipynb
index a48a7ba1..e932a947 100644
--- a/megapixels/notebooks/datasets/identity/identity_master.ipynb
+++ b/megapixels/notebooks/datasets/identity/identity_master.ipynb
@@ -6,13 +6,30 @@
"source": [
"# Identity Master List\n",
"\n",
- "- start with MS Celeb Top1M\n",
- "- then progressively add smaller datasets"
+ "- [x] MS Celeb 1M\n",
+ "- UMD Faces\n",
+ "- FaceScrub\n",
+ "- LFW\n",
+ "- PubFig\n",
+ "- PubFig83\n",
+ "- VGG Face\n",
+ "- VGG Face2\n",
+ "- IJB-C\n",
+ "- CASIA Webface\n",
+ "- IMDB-Face\n",
+ "- IMDB-Wiki"
]
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 156,
"metadata": {},
"outputs": [],
"source": [
@@ -48,15 +65,6 @@
]
},
{
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [],
- "source": [
- "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master.csv'"
- ]
- },
- {
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -68,65 +76,185 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 157,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master_02.csv'\n",
+ "dir_msceleb_dloads = '/data_store_hdd/datasets/people/msceleb/downloads/'\n",
+ "fp_msceleb_clean_txt = join(dir_msceleb_dloads,'MS-Celeb-1M_clean_list.txt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'\n",
- "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name'])\n",
+ "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name_lang'])\n",
"df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')\n",
- "n_groups = df_msceleb_top1m_groups.ngroups\n",
- "print(f'{n_groups} groups')\n",
- "df_msceleb_top1m.head(2)"
+ "n_groups = df_msceleb_top1m_groups.ngroups"
]
},
{
"cell_type": "code",
- "execution_count": 110,
+ "execution_count": 200,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fbc706a8b9f34d958e478cdf584bf853",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
- "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')"
+ "# create alphabetically sorted dict: first letter (a-z) -> list of {'name', 'id_kg'}\n",
+ "a2z = 'abcdefghijklmnopqrstuvwxyz'\n",
+ "msceleb_top1m_az = {c: [] for c in a2z}\n",
+ "for msceleb_row in tqdm(df_msceleb_top1m.itertuples(), total=len(df_msceleb_top1m)):\n",
+ "  name = msceleb_row.name_lang\n",
+ "  # skip missing (non-string) or empty names explicitly instead of swallowing every exception\n",
+ "  if not isinstance(name, str) or not name:\n",
+ "    continue\n",
+ "  # names whose first letter is outside a-z have no bucket and are skipped\n",
+ "  bucket = msceleb_top1m_az.get(name[0].lower())\n",
+ "  if bucket is not None:\n",
+ "    bucket.append({'name': name, 'id_kg': msceleb_row.id_kg})"
]
},
{
"cell_type": "code",
- "execution_count": 106,
+ "execution_count": 159,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>id_kg</th>\n",
+ " <th>name_lang</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>m.01008l47</td>\n",
+ " <td>Patrick Cummins@en</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>m.01008l47</td>\n",
+ " <td>Patrick Cummins@pt</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>m.01008l96</td>\n",
+ " <td>Mohamed Guessous@en</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>m.01008l96</td>\n",
+ " <td>Mohamed Guessous@fr</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>m.01008l96</td>\n",
+ " <td>محمد جسوس@ar</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " id_kg name_lang\n",
+ "0 m.01008l47 Patrick Cummins@en\n",
+ "1 m.01008l47 Patrick Cummins@pt\n",
+ "2 m.01008l96 Mohamed Guessous@en\n",
+ "3 m.01008l96 Mohamed Guessous@fr\n",
+ "4 m.01008l96 محمد جسوس@ar"
+ ]
+ },
+ "execution_count": 159,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "#df_msceleb_top1m.head(100)"
+ "df_msceleb_top1m.head()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 160,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "There are 3,481,186 total name variations\n",
+ "There are 1,000,000 unique identities\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f'There are {len(df_msceleb_top1m):,} total name variations')\n",
+ "print(f'There are {n_groups:,} unique identities')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
- "abbrev_mappings = {\n",
- " 'en-US': 'en',\n",
- " 'en-GB': 'en',\n",
- " 'es-419': 'es-419',\n",
- " 'es'\n",
- "}"
+ "# convert DataFrame to dict\n",
+ "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
+ "# store all identity info here, until creating dataframe\n",
"msceleb_identities = {}"
]
},
{
"cell_type": "code",
- "execution_count": 120,
+ "execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
+ "# utility functions\n",
"def split_name_lang(name_lang):\n",
" '''Split name into name and language'''\n",
" if '@' in name_lang:\n",
@@ -137,141 +265,142 @@
" else:\n",
" name = name_lang\n",
" lang = ''\n",
- " return {'name': name, 'lang': lang}"
+ " return {'name': name, 'lang': lang}\n",
+ "\n",
+ "def save_identity_master(identities, fp_out=fp_master_identities):\n",
+ "  '''Temp save: write identities to a CSV checkpoint at fp_out (index column named \"id\")'''\n",
+ "  df_identities_master = pd.DataFrame.from_dict(identities)\n",
+ "  df_identities_master.index.name = 'id'\n",
+ "  # honor the fp_out argument (it was ignored and the default path was always written)\n",
+ "  df_identities_master.to_csv(fp_out)"
]
},
{
"cell_type": "code",
- "execution_count": 122,
+ "execution_count": 164,
"metadata": {},
"outputs": [
{
"data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "884edc099a404dfcb53e353d2abf6819",
+ "version_major": 2,
+ "version_minor": 0
+ },
"text/plain": [
- "{'name': 'r@destiny', 'lang': 'en-417'}"
+ "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))"
]
},
- "execution_count": 122,
"metadata": {},
- "output_type": "execute_result"
+ "output_type": "display_data"
}
],
"source": [
- "split_name_lang('r@destiny@en')"
+ "# convert \"name@lang\" records to dict format\n",
+ "msceleb_identities = {}\n",
+ "for mseleb_top1m_record in tqdm(mseleb_top1m_records):\n",
+ "  id_kg = mseleb_top1m_record['id_kg'].replace('m.','/m/')\n",
+ "  # create the identity entry the first time this id is seen\n",
+ "  names = msceleb_identities.setdefault(id_kg, {'names': {}})['names']\n",
+ "  name_lang = split_name_lang(mseleb_top1m_record['name_lang'])\n",
+ "  name, lang = name_lang['name'], name_lang['lang']\n",
+ "  if lang == 'en':\n",
+ "    # the english spelling doubles as the canonical name\n",
+ "    names['canonical'] = name\n",
+ "  names[lang] = name"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Patch @en names"
]
},
{
"cell_type": "code",
- "execution_count": 141,
+ "execution_count": 165,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "0120e006a7564f5c82729a7050ef0386",
+ "model_id": "1cd2915f485b4cd299a929e1fb2d5926",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
- "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))"
+ "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "no english name for /m/017vbn\n",
+ "no english name for /m/026q0k_\n",
+ "no english name for /m/02k2kw\n",
+ "no english name for /m/0bwhrg1\n"
+ ]
}
],
"source": [
- "msceleb_identities = {}\n",
- "for mseleb_top1m_record in tqdm(mseleb_top1m_records):\n",
- " id_kg = mseleb_top1m_record['id_kg']\n",
- " if not id_kg in msceleb_identities.keys():\n",
- " msceleb_identities[id_kg] = {}\n",
- " name_lang = split_name_lang(mseleb_top1m_record['name'])\n",
- " name = name_lang['name']\n",
- " lang = name_lang['lang']\n",
- " msceleb_identities[id_kg][lang] = name"
+ "# report every identity that has no (or an empty) english name\n",
+ "for id_kg, attrs in tqdm(msceleb_identities.items()):\n",
+ "  lang_attrs = attrs['names']\n",
+ "  name_en = lang_attrs.get('en')  # None when there is no 'en' entry\n",
+ "  if not name_en:\n",
+ "    print(f'no english name for {id_kg}')"
]
},
{
"cell_type": "code",
- "execution_count": 142,
- "metadata": {},
- "outputs": [],
- "source": [
- "import itertools\n",
- "msceleb_identities_sm = dict(itertools.islice(msceleb_identities.items(), 0, 10))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 145,
+ "execution_count": 166,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Patrick Cummins en\n",
- "Patrick Cummins pt\n",
- "Mohamed Guessous en\n",
- "Mohamed Guessous fr\n",
- "محمد جسوس ar\n",
- "Tsvetta Kaleynska en\n",
- "Tsvetta Kaleynska es\n",
- "Tsvetta Kaleynska fr\n",
- "Цвета Калейнска bg\n",
- "Цвета Калейнска ru\n",
- "Caio Henrique Siqueira Sanchez en\n",
- "Кајо Санчез sr\n",
- "Julio Ríos Gallego ca\n",
- "Julio Ríos Gallego en\n",
- "Julio Ríos Gallego es\n",
- "Nilson Ricardo da Silva Júnior en\n",
- "ニルソン・リカルド・ダ・シルバ・ジュニオール ja\n",
- "니우송 히카르두 다 시우바 주니오르 ko\n",
- "Aleksej Aleksandrovič Starobinski sl\n",
- "Alexei Alexandrowitsch Starobinski de\n",
- "Alexei Starobinski pt\n",
- "Alexei Starobinsky en\n",
- "Alexeï Starobinski fr\n",
- "Алексей Александрович Старобинский ru\n",
- "Старобінський Олексій Олександрович uk\n",
- "アレクセイ・スタロビンスキー ja\n",
- "Hilda Rix Nicholas en\n",
- "هیلدا ریکس نیکولاس fa\n",
- "Behrouz Makvandi en\n",
- "Бехруз Макванди ru\n",
- "بهروز مکوندی fa\n",
- "Borislav Terzić en\n",
- "Борислав Терзић sr\n"
+ "patched /m/017vbn de to en\n",
+ "patched /m/026q0k_ nl to en\n",
+ "patched /m/02k2kw de to en\n",
+ "patched /m/0bwhrg1 it to en\n"
]
}
],
"source": [
- "# de-duplicate names that use same spelling for multiple languages\n",
- "for id_kg, name_langs in msceleb_identities_sm.items():\n",
- " if 'en' in name_langs.keys():\n",
- " name_en = name_langs['en']\n",
- " for lang, name in name_langs.items():\n",
- " print(name, lang)"
+ "# patch en name exception: 4 names missing english\n",
+ "en_exceptions = {\n",
+ "  '/m/017vbn': 'de',\n",
+ "  '/m/026q0k_': 'nl',\n",
+ "  '/m/02k2kw': 'de',\n",
+ "  '/m/0bwhrg1': 'it'\n",
+ "}\n",
+ "for id_kg, lang in en_exceptions.items():\n",
+ "  # reuse the listed language's spelling as both the 'en' and the canonical name\n",
+ "  names = msceleb_identities[id_kg]['names']\n",
+ "  names['en'] = names[lang]\n",
+ "  names['canonical'] = names['en']\n",
+ "  print(f'patched {id_kg} {lang} to en')"
]
},
{
- "cell_type": "code",
- "execution_count": null,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
- "source": []
+ "source": [
+ "### Remove duplicate names"
+ ]
},
{
"cell_type": "code",
- "execution_count": 103,
+ "execution_count": 167,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "374a55f504084f14bd4d77fed0e2f4e4",
+ "model_id": "33ffa229c16d4a9088087c21210d421e",
"version_major": 2,
"version_minor": 0
},
@@ -286,62 +415,85 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "n2 split is long: zh-Hant\n",
- "n2 split is long: es-419\n",
- "n2 split is long: fil\n",
- "n2 split is long: en-GB\n",
- "n2 split is long: en-US\n",
- "n2 split is long: zh-HK\n",
- "n2 split is long: fr-CA\n",
- "n2 split is long: pt-PT\n",
- "n2 split is long: ceb\n",
- "n2 split is long: zorbla.de\n",
- "n2 split is long: N\n",
- "n2 split is long: hu\n",
- "m.03zytg\tΑστέριος\"\n",
- "n2 split is long: destiny\n",
- "n2 split is long: Teng Boon Soon\n",
- "n2 split is long: Yong Khoon Seng\n",
- "n2 split is long: Tiki Anak Lafe\n",
- "n2 split is long: Marcus Mojigoh\n",
- "n2 split is long: Nyallau Anak Badak\n",
- "n2 split is long: Bousou P\n",
- "n2 split is long: evleaks\n"
+ "removed 1,485,336 duplicate names\n"
]
}
],
"source": [
- "messages = []\n",
+ "# de-duplicate names that use same spelling for multiple languages\n",
+ "items_removed = []\n",
+ "msceleb_identities_copy = msceleb_identities.copy()\n",
+ "\n",
+ "for id_kg, attrs in tqdm(msceleb_identities_copy.items()):\n",
+ "  lang_attrs = attrs['names']\n",
+ "  name_main = lang_attrs.get('canonical', None)\n",
+ "  # test this identity's own canonical name (was name_en, a stale value from an earlier cell)\n",
+ "  if not name_main:\n",
+ "    print('error. all names need \"en\"')\n",
+ "    break\n",
+ "  # iterate over a copy because entries are popped from the live dict below\n",
+ "  lang_attrs_copy = attrs['names'].copy()\n",
+ "  for lang, name in lang_attrs_copy.items():\n",
+ "    if name == name_main and lang != 'en' and lang != 'canonical':\n",
+ "      # same spelling as the canonical name: drop the redundant language entry\n",
+ "      items_removed.append(msceleb_identities[id_kg]['names'].pop(lang))\n",
+ "  del lang_attrs_copy\n",
"\n",
- "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n",
+ "del msceleb_identities_copy\n",
+ "print(f'removed {len(items_removed):,} duplicate names')\n",
+ "del items_removed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Count images per person for ms celeb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6e3a3f659fa6414b80d678d5b991ed0a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=5049824), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# calculate total images per id\n",
+ "msceleb_files = {}\n",
+ "# load text file\n",
+ "with open(fp_msceleb_clean_txt,'r') as fp:\n",
+ " msceleb_lines = fp.readlines()\n",
+ " \n",
+ "# iterate lines and append all files\n",
+ "for filepath in tqdm(msceleb_lines):\n",
+ " id_kg, fname = filepath.split('/')\n",
" id_kg = id_kg.replace('m.', '/m/')\n",
- " for df_row in msceleb_group.itertuples():\n",
- " if '@' in df_row.name:\n",
- " splits = df_row.name.split('@')\n",
- " if not len(splits) > 1:\n",
- " msg = f'only one split: {df_row.name}'\n",
- " if not msg in messages:\n",
- " print(msg)\n",
- " messages.append(msg)\n",
- " elif len(splits) > 1:\n",
- " if len(splits[1]) != 2:\n",
- " msg = f'n2 split is long: {splits[1]}'\n",
- " if not msg in messages:\n",
- " print(msg)\n",
- " messages.append(msg)\n",
- " else:\n",
- " print(df_row.name)"
+ " if not id_kg in msceleb_files.keys():\n",
+ " msceleb_files[id_kg] = []\n",
+ " msceleb_files[id_kg].append(fname)"
]
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "475871ac6d08484cbec44d5ccf099bd8",
+ "model_id": "bd0530f0e4634a8dbae0308964cd6e2b",
"version_major": 2,
"version_minor": 0
},
@@ -354,251 +506,990 @@
}
],
"source": [
- "# iterate groups and flatten language variations into named columns\n",
- "identities = []\n",
- "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n",
- " id_kg = id_kg.replace('m.', '/m/')\n",
- " for df_row in msceleb_group.itertuples():\n",
- " if '@' in df_row.name:\n",
- " splits = df_row.name.split('@')\n",
- " name = splits[0]\n",
- " lang = splits[1] if len(splits) > 0 else 'en'\n",
- " else:\n",
- " # default to 'en'\n",
- " lang = 'en'\n",
- " name = df_row.name\n",
- " col_name = f'ms_name_{lang}'\n",
- " identities.append({'id_kg': id_kg, col_name: name})"
+ "# add the MS Celeb image count to each identity\n",
+ "for id_kg, attrs in tqdm(msceleb_identities.items()):\n",
+ "  # ids that are absent from msceleb_files get a count of 0\n",
+ "  files = msceleb_files.get(id_kg)\n",
+ "  count = len(files) if files is not None else 0\n",
+ "  # attrs is the same object as msceleb_identities[id_kg]\n",
+ "  attrs['count_msceleb'] = count"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# parallel lists: identity ids and their image counts, in dict order\n",
+ "im_counts_id_kg = list(msceleb_identities.keys())\n",
+ "im_counts_idxs = [msceleb_identities[id_kg]['count_msceleb'] for id_kg in im_counts_id_kg]"
]
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 173,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[{'id_kg': 'm/01008l47', 'ms_name_en': 'Patrick Cummins'}, {'id_kg': 'm/01008l47', 'ms_name_pt': 'Patrick Cummins'}]\n"
+ "Most images 130 for Leelee Sobieski\n",
+ "88,244 more than 10\n",
+ "78,027 more than 20\n",
+ "49,042 more than 50\n",
+ "5,025 more than 100\n"
]
}
],
"source": [
- "print(identities[0:10])"
+ "# print stats\n",
+ "idx_max = np.argmax(im_counts_idxs)\n",
+ "id_kg_max = im_counts_id_kg[idx_max]\n",
+ "count_max = im_counts_idxs[idx_max]\n",
+ "name_max = msceleb_identities[id_kg_max]['names']['canonical']\n",
+ "print(f'Most images {count_max:,} for {name_max}')\n",
+ "# distribution\n",
+ "im_counts_idxs = np.array(im_counts_idxs)\n",
+ "for threshold in (10, 20, 50, 100):\n",
+ "  # number of identities with strictly more images than the threshold\n",
+ "  n_over = len(im_counts_idxs[im_counts_idxs > threshold])\n",
+ "  print(f'{n_over:,} more than {threshold}')"
]
},
{
"cell_type": "code",
- "execution_count": 91,
+ "execution_count": 174,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "420bc435f447454faa2dba73d7dff982",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# awkward conversion of msceleb_identities to a list of dicts\n",
+ "identities_flat = []\n",
+ "for id_kg, attrs in tqdm(msceleb_identities.items()):\n",
+ "  obj = {'id_kg': id_kg}\n",
+ "  for lang, name in attrs['names'].items():\n",
+ "    # the canonical name gets the bare column; every other language gets a suffixed column\n",
+ "    col_name = 'name_msceleb' if lang == 'canonical' else f'name_msceleb_{lang}'\n",
+ "    obj[col_name] = name\n",
+ "  obj['count_msceleb'] = attrs['count_msceleb']\n",
+ "  identities_flat.append(obj)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 175,
"metadata": {},
"outputs": [],
"source": [
- "# temp save DataFrame to CSV\n",
- "def save_identity_master(identities, fp_out=fp_master_identities):\n",
- " df_identities_master = pd.DataFrame.from_dict(identities)\n",
- " df_identities_master.index.name = 'id'\n",
- " df_identities_master.to_csv(fp_master_identities)"
+ "# convert to dataframe\n",
+ "df_identities = pd.DataFrame.from_dict(identities_flat)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 176,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# save checkpoint CSV\n",
+ "save_identity_master(identities_flat) # encoding='utf-16' ??"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 177,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# copy to master and delete ref to msceleb\n",
+ "identities = msceleb_identities.copy()\n",
+ "del msceleb_identities"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Add image count data for MS Celeb"
+ "## LFW"
]
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
- "# load lines\n",
- "fp_msceleb_clean = '/data_store_hdd/datasets/people/msceleb/downloads/MS-Celeb-1M_clean_list.txt'\n",
- "with open(fp_msceleb_clean,'r') as fp:\n",
- " msceleb_lines = fp.readlines()\n",
- "msceleb_files = {}\n",
+ "# add LFW data\n",
+ "fp_lfw = '/data_store_hdd/datasets/people/lfw/downloads/lfw-names.txt'\n",
+ "with open(fp_lfw,'r') as fp:\n",
+ " lfw_lines = fp.readlines()\n",
+ "lfw_lines = [x.strip() for x in lfw_lines]\n",
"\n",
- "# iterate lines and append all files\n",
- "for filepath in msceleb_lines:\n",
- " id_kg, fname = filepath.split('/')\n",
- " id_kg = id_kg.replace('m.', '/m/')\n",
- " if not id_kg in msceleb_files.keys():\n",
- " msceleb_files[id_kg] = []\n",
- " msceleb_files[id_kg].append(fname)\n",
- "\n",
- " # add count\n",
- "for identity in identities:\n",
- " id_kg = identity['id_kg']\n",
- " if id_kg in msceleb_files.keys():\n",
- " identity['msceleb_count'] = len(msceleb_files[identity['id_kg']])\n",
- " else:\n",
- " identity['msceleb_count'] = 0"
+ "lfw_meta = []\n",
+ "for lfw_line in lfw_lines:\n",
+ " name_orig, count = lfw_line.split('\\t')\n",
+ " name_clean = name_orig.replace('_',' ')\n",
+ " obj = {'name_orig': name_orig, 'name': name_clean, 'count':count}\n",
+ " lfw_meta.append(obj)"
]
},
{
"cell_type": "code",
- "execution_count": 92,
+ "execution_count": 179,
"metadata": {},
"outputs": [],
"source": [
- "# save (takes 30 seconds)\n",
- "save_identity_master(identities) # encoding='utf-16' ??"
+ "identities_tmp = identities.copy()"
]
},
{
"cell_type": "code",
- "execution_count": 95,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# make exact name matches\n",
+ "lfw_name_matches_tmp = {}\n",
+ "for lfw_item in tqdm(lfw_meta):\n",
+ "  lfw_name = lfw_item['name'] # name is transformed original name\n",
+ "  lfwnl = lfw_name.lower()\n",
+ "  # split() without arguments drops empty words, so word[0] cannot fail\n",
+ "  for word in lfw_name.split():\n",
+ "    # for each word in the name, search the bucket of that word's first letter\n",
+ "    c = word[0].lower()\n",
+ "    # letters outside a-z have no bucket: nothing to search\n",
+ "    for name_id_kg in msceleb_top1m_az.get(c, []):\n",
+ "      name = name_id_kg['name']\n",
+ "      id_kg = name_id_kg['id_kg']\n",
+ "      if lfwnl in name.lower():\n",
+ "        lfw_name_matches_tmp[lfw_name] = id_kg\n",
+ "        break\n",
+ "# report the dict filled by this cell (the print previously read lfw_name_matches_exact)\n",
+ "print(f'found {len(lfw_name_matches_tmp)} of {len(lfw_meta)} names using exact matches')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 212,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8c0f2dbf032145fea3ad5759a97abc44",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-212-13b8b31f417d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mlfwnl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mname_id_kg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmsceleb_top1m_az\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname_id_kg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mid_kg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname_id_kg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id_kg'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "# make exact name matches\n",
+ "lfw_name_matches_exact = {}\n",
+ "for lfw_item in tqdm(lfw_meta):\n",
+ "  lfw_name = lfw_item['name'] # name is transformed original name\n",
+ "  lfwnl = lfw_name.lower()\n",
+ "  # quickly check if it's in the alphabetized list; only a-z buckets exist, so an\n",
+ "  # empty name or a first letter outside a-z yields no candidates instead of raising\n",
+ "  candidates = msceleb_top1m_az.get(lfw_name[:1].lower(), [])\n",
+ "  for name_id_kg in candidates:\n",
+ "    name = name_id_kg['name']\n",
+ "    id_kg = name_id_kg['id_kg']\n",
+ "    if lfwnl in name.lower():\n",
+ "      lfw_name_matches_exact[lfw_name] = id_kg\n",
+ "      break\n",
+ "print(f'found {len(lfw_name_matches_exact)} of {len(lfw_meta)} names using exact matches')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 217,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1949065d12b349ce8bbf28ebd09f1e29",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "matched AJ Cook to A. J. Cook in canonical. Add to matched ids\n",
+ "matched AJ Lamas to A.J. Lamas in canonical. Add to matched ids\n",
+ "could not find: Aaron Patterson\n",
+ "matched Aaron Pena to Aaron Peña in canonical. Add to matched ids\n",
+ "could not find: Abdel Aziz Al-Hakim\n",
+ "could not find: Abdel Madi Shabneh\n",
+ "could not find: Abdel Nasser Assidi\n",
+ "could not find: Abdul Majeed Shobokshi\n",
+ "matched Abdulaziz Kamilov to Abdulaziz Komilov in canonical. Add to matched ids\n",
+ "could not find: Abdullah Nasseef\n",
+ "could not find: Abdullah al-Attiyah\n",
+ "could not find: Abdullatif Sener\n",
+ "could not find: Abner Martinez\n",
+ "could not find: Aby Har-Even\n",
+ "could not find: Adam Kennedy\n",
+ "could not find: Adelina Avila\n",
+ "could not find: Adisai Bodharamik\n",
+ "could not find: Adolfo Aguilar Zinser\n",
+ "could not find: Adoor Gopalakarishnan\n",
+ "could not find: Adrian Annus\n",
+ "matched Adrian Fernandez to Adriana Fernández in canonical. Add to matched ids\n",
+ "could not find: Adrian Nastase\n",
+ "could not find: Adriana Perez Navarro\n"
+ ]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-217-f9d734a428b9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlang\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0midentity\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# for each name's language variation, look for match\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mstrict_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0midentity_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnames_match_strict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlfw_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstrict_match\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mmatched_id_kg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mid_kg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36mnames_match_strict\u001b[0;34m(a, b)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mnames_match_strict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mclean_a\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mclean_b\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_a\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_b\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_a\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclean_b\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_b\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclean_a\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36mletter_strip\u001b[0;34m(a, b)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maZ9\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;31m# strip every letter from a that is not in b\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maZ9\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;31m# strip every letter from a that is not in b\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "# Strict name-letter matching for LFW names not already present in\n",
+ "# lfw_name_matches_exact. Result: {lfw_name: id_kg} in lfw_name_matches_strict.\n",
+ "lfw_name_matches_strict = {}\n",
+ "for lfw_item in tqdm(lfw_meta):\n",
+ "    lfw_name = lfw_item['name']  # name is transformed original name\n",
+ "    if lfw_name in lfw_name_matches_exact:\n",
+ "        continue  # already has an entry in lfw_name_matches_exact\n",
+ "\n",
+ "    # First (id_kg, lang, name) whose name variant strictly matches, else None.\n",
+ "    # The generator stops at the first hit, so no flag/double-break is needed.\n",
+ "    match = next(\n",
+ "        ((id_kg, lang, name)\n",
+ "         for id_kg, identity in identities_tmp.items()\n",
+ "         for lang, name in identity['names'].items()\n",
+ "         if identity_utils.names_match_strict(lfw_name, name)),\n",
+ "        None)\n",
+ "    if match is None:\n",
+ "        print(f'could not find: {lfw_name}')\n",
+ "        continue\n",
+ "\n",
+ "    matched_id_kg, matched_lang, matched_name = match\n",
+ "    print(f'matched {lfw_name} to {matched_name} in {matched_lang}. Add to matched ids')\n",
+ "    lfw_name_matches_strict[lfw_name] = matched_id_kg\n",
+ "\n",
+ "# this pass is the strict one, so report it as such\n",
+ "print(f'found {len(lfw_name_matches_strict)} of {len(lfw_meta)} names using strict matches')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# make fuzzy name matches\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d822a21cc63e4c5c9fe9bb637f5455dd",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, description='1st loop', max=5749, style=ProgressStyle(description_width='i…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found: Aaron Eckhart@ca\n",
+ "Found: Aaron Guiel@en\n",
+ "Found: Aaron Peirsol@ca\n",
+ "Found: Aaron Sorkin@ca\n",
+ "Found: Aaron Tippin@de\n",
+ "Found: Abba Eban@cs\n",
+ "Found: Abbas Kiarostami@ca\n",
+ "Found: Abdoulaye Wade@ca\n",
+ "Found: Abdul Rahman Lestaluhu@id\n",
+ "Found: Abdullah Cabir@tr\n",
+ "Found: Abdullah Ahmad Badawi@da\n",
+ "Found: Abdullah Gulam Rasoul@en\n",
+ "Found: Abel Aguilar@cs\n",
+ "Found: Abel Pacheco de la Espriella@es\n",
+ "Found: Abid Hamid Mahmud al-Tikriti@nl\n",
+ "Found: Abraham Foxman@cs\n",
+ "Found: Adam Ant@cs\n",
+ "Found: Adam Freier@en\n",
+ "Found: Adam Herbert@en\n",
+ "Found: Adam Mair@de\n",
+ "Found: Adam Richards@en\n",
+ "Found: Adam Sandler@ca\n",
+ "Found: George Adam Scott@en\n",
+ "Found: Adel Al-Jubeir@fr\n",
+ "Found: Adolfo Rodriguez Saa@id\n",
+ "Found: Adrian McPherson@en\n",
+ "Found: Adrian Murrell@en\n",
+ "Found: Adriana Lima@ca\n",
+ "Found: Adrien Brody@ca\n",
+ "Found: Afton Smith@cs\n",
+ "Found: Agbani Darego@de\n",
+ "Found: Agnelo Queiroz@en\n",
+ "Found: Agnes Bruckner@de\n",
+ "Found: Ahmed Ahmedou@de\n",
+ "Found: Ahmed Chalabi@en\n",
+ "Found: Mahmood Ahmed Ghazi@en\n",
+ "Found: Ahmet Necdet Sezer@ca\n",
+ "Found: Ai Sugiyama@da\n",
+ "Found: Aidan Quinn@ca\n",
+ "Found: Aileen Riggin Soule@fr\n",
+ "Found: Aishwarya Rai Bachchan@en\n",
+ "Found: Ajit Agarkar@en\n",
+ "Found: Akbar Al Baker@en\n",
+ "Found: Akbar Hashemi Rafsanjani@da\n",
+ "Found: Akhmed Zakayev@en\n",
+ "Found: Akiko Morigami@da\n",
+ "Found: Al Cardenas@en\n",
+ "Found: Vidal Davis@en\n",
+ "Found: Al Gore III@en\n",
+ "Found: Al Leiter@de\n",
+ "Found: Al Pacino@ca\n",
+ "Found: Al Sharpton@de\n",
+ "Found: Alain Cervantes@en\n",
+ "Found: Alain Ducasse@de\n",
+ "Found: Alan Ball jr.@nl\n",
+ "Found: Alan Dershowitz@da\n",
+ "Found: Alan Greenspan@de\n",
+ "Found: Alan Mulally@de\n",
+ "Found: Alan Trammell@de\n",
+ "Found: Alan Zemaitis@en\n",
+ "Found: Alanis Morissette@ca\n",
+ "Found: Alanna Ubach@de\n",
+ "Found: Alastair Campbell@de\n",
+ "Found: Alastair Johnston@en\n",
+ "Found: Albert Costa Balboa@es\n",
+ "Found: Albert Pujols@da\n",
+ "Found: Alberto Acosta@ca\n",
+ "Found: Alberto Fujimori@ca\n",
+ "Found: Alberto Sordi@ca\n",
+ "Found: Aldo Paredes@en\n",
+ "Found: Alec Baldwin@ca\n",
+ "Found: Alejandro Atchugarry@de\n",
+ "Found: Alejandro Fernandez Almendras@sl\n",
+ "Found: Alejandro Lembo@de\n",
+ "Found: Alejandro Lerner@en\n",
+ "Found: Alejandro Toledo@en\n",
+ "Found: Alek Wek@de\n",
+ "Found: Alessandro Nesta@ca\n",
+ "Found: Alex Barros@de\n",
+ "Found: Alex Cabrera@en\n",
+ "Found: Alex Ferguson@en\n",
+ "Found: Alex Holmes@en\n",
+ "Found: Alex Kingston@cs\n",
+ "Found: Alex Penelas@en\n",
+ "Found: Alex Popovici@es\n",
+ "Found: Alex Sink@en\n",
+ "Found: Alex Wallau@en\n",
+ "Found: Alex Zanardi@ca\n",
+ "Found: Alexa Vega@da\n",
+ "Found: Alexander Downer@de\n",
+ "Found: Alexander Losyukov@en\n",
+ "Found: Alexander Lukashenko@en\n",
+ "Found: Alexander Payne@cs\n",
+ "Found: Alexandra Pelosi@en\n",
+ "Found: Alexandra Stevenson@de\n",
+ "Found: Alexandre Daigle@cs\n",
+ "Found: Alexandre Despatie@ca\n",
+ "Found: Alexandre Herchcovitch@en\n",
+ "Found: Alexandre Vinokourov@fr\n",
+ "Found: Alexis Bledel@ca\n",
+ "Found: Alfonso Portillo@en\n",
+ "Found: Alfonso Soriano@en\n",
+ "Found: James Alfred Ford@en\n",
+ "Found: Alfred Santell@en\n",
+ "Found: Alfredo Moreno@en\n",
+ "Found: Ali Abbas Al-Hilfi@en\n",
+ "Found: Ali Abdullah Saleh@da\n",
+ "Found: Ali Ahmeti@de\n",
+ "Found: Prince Ali bin Hussein@en\n",
+ "Found: Ali Fallahian@de\n",
+ "Found: Ali Hammoud@en\n",
+ "Found: Ali Khamenei@ca\n",
+ "Found: Alicia Hollowell@en\n",
+ "Found: Alicia Keys@ca\n",
+ "Found: Alicia Molik@de\n",
+ "Found: Alicia Silverstone@ca\n",
+ "Found: Alicia Witt@ca\n",
+ "Found: Alimzhan Tokhtakhounov@pt\n",
+ "Found: Alina Kabaeva@en\n",
+ "Found: Alison Krauss@ca\n",
+ "Found: Alison Lohman@de\n",
+ "Found: Alistair Macdonald@en\n",
+ "Found: Allan Houston@ca\n",
+ "Found: Allan Kemakeza@de\n",
+ "Found: Allan Wagner Tizón@de\n",
+ "Found: Allen Iverson@ca\n",
+ "Found: Allison Janney@da\n",
+ "Found: Ally Sheedy@ca\n",
+ "Found: Allyson Felix@ca\n",
+ "Found: Alma Powell@de\n",
+ "Found: Alonzo Mourning@ca\n",
+ "Found: Aly Wagner@de\n",
+ "Found: Alyson Hannigan@ca\n",
+ "Found: Amanda Beard@de\n",
+ "Found: Amanda Bynes@ca\n",
+ "Found: Amanda Coetzer@de\n",
+ "Found: Amanda Marshall@de\n",
+ "Found: Amber Frey@en\n",
+ "Found: Amber Tamblyn@de\n",
+ "Found: Ambrose Lee@en\n",
+ "Found: Amelia Vega@en\n",
+ "Found: Amelie Mauresmo@ms\n",
+ "Found: Amr Moussa@ca\n",
+ "Found: Amram Mitzna@de\n",
+ "Found: Amy Brenneman@da\n",
+ "Found: Amy Cotton@en\n",
+ "Found: Amy Pascal@de\n",
+ "Found: Amy Redford@de\n",
+ "Found: Amy Smart@da\n",
+ "Found: Amy Yasbeck@de\n",
+ "Found: Ana Guevara@de\n",
+ "Found: Ananías Maidana Palacios@es\n",
+ "Found: Anastasia Kelesidou@de\n",
+ "Found: Anastasia Myskina@en\n",
+ "Found: Anatoliy Kinakh@en\n",
+ "Found: Anders Fogh Rasmussen@ca\n",
+ "Found: Andre Agassi@ca\n",
+ "Found: Andre Lange@et\n",
+ "Found: J. Andre Smith@en\n",
+ "Found: Andrea Bocelli@ca\n",
+ "Found: Andrea De Cruz@en\n",
+ "Found: Andrea Yates@en\n",
+ "Found: Andreas Vinciguerra@de\n",
+ "Found: Andrei Konchalovsky@en\n",
+ "Found: Andrei Mikhnevich@en\n",
+ "Found: Andrei Nikolishin@en\n",
+ "Found: Andrew Bernard@en\n",
+ "Found: Andrew Caldecott@en\n",
+ "Found: Andrew Cuomo@ca\n",
+ "Found: Andrew Fastow@de\n",
+ "Found: Andrew Firestone@en\n",
+ "Found: Andrew Gilligan@en\n",
+ "Found: Andrew Jarecki@de\n",
+ "Found: Andrew Luster@de\n",
+ "Found: Andrew Niccol@cs\n",
+ "Found: Andy Benes@en\n",
+ "Found: Andy Dickens@en\n",
+ "Found: DJ Andy Garcia@en\n",
+ "Found: Andy Griffith@ca\n",
+ "Found: Andy Griggs@en\n",
+ "Found: Andy Lau@cs\n",
+ "Found: Andy Northey@en\n",
+ "Found: Sandy Perez Aguila@en\n",
+ "Found: Andy Roddick@ca\n",
+ "Found: Andy Rooney@da\n",
+ "Found: Andy Warhol@ca\n",
+ "Found: Angela Bassett@ca\n",
+ "Found: Angela Lansbury@ca\n",
+ "Found: Angela Merkel@ca\n",
+ "Found: Angelina Jolie@ca\n",
+ "Found: Angie Martinez@en\n",
+ "Found: Anita DeFrantz@de\n",
+ "Found: Ann Landers@da\n",
+ "Found: Ann Morgan Guilbert@en\n",
+ "Found: Ann Veneman@de\n",
+ "Found: Anna Chicherova@en\n",
+ "Found: Anna Faris@ca\n",
+ "Found: Susanna Jones@en\n",
+ "Found: Anna Kournikova@da\n",
+ "Found: Anna Nicole Smith@ca\n",
+ "Found: Anne Donovan@de\n",
+ "Found: Anne Heche@ca\n",
+ "Found: Anne Krueger@fr\n",
+ "Found: Anne McLellan@en\n",
+ "Found: Annette Bening@ca\n",
+ "Found: Annette Lu@de\n",
+ "Found: Annie Machon@de\n",
+ "Found: Antanas Valionis@de\n",
+ "Found: Anthony Fauci@de\n",
+ "Found: Anthony Garotinho@en\n",
+ "Found: Anthony Hopkins@ca\n",
+ "Found: Anthony LaPaglia@da\n",
+ "Found: Anthony Principi@de\n",
+ "Found: Antje Buschschulte@de\n",
+ "Found: Anton Balasingham@en\n",
+ "Found: Antonio Banderas@ca\n",
+ "Found: Antonio Cassano@ca\n",
+ "Found: Antonio Catania@de\n",
+ "Found: Antonio Palocci@de\n",
+ "Found: Antonio Trillanes IV@fil\n",
+ "Found: Antony Leung@en\n",
+ "Found: Antwun Echols@en\n",
+ "Found: Anwar Ibrahim@da\n",
+ "Found: Aretha Franklin@ca\n",
+ "Found: Ari Bousbib@en\n",
+ "Found: Ari Fleischer@de\n",
+ "Found: Arianna Huffington@ca\n",
+ "Found: Arie Haan@de\n",
+ "Found: Ariel Sharon@ca\n",
+ "Found: Arif Mardin@de\n",
+ "Found: Arlen Specter@ca\n",
+ "Found: Armando Carrillo@en\n",
+ "Found: Arminio Fraga@en\n",
+ "Found: Arnold Palmer@da\n",
+ "Found: Arnold Schwarzenegger@ca\n",
+ "Found: Rolfe Arnold Scott-James@en\n",
+ "Found: Aron Ralston@cs\n",
+ "Found: Stuart Cooper@en\n",
+ "Found: Stuart Howe@en\n",
+ "Found: Arthur Johnson@it\n",
+ "Found: John Arthur Martinez@en\n",
+ "Found: Arturo Gatti@ca\n",
+ "Found: Asa Hutchinson@de\n",
+ "Found: Ashanti Douglas@nl\n",
+ "Found: Ashley Judd@ca\n",
+ "Found: Ashley Olsen@ca\n",
+ "Found: Ashley Postell@en\n",
+ "Found: Ashraf Ghani Ahmadzai@es\n",
+ "Found: Ashton Kutcher@ca\n",
+ "Found: Asif Ali Zardari@ca\n",
+ "Found: Askar Akayev@en\n",
+ "Found: Astou Ndiaye-Diatta@en\n",
+ "Found: Premiership of Atal Bihari Vajpayee@en\n",
+ "Found: Atom Egoyan@da\n",
+ "Found: Atsushi Satou@id\n",
+ "Found: Audrey Lacroix@en\n",
+ "Found: Audrey Sauret@en\n",
+ "Found: Augusto Pinochet Ugarte@ca\n",
+ "Found: Augusto Roa Bastos@de\n",
+ "Found: Aung San Suu Kyi@ca\n",
+ "Found: Austin Kearns@en\n",
+ "Found: Avril Lavigne@ca\n",
+ "Found: Azmi Bishara@ca\n",
+ "Found: Azra Akin@id\n",
+ "Found: Babe Ruth@ca\n",
+ "Found: Barbara Bach@cs\n",
+ "Found: Barbara Becker-Cantarino@en\n",
+ "Found: Barbara Bodine@en\n",
+ "Found: Barbara Boxer@cs\n",
+ "Found: Barbara Brezigar@cs\n",
+ "Found: Barbara Robertson@en\n",
+ "Found: Barbara Walters@de\n",
+ "Found: Barbra Streisand@ca\n",
+ "Found: Barry Alvarez@en\n",
+ "Found: Barry Bonds@da\n",
+ "Found: Barry Collier@en\n",
+ "Found: Barry Diller@de\n",
+ "Found: Barry Forde@ca\n",
+ "Found: Barry Hinson@en\n",
+ "Found: Barry Switzer@de\n",
+ "Found: Barry Williamson@en\n",
+ "Found: Barry Zito@de\n",
+ "Found: Bart Freundlich@de\n",
+ "Found: Bart Hendricks@en\n",
+ "Found: Bartosz Kizierowski@de\n",
+ "Found: Barzan Al-Tikriti@fr\n",
+ "Found: Basdeo Panday@de\n",
+ "Found: Baz Luhrmann@ca\n",
+ "Found: Beatriz Merino Lucero@pl\n",
+ "Found: Bela Karolyi@ms\n",
+ "Found: Ben Affleck@ca\n",
+ "Found: Torben Betts@en\n",
+ "Found: Ben Braun@en\n",
+ "Found: Ben Broussard@en\n",
+ "Found: Ben Cahoon@en\n",
+ "Found: Reuben Davis@en\n",
+ "Found: Ben Kingsley@ca\n",
+ "Found: Ben Lee Tyler@en\n",
+ "Found: Ben Steinbauer@en\n",
+ "Found: Benazir Bhutto@ca\n",
+ "Found: Benedita da Silva@en\n",
+ "Found: Benicio Del Toro@fi\n",
+ "Found: Benito Santiago@en\n",
+ "Found: Benjamin Bratt@cs\n",
+ "Found: Benjamin Franklin Bailey@en\n",
+ "Found: Benjamin McKenzie@ca\n",
+ "Found: Benjamin Netanyahu@da\n",
+ "Found: Bernadette Peters@ca\n",
+ "Found: Bernard Ebbers@de\n",
+ "Found: Bernard Giraudeau@de\n",
+ "Found: Bernard Kerik@en\n",
+ "Found: Bernard Landry@de\n",
+ "Found: Bernard Law@fr\n",
+ "Found: Bernard Lord@en\n",
+ "Found: Bernardo Segura@de\n",
+ "Found: Bertie Ahern@ca\n",
+ "Found: Bertrand Bonello@de\n",
+ "Found: A. Elizabeth Jones@en\n",
+ "Found: Bettina Rheims@cs\n",
+ "Found: Betty Williams@en\n",
+ "Found: Bianca Jagger@da\n",
+ "Found: Bijan Namdar Zangeneh@de\n",
+ "Found: Bill Belichick@da\n",
+ "Found: Bill Butler@ca\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found: Bill Callahan@en\n",
+ "Found: Bill Cartwright@en\n",
+ "Found: Bill Clancy@en\n",
+ "Found: Bill Clinton@ca\n",
+ "Found: Bill Curry@en\n",
+ "Found: Bill Doba@en\n",
+ "Found: Bill Elliott@pt\n",
+ "Found: Bill Fennelly@en\n",
+ "Found: Bill Frist@de\n",
+ "Found: Bill Gates@ca\n",
+ "Found: Bill Grahame@en\n",
+ "Found: Bill Guerin@de\n",
+ "Found: Bill Herrion@en\n",
+ "Found: Bill Hughes@en\n",
+ "Found: Bill Kollar@en\n",
+ "Found: Bill Kong@es\n",
+ "Found: Bill Mauldin@de\n",
+ "Found: Bill McBride@en\n",
+ "Found: Bill Nelson@da\n",
+ "Found: Bill Parcells@de\n",
+ "Found: Bill Parsons@en\n",
+ "Found: Bill Paxton@ca\n",
+ "Found: Bill Self@de\n",
+ "Found: Bill Sizemore@en\n",
+ "Found: Bill Stapleton@en\n",
+ "Found: Bill Steinke@en\n",
+ "Found: Bill Walton@de\n",
+ "Found: Billy Andrade@da\n",
+ "Found: Billy Beane@de\n",
+ "Found: Billy Bob Thornton@ca\n",
+ "Found: Billy Boyd@en\n",
+ "Found: Billy Crawford@de\n",
+ "Found: Billy Crystal@ca\n",
+ "Found: Billy Donovan@en\n",
+ "Found: Billy Gilman@en\n",
+ "Found: Billy Joel@ca\n",
+ "Found: Bing Crosby@ca\n",
+ "Found: Binyamin Ben-Eliezer@en\n",
+ "Found: Bison Dele@de\n",
+ "Found: Bixente Lizarazu@ca\n",
+ "Found: Blas Ople@de\n",
+ "Found: Blythe Danner@ca\n",
+ "Found: Blythe Hartley@de\n",
+ "Found: Bo Pelini@en\n",
+ "Found: Bo Ryan@en\n",
+ "Found: Bob Alper@en\n",
+ "Found: Bob Beauprez@de\n",
+ "Found: Bob Bowlsby@en\n",
+ "Found: Bob Dole@ca\n",
+ "Found: Bob Ferguson@da\n",
+ "Found: Bob Geldof@ca\n",
+ "Found: Bob Graham@en\n",
+ "Found: Bob Guccione@cs\n",
+ "Found: Bob Hayes@cs\n",
+ "Found: Bob Holden@de\n",
+ "Found: Bob Hope@ca\n",
+ "Found: Bob Huggins@en\n",
+ "Found: Bob Iger@en\n",
+ "Found: Bob Krueger@en\n",
+ "Found: Bob Menendez@da\n",
+ "Found: Bob Newhart@de\n",
+ "Found: Bob Stoops@en\n",
+ "Found: Bob Taft@de\n",
+ "Found: Bobby Bowden@de\n",
+ "Found: Bobby Kielty@en\n",
+ "Found: Bobby Robson@ca\n",
+ "Found: Bode Miller@ca\n",
+ "Found: Bonnie Fuller@en\n",
+ "Found: Bonnie Hunt@ca\n",
+ "Found: Nella Maria Bonora@de\n",
+ "Found: Boris Berezovsky@en\n",
+ "Found: Boris Henry@cs\n",
+ "Found: Boris Jordan@en\n",
+ "Found: Boris Trajkovski@ca\n",
+ "Found: Boris Yeltsin@en\n",
+ "Found: Brad Banks@en\n",
+ "Found: Brad Brownell@en\n",
+ "Found: Brad Garrett@da\n",
+ "Found: Brad Gushue@de\n",
+ "Found: Brad Miller@en\n",
+ "Found: Brad Pitt@ca\n",
+ "Found: Brad Wilk@cs\n",
+ "Found: Brajesh Mishra@en\n",
+ "Found: Brandon Boyd@da\n",
+ "Found: Brandon Hammond@en\n",
+ "Found: Brandon Inge@de\n",
+ "Found: Brandon Jones@en\n",
+ "Found: Brandon Knight@de\n",
+ "Found: Brandon Larson@en\n",
+ "Found: Brandon Lloyd@en\n",
+ "Found: Brandon Webb@pl\n",
+ "Found: Branko Crvenkovski@ca\n",
+ "Found: Brendan Fraser@ca\n",
+ "Found: Brendan Gaughan@en\n",
+ "Found: Brendan Hansen@en\n",
+ "Found: H. Brent Coles@en\n",
+ "Found: Brett Hawke@en\n",
+ "Found: Brett Hull@cs\n",
+ "Found: Brian Billick@de\n",
+ "Found: Brian Campbell Vickery@de\n",
+ "Found: Brian Cashman@en\n",
+ "Found: Brian Clemens@de\n",
+ "Found: Brian Cook@en\n",
+ "Found: Brian Cowen@ca\n",
+ "Found: Brian De Palma@ca\n",
+ "Found: Brian Gregory@en\n",
+ "Found: Brian Griese@en\n",
+ "Found: Brian Heidik@en\n",
+ "Found: Brian Henson@en\n",
+ "Found: Brian Kerr@de\n",
+ "Found: Brian Lara@de\n",
+ "Found: Brian Mulroney@ca\n",
+ "Found: Brian Olson@en\n",
+ "Found: Brian Scalabrine@ca\n",
+ "Found: Brian Schneider@en\n"
+ ]
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-141-5351e70c6afa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m# first, grep all rows of the original TSV file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mlfw_name_clean\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mmsceleb_row\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_msceleb_top1m\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitertuples\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlfw_name_clean\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmsceleb_row\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname_lang\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Found: {msceleb_row.name_lang}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "# Compare each LFW name to the master identity list and report the result.\n",
+ "# Nothing is stored here: the cell only prints a match / no-match line per name.\n",
+ "for lfw_item in tqdm(lfw_meta, desc='1st loop'):\n",
+ "\n",
+ "    # for each LFW name, look for match\n",
+ "    lfw_name = lfw_item['name']\n",
+ "    matched_id = None\n",
+ "\n",
+ "    for id_kg, identity in identities_tmp.items():\n",
+ "        # for each msceleb identity, look for match\n",
+ "        for lang, name in identity['names'].items():\n",
+ "            # for each name's language variation, look for match\n",
+ "            if not name:\n",
+ "                # empty (or missing) name variant: nothing to compare against\n",
+ "                print('no name')\n",
+ "                continue\n",
+ "            if identity_utils.names_match_strict(lfw_name, name):\n",
+ "                matched_id = id_kg\n",
+ "                matched_lang = lang\n",
+ "                matched_name = name\n",
+ "                break\n",
+ "        if matched_id is not None:\n",
+ "            # matched_lang / matched_name were captured at the hit above\n",
+ "            print(f'OK. Found match: {lfw_name} == {matched_name} in lang: {matched_lang}')\n",
+ "            break\n",
+ "\n",
+ "    if matched_id is None:\n",
+ "        print(f'ERROR: could not find {lfw_name}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "['id_kg',\n",
- " 'ms_name_ Marcus Mojigoh',\n",
- " 'ms_name_ Nyallau Anak Badak',\n",
- " 'ms_name_ Teng Boon Soon',\n",
- " 'ms_name_ Tiki Anak Lafe',\n",
- " 'ms_name_ Yong Khoon Seng',\n",
- " 'ms_name_Bousou P',\n",
- " 'ms_name_N',\n",
- " 'ms_name_af',\n",
- " 'ms_name_am',\n",
- " 'ms_name_ar',\n",
- " 'ms_name_az',\n",
- " 'ms_name_be',\n",
- " 'ms_name_bg',\n",
- " 'ms_name_bm',\n",
- " 'ms_name_bn',\n",
- " 'ms_name_bo',\n",
- " 'ms_name_br',\n",
- " 'ms_name_bs',\n",
- " 'ms_name_ca',\n",
- " 'ms_name_ceb',\n",
- " 'ms_name_ck',\n",
- " 'ms_name_co',\n",
- " 'ms_name_cr',\n",
- " 'ms_name_cs',\n",
- " 'ms_name_cy',\n",
- " 'ms_name_da',\n",
- " 'ms_name_de',\n",
- " 'ms_name_destiny',\n",
- " 'ms_name_dz',\n",
- " 'ms_name_el',\n",
- " 'ms_name_en',\n",
- " 'ms_name_en-GB',\n",
- " 'ms_name_en-US',\n",
- " 'ms_name_eo',\n",
- " 'ms_name_es',\n",
- " 'ms_name_es-419',\n",
- " 'ms_name_et',\n",
- " 'ms_name_eu',\n",
- " 'ms_name_evleaks',\n",
- " 'ms_name_fa',\n",
- " 'ms_name_fi',\n",
- " 'ms_name_fil',\n",
- " 'ms_name_fo',\n",
- " 'ms_name_fr',\n",
- " 'ms_name_fr-CA',\n",
- " 'ms_name_fy',\n",
- " 'ms_name_ga',\n",
- " 'ms_name_gd',\n",
- " 'ms_name_gl',\n",
- " 'ms_name_gn',\n",
- " 'ms_name_gu',\n",
- " 'ms_name_ha',\n",
- " 'ms_name_hi',\n",
- " 'ms_name_hr',\n",
- " 'ms_name_ht',\n",
- " 'ms_name_hu',\n",
- " 'ms_name_hu\\r\\nm.03zytg\\tΑστέριος\"',\n",
- " 'ms_name_hy',\n",
- " 'ms_name_id',\n",
- " 'ms_name_ig',\n",
- " 'ms_name_is',\n",
- " 'ms_name_it',\n",
- " 'ms_name_iw',\n",
- " 'ms_name_ja',\n",
- " 'ms_name_ka',\n",
- " 'ms_name_kk',\n",
- " 'ms_name_kl',\n",
- " 'ms_name_km',\n",
- " 'ms_name_kn',\n",
- " 'ms_name_ko',\n",
- " 'ms_name_ku',\n",
- " 'ms_name_ky',\n",
- " 'ms_name_la',\n",
- " 'ms_name_lb',\n",
- " 'ms_name_lo',\n",
- " 'ms_name_lt',\n",
- " 'ms_name_lv',\n",
- " 'ms_name_mg',\n",
- " 'ms_name_mi',\n",
- " 'ms_name_mk',\n",
- " 'ms_name_ml',\n",
- " 'ms_name_mn',\n",
- " 'ms_name_mr',\n",
- " 'ms_name_ms',\n",
- " 'ms_name_mt',\n",
- " 'ms_name_my',\n",
- " 'ms_name_ne',\n",
- " 'ms_name_nl',\n",
- " 'ms_name_nn',\n",
- " 'ms_name_no',\n",
- " 'ms_name_nv',\n",
- " 'ms_name_ny',\n",
- " 'ms_name_oc',\n",
- " 'ms_name_or',\n",
- " 'ms_name_pa',\n",
- " 'ms_name_pl',\n",
- " 'ms_name_ps',\n",
- " 'ms_name_pt',\n",
- " 'ms_name_pt-PT',\n",
- " 'ms_name_ro',\n",
- " 'ms_name_ru',\n",
- " 'ms_name_rw',\n",
- " 'ms_name_sa',\n",
- " 'ms_name_sc',\n",
- " 'ms_name_se',\n",
- " 'ms_name_si',\n",
- " 'ms_name_sk',\n",
- " 'ms_name_sl',\n",
- " 'ms_name_sn',\n",
- " 'ms_name_so',\n",
- " 'ms_name_sq',\n",
- " 'ms_name_sr',\n",
- " 'ms_name_st',\n",
- " 'ms_name_su',\n",
- " 'ms_name_sv',\n",
- " 'ms_name_sw',\n",
- " 'ms_name_ta',\n",
- " 'ms_name_te',\n",
- " 'ms_name_tg',\n",
- " 'ms_name_th',\n",
- " 'ms_name_tr',\n",
- " 'ms_name_ug',\n",
- " 'ms_name_uk',\n",
- " 'ms_name_ur',\n",
- " 'ms_name_uz',\n",
- " 'ms_name_vi',\n",
- " 'ms_name_xh',\n",
- " 'ms_name_yi',\n",
- " 'ms_name_yo',\n",
- " 'ms_name_zh',\n",
- " 'ms_name_zh-HK',\n",
- " 'ms_name_zh-Hant',\n",
- " 'ms_name_zorbla.de',\n",
- " 'ms_name_zu']"
+ "True"
]
},
- "execution_count": 95,
+ "execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "list(df_identities_master.keys())"
+ "identity_utils.names_match_strict('AJ Cook', 'A.J. Cook')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "names_match('A.J. Cook', 'cook Aj', as_float=True, compound_score=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## PubFig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add pubfig data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Face Scrub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add facescrub"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## UMD Faces"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add umd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## CASIA Webface"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add CASIA Webface"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## IMDB-Wiki"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add imdb-wiki"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## IMDB-Face"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add imdb face"
]
},
{
@@ -625,7 +1516,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.7.1"
}
},
"nbformat": 4,