diff options
| author | Adam Harvey <adam@ahprojects.com> | 2019-02-19 22:50:01 +0100 |
|---|---|---|
| committer | Adam Harvey <adam@ahprojects.com> | 2019-02-19 22:50:01 +0100 |
| commit | fe0dee2f8c8a7127d1ac2f01c5989f5011a2ee8a (patch) | |
| tree | f2cdd2405a11f30c14e73fea67775a280a854743 /megapixels | |
| parent | b28f65ad5016ba3c3c9f973bd2a64ea3c8a3f84c (diff) | |
...identity
Diffstat (limited to 'megapixels')
| -rw-r--r-- | megapixels/app/utils/api_utils.py | 2 | ||||
| -rw-r--r-- | megapixels/app/utils/identity_utils.py | 22 | ||||
| -rw-r--r-- | megapixels/notebooks/datasets/identity/identity_master.ipynb | 1561 |
3 files changed, 1248 insertions, 337 deletions
diff --git a/megapixels/app/utils/api_utils.py b/megapixels/app/utils/api_utils.py index d9d67425..a4dad501 100644 --- a/megapixels/app/utils/api_utils.py +++ b/megapixels/app/utils/api_utils.py @@ -3,7 +3,7 @@ import urllib import urllib.request from app.settings import app_cfg -from app.utils import file_utils, im_utils, logger_utils +from app.utils import logger_utils class WikipediaAPI: diff --git a/megapixels/app/utils/identity_utils.py b/megapixels/app/utils/identity_utils.py index f9ed009e..775652dc 100644 --- a/megapixels/app/utils/identity_utils.py +++ b/megapixels/app/utils/identity_utils.py @@ -10,6 +10,25 @@ from app.utils import logger_utils log = logger_utils.Logger.getLogger() +az = 'abcdefghijklmlopqrstuvwzxyz' +AZ = az.upper() +z9 = list(map(str, list(range(0,10)))) +aZ9 = list(az) + list(AZ) + z9 + +def letter_strip(a, b=aZ9): + # strip every letter from a that is not in b + return ''.join([x for x in a if x in b]) + +def letter_match(a, b): + # check if every letter (a-zA-Z0-9) exists in both + return sum([x in b for x in a]) == len(a) + +def names_match_strict(a, b): + clean_a = letter_strip(a) + clean_b = letter_strip(b) + return len(clean_a) == len(clean_b) and letter_match(clean_a, clean_b) and letter_match(clean_b, clean_a) + + ''' class Dataset(Enum): LFW, VGG_FACE, VGG_FACE2, MSCELEB, UCCS, UMD_FACES, SCUT_FBP, UCF_SELFIE, UTK, \ @@ -83,6 +102,7 @@ def get_names(opt_dataset, opt_data_store=types.DataStore.HDD): result = {'names_orig': names_orig, 'names_query': names_query} return result + def similarity(a, b): return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio() @@ -111,7 +131,7 @@ def names_match(name_a, name_b, threshold=0.9, as_float=False, compound_score=Fa scores.append(subscores) # return result - ratio_similar = sum(max(x) for x in scores) / len_min + ratio_similar = sum(max(x) for x in scores) / len(scores) if compound_score: # combine with any missing letters/words diff --git a/megapixels/notebooks/datasets/identity/identity_master.ipynb b/megapixels/notebooks/datasets/identity/identity_master.ipynb index a48a7ba1..e932a947 100644 --- a/megapixels/notebooks/datasets/identity/identity_master.ipynb +++ b/megapixels/notebooks/datasets/identity/identity_master.ipynb @@ -6,13 +6,30 @@ "source": [ "# Identity Master List\n", "\n", - "- start with MS Celeb Top1M\n", - "- then progressively add smaller datasets" + "- [x] MS Celeb 1M\n", + "- UMD Faces\n", + "- FaceScrub\n", + "- LFW\n", + "- PubFig\n", + "- PubFig83\n", + "- VGG Face\n", + "- VGG Face2\n", + "- IJB-C\n", + "- CASIA Webface\n", + "- IMDB-Face\n", + "- IMDB-Wiki" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ @@ -48,15 +65,6 @@ ] }, { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master.csv'" - ] - }, - { "cell_type": "markdown", "metadata": {}, "source": [ @@ -68,65 +76,185 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master_02.csv'\n", + "dir_msceleb_dloads = '/data_store_hdd/datasets/people/msceleb/downloads/'\n", + "fp_msceleb_clean_txt = join(dir_msceleb_dloads,'MS-Celeb-1M_clean_list.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 158, "metadata": {}, "outputs": [], "source": [ "fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'\n", - "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name'])\n", + "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name_lang'])\n", "df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')\n", - "n_groups = df_msceleb_top1m_groups.ngroups\n", - "print(f'{n_groups} groups')\n", - "df_msceleb_top1m.head(2)" + "n_groups = df_msceleb_top1m_groups.ngroups" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 200, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fbc706a8b9f34d958e478cdf584bf853", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')" + "# create alphabetically sorted dict\n", + "msceleb_top1m_az = {}\n", + "a2z = 'abcdefghijklmnopqrstuvwxyz'\n", + "for c in a2z:\n", + " msceleb_top1m_az[c] = []\n", + "for msceleb_row in tqdm(df_msceleb_top1m.itertuples(), total=len(df_msceleb_top1m)):\n", + " name = msceleb_row.name_lang\n", + " try:\n", + " msceleb_top1m_az[name[0].lower()].append({'name': name, 'id_kg': msceleb_row.id_kg})\n", + " except Exception as e:\n", + " pass" ] }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 159, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id_kg</th>\n", + " <th>name_lang</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>m.01008l47</td>\n", + " <td>Patrick Cummins@en</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>m.01008l47</td>\n", + " <td>Patrick Cummins@pt</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>m.01008l96</td>\n", + " <td>Mohamed Guessous@en</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>m.01008l96</td>\n", + " <td>Mohamed Guessous@fr</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>m.01008l96</td>\n", + " <td>محمد جسوس@ar</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id_kg name_lang\n", + "0 m.01008l47 Patrick Cummins@en\n", + "1 m.01008l47 Patrick Cummins@pt\n", + "2 m.01008l96 Mohamed Guessous@en\n", + "3 m.01008l96 Mohamed Guessous@fr\n", + "4 m.01008l96 محمد جسوس@ar" + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#df_msceleb_top1m.head(100)" + "df_msceleb_top1m.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 160, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 3,481,186 total name variations\n", + "There are 1,000,000 unique identities\n" + ] + } + ], + "source": [ + "print(f'There are {len(df_msceleb_top1m):,} total name variations')\n", + "print(f'There are {n_groups:,} unique identities')" + ] + }, + { + "cell_type": "code", + "execution_count": 161, "metadata": {}, "outputs": [], "source": [ - "abbrev_mappings = {\n", - " 'en-US': 'en',\n", - " 'en-GB': 'en',\n", - " 'es-419': 'es-419',\n", - " 'es'\n", - "}" + "# convert DataFrame to dict\n", + "mseleb_top1m_records = df_msceleb_top1m.to_dict('records')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ + "# store all identity info here, until creating dataframe\n", "msceleb_identities = {}" ] }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ + "# utility functions\n", "def split_name_lang(name_lang):\n", " '''Split name into name and language'''\n", " if '@' in name_lang:\n", @@ -137,141 +265,142 @@ " else:\n", " name = name_lang\n", " lang = ''\n", - " return {'name': name, 'lang': lang}" + " return {'name': name, 'lang': lang}\n", + "\n", + "# temp save DataFrame to CSV\n", + "def save_identity_master(identities, fp_out=fp_master_identities):\n", + " df_identities_master = pd.DataFrame.from_dict(identities)\n", + " df_identities_master.index.name = 'id'\n", + " df_identities_master.to_csv(fp_master_identities)" ] }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 164, "metadata": {}, "outputs": [ { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "884edc099a404dfcb53e353d2abf6819", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "{'name': 'r@destiny', 'lang': 'en-417'}" + "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))" ] }, - "execution_count": 122, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "split_name_lang('r@destiny@en')" + "# convert to \"name@lang\" to dict format\n", + "msceleb_identities = {}\n", + "for mseleb_top1m_record in tqdm(mseleb_top1m_records):\n", + " id_kg = mseleb_top1m_record['id_kg'].replace('m.','/m/')\n", + " if not id_kg in msceleb_identities.keys():\n", + " msceleb_identities[id_kg] = {'names': {}}\n", + " name_lang = split_name_lang(mseleb_top1m_record['name_lang'])\n", + " name = name_lang['name']\n", + " lang = name_lang['lang']\n", + " if lang == 'en':\n", + " msceleb_identities[id_kg]['names']['canonical'] = name\n", + " msceleb_identities[id_kg]['names'][lang] = name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Patch @en names" ] }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 165, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0120e006a7564f5c82729a7050ef0386", + "model_id": "1cd2915f485b4cd299a929e1fb2d5926", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))" + "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "no english name for /m/017vbn\n", + "no english name for /m/026q0k_\n", + "no english name for /m/02k2kw\n", + "no english name for /m/0bwhrg1\n" + ] } ], "source": [ - "msceleb_identities = {}\n", - "for mseleb_top1m_record in tqdm(mseleb_top1m_records):\n", - " id_kg = mseleb_top1m_record['id_kg']\n", - " if not id_kg in msceleb_identities.keys():\n", - " msceleb_identities[id_kg] = {}\n", - " name_lang = split_name_lang(mseleb_top1m_record['name'])\n", - " name = name_lang['name']\n", - " lang = name_lang['lang']\n", - " msceleb_identities[id_kg][lang] = name" + "# check for missing english names\n", + "for id_kg, attrs in tqdm(msceleb_identities.items()):\n", + " lang_attrs = attrs['names']\n", + " name_en = lang_attrs.get('en', None)\n", + " if not name_en:\n", + " print(f'no english name for {id_kg}')" ] }, { "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [], - "source": [ - "import itertools\n", - "msceleb_identities_sm = dict(itertools.islice(msceleb_identities.items(), 0, 10))" - ] - }, - { - "cell_type": "code", - "execution_count": 145, + "execution_count": 166, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Patrick Cummins en\n", - "Patrick Cummins pt\n", - "Mohamed Guessous en\n", - "Mohamed Guessous fr\n", - "محمد جسوس ar\n", - "Tsvetta Kaleynska en\n", - "Tsvetta Kaleynska es\n", - "Tsvetta Kaleynska fr\n", - "Цвета Калейнска bg\n", - "Цвета Калейнска ru\n", - "Caio Henrique Siqueira Sanchez en\n", - "Кајо Санчез sr\n", - "Julio Ríos Gallego ca\n", - "Julio Ríos Gallego en\n", - "Julio Ríos Gallego es\n", - "Nilson Ricardo da Silva Júnior en\n", - "ニルソン・リカルド・ダ・シルバ・ジュニオール ja\n", - "니우송 히카르두 다 시우바 주니오르 ko\n", - "Aleksej Aleksandrovič Starobinski sl\n", - "Alexei Alexandrowitsch Starobinski de\n", - "Alexei Starobinski pt\n", - "Alexei Starobinsky en\n", - "Alexeï Starobinski fr\n", - "Алексей Александрович Старобинский ru\n", - "Старобінський Олексій Олександрович uk\n", - "アレクセイ・スタロビンスキー ja\n", - "Hilda Rix Nicholas en\n", - "هیلدا ریکس نیکولاس fa\n", - "Behrouz Makvandi en\n", - "Бехруз Макванди ru\n", - "بهروز مکوندی fa\n", - "Borislav Terzić en\n", - "Борислав Терзић sr\n" + "patched /m/017vbn de to en\n", + "patched /m/026q0k_ nl to en\n", + "patched /m/02k2kw de to en\n", + "patched /m/0bwhrg1 it to en\n" ] } ], "source": [ - "# de-duplicate names that use same spelling for multiple languages\n", - "for id_kg, name_langs in msceleb_identities_sm.items():\n", - " if 'en' in name_langs.keys():\n", - " name_en = name_langs['en']\n", - " for lang, name in name_langs.items():\n", - " print(name, lang)" + "# patch en name exception: 4 names missing english\n", + "en_exceptions = {\n", + " '/m/017vbn': 'de',\n", + " '/m/026q0k_': 'nl',\n", + " '/m/02k2kw': 'de',\n", + " '/m/0bwhrg1': 'it'\n", + "}\n", + "for id_kg, lang in en_exceptions.items():\n", + " msceleb_identities[id_kg]['names']['en'] = msceleb_identities[id_kg]['names'][lang]\n", + " msceleb_identities[id_kg]['names']['canonical'] = msceleb_identities[id_kg]['names']['en']\n", + " print(f'patched {id_kg} {lang} to en')" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "### Remove duplicate names" + ] }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 167, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "374a55f504084f14bd4d77fed0e2f4e4", + "model_id": "33ffa229c16d4a9088087c21210d421e", "version_major": 2, "version_minor": 0 }, @@ -286,62 +415,85 @@ "name": "stdout", "output_type": "stream", "text": [ - "n2 split is long: zh-Hant\n", - "n2 split is long: es-419\n", - "n2 split is long: fil\n", - "n2 split is long: en-GB\n", - "n2 split is long: en-US\n", - "n2 split is long: zh-HK\n", - "n2 split is long: fr-CA\n", - "n2 split is long: pt-PT\n", - "n2 split is long: ceb\n", - "n2 split is long: zorbla.de\n", - "n2 split is long: N\n", - "n2 split is long: hu\n", - "m.03zytg\tΑστέριος\"\n", - "n2 split is long: destiny\n", - "n2 split is long: Teng Boon Soon\n", - "n2 split is long: Yong Khoon Seng\n", - "n2 split is long: Tiki Anak Lafe\n", - "n2 split is long: Marcus Mojigoh\n", - "n2 split is long: Nyallau Anak Badak\n", - "n2 split is long: Bousou P\n", - "n2 split is long: evleaks\n" + "removed 1,485,336 duplicate names\n" ] } ], "source": [ - "messages = []\n", + "# de-duplicate names that use same spelling for multiple languages\n", + "items_removed = []\n", + "msceleb_identities_copy = msceleb_identities.copy()\n", + "\n", + "for id_kg, attrs in tqdm(msceleb_identities_copy.items()):\n", + " lang_attrs = attrs['names']\n", + " name_main = lang_attrs.get('canonical', None)\n", + " if not name_en:\n", + " print('error. all names need \"en\"')\n", + " break\n", + " lang_attrs_copy = attrs['names'].copy()\n", + " for lang, name in lang_attrs_copy.items():\n", + " if name == name_main and lang != 'en' and lang != 'canonical':\n", + " # remove it\n", + " items_removed.append(msceleb_identities[id_kg]['names'].pop(lang))\n", + " del lang_attrs_copy\n", "\n", - "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n", + "del msceleb_identities_copy\n", + "print(f'removed {len(items_removed):,} duplicate names')\n", + "del items_removed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Count images per person for ms celeb" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6e3a3f659fa6414b80d678d5b991ed0a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5049824), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# calculate total images per id\n", + "msceleb_files = {}\n", + "# load text file\n", + "with open(fp_msceleb_clean_txt,'r') as fp:\n", + " msceleb_lines = fp.readlines()\n", + " \n", + "# iterate lines and append all files\n", + "for filepath in tqdm(msceleb_lines):\n", + " id_kg, fname = filepath.split('/')\n", " id_kg = id_kg.replace('m.', '/m/')\n", - " for df_row in msceleb_group.itertuples():\n", - " if '@' in df_row.name:\n", - " splits = df_row.name.split('@')\n", - " if not len(splits) > 1:\n", - " msg = f'only one split: {df_row.name}'\n", - " if not msg in messages:\n", - " print(msg)\n", - " messages.append(msg)\n", - " elif len(splits) > 1:\n", - " if len(splits[1]) != 2:\n", - " msg = f'n2 split is long: {splits[1]}'\n", - " if not msg in messages:\n", - " print(msg)\n", - " messages.append(msg)\n", - " else:\n", - " print(df_row.name)" + " if not id_kg in msceleb_files.keys():\n", + " msceleb_files[id_kg] = []\n", + " msceleb_files[id_kg].append(fname)" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "475871ac6d08484cbec44d5ccf099bd8", + "model_id": "bd0530f0e4634a8dbae0308964cd6e2b", "version_major": 2, "version_minor": 0 }, @@ -354,251 +506,990 @@ } ], "source": [ - "# iterate groups and flatten language variations into named columns\n", - "identities = []\n", - "for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):\n", - " id_kg = id_kg.replace('m.', '/m/')\n", - " for df_row in msceleb_group.itertuples():\n", - " if '@' in df_row.name:\n", - " splits = df_row.name.split('@')\n", - " name = splits[0]\n", - " lang = splits[1] if len(splits) > 0 else 'en'\n", - " else:\n", - " # default to 'en'\n", - " lang = 'en'\n", - " name = df_row.name\n", - " col_name = f'ms_name_{lang}'\n", - " identities.append({'id_kg': id_kg, col_name: name})" + "# add count to \n", + "for id_kg, attrs in tqdm(msceleb_identities.items()):\n", + " if id_kg in msceleb_files.keys():\n", + " count = len(msceleb_files[id_kg])\n", + " else:\n", + " count = 0\n", + " msceleb_identities[id_kg]['count_msceleb'] = count" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "im_counts_idxs = [attrs['count_msceleb'] for id_kg, attrs in msceleb_identities.items()]\n", + "im_counts_id_kg = [id_kg for id_kg, _ in msceleb_identities.items()]" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 173, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[{'id_kg': 'm/01008l47', 'ms_name_en': 'Patrick Cummins'}, {'id_kg': 'm/01008l47', 'ms_name_pt': 'Patrick Cummins'}]\n" + "Most images 130 for Leelee Sobieski\n", + "88,244 more than 10\n", + "78,027 more than 20\n", + "49,042 more than 50\n", + "5,025 more than 100\n" ] } ], "source": [ - "print(identities[0:10])" + "# print stats\n", + "idx_max = np.argmax(im_counts_idxs)\n", + "id_kg_max = im_counts_id_kg[idx_max]\n", + "count_max = im_counts_idxs[idx_max]\n", + "name_max = msceleb_identities[id_kg_max]['names']['canonical']\n", + "print(f'Most images {count_max:,} for {name_max}')\n", + "# distribution\n", + "im_counts_idxs = np.array(im_counts_idxs)\n", + "print(f'{len(im_counts_idxs[im_counts_idxs > 10]):,} more than 10')\n", + "print(f'{len(im_counts_idxs[im_counts_idxs > 20]):,} more than 20')\n", + "print(f'{len(im_counts_idxs[im_counts_idxs > 50]):,} more than 50')\n", + "print(f'{len(im_counts_idxs[im_counts_idxs > 100]):,} more than 100')" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "420bc435f447454faa2dba73d7dff982", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# awkward conversion of msceleb_identities to a list of dicts\n", + "identities_flat = []\n", + "for id_kg, attrs in tqdm(msceleb_identities.items()):\n", + " obj = {'id_kg': id_kg}\n", + " for lang, name in attrs['names'].items():\n", + " if lang != 'canonical':\n", + " col_name = f'name_msceleb_{lang}'\n", + " elif lang == 'canonical':\n", + " col_name = 'name_msceleb'\n", + " obj[col_name] = name\n", + " obj['count_msceleb'] = attrs['count_msceleb']\n", + " identities_flat.append(obj)" + ] + }, + { + "cell_type": "code", + "execution_count": 175, "metadata": {}, "outputs": [], "source": [ - "# temp save DataFrame to CSV\n", - "def save_identity_master(identities, fp_out=fp_master_identities):\n", - " df_identities_master = pd.DataFrame.from_dict(identities)\n", - " df_identities_master.index.name = 'id'\n", - " df_identities_master.to_csv(fp_master_identities)" + "# convert to dataframe\n", + "df_identities = pd.DataFrame.from_dict(identities_flat)" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [], + "source": [ + "# save checkpoint CSV\n", + "save_identity_master(identities_flat) # encoding='utf-16' ??" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [], + "source": [ + "# copy to master and delete ref to msceleb\n", + "identities = msceleb_identities.copy()\n", + "del msceleb_identities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Add image count data for MS Celeb" + "## LFW" ] }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 193, "metadata": {}, "outputs": [], "source": [ - "# load lines\n", - "fp_msceleb_clean = '/data_store_hdd/datasets/people/msceleb/downloads/MS-Celeb-1M_clean_list.txt'\n", - "with open(fp_msceleb_clean,'r') as fp:\n", - " msceleb_lines = fp.readlines()\n", - "msceleb_files = {}\n", + "# add LFW data\n", + "fp_lfw = '/data_store_hdd/datasets/people/lfw/downloads/lfw-names.txt'\n", + "with open(fp_lfw,'r') as fp:\n", + " lfw_lines = fp.readlines()\n", + "lfw_lines = [x.strip() for x in lfw_lines]\n", "\n", - "# iterate lines and append all files\n", - "for filepath in msceleb_lines:\n", - " id_kg, fname = filepath.split('/')\n", - " id_kg = id_kg.replace('m.', '/m/')\n", - " if not id_kg in msceleb_files.keys():\n", - " msceleb_files[id_kg] = []\n", - " msceleb_files[id_kg].append(fname)\n", - "\n", - " # add count\n", - "for identity in identities:\n", - " id_kg = identity['id_kg']\n", - " if id_kg in msceleb_files.keys():\n", - " identity['msceleb_count'] = len(msceleb_files[identity['id_kg']])\n", - " else:\n", - " identity['msceleb_count'] = 0" + "lfw_meta = []\n", + "for lfw_line in lfw_lines:\n", + " name_orig, count = lfw_line.split('\\t')\n", + " name_clean = name_orig.replace('_',' ')\n", + " obj = {'name_orig': name_orig, 'name': name_clean, 'count':count}\n", + " lfw_meta.append(obj)" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 179, "metadata": {}, "outputs": [], "source": [ - "# save (takes 30 seconds)\n", - "save_identity_master(identities) # encoding='utf-16' ??" + "identities_tmp = identities.copy()" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make exact name matches\n", + "lfw_name_matches_tmp = {}\n", + "for lfw_item in tqdm(lfw_meta):\n", + " lfw_name = lfw_item['name'] # name is transformed original name\n", + " lfwnl = lfw_name.lower()\n", + " splits = lfw_name.split(' ')\n", + " matches_tmp = {}\n", + " for word in splits:\n", + " # for each word in names, check if exact word is in master name list\n", + " c = word[0].lower()\n", + " matches_tmp = []\n", + " for name_id_kg in msceleb_top1m_az[c]:\n", + " name = name_id_kg['name']\n", + " id_kg = name_id_kg['id_kg']\n", + " if lfwnl in name.lower():\n", + " lfw_name_matches_tmp[lfw_name] = id_kg\n", + " break\n", + "print(f'found {len(lfw_name_matches_exact)} of {len(lfw_meta)} names using exact matches')" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8c0f2dbf032145fea3ad5759a97abc44", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-212-13b8b31f417d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mlfwnl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mname_id_kg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmsceleb_top1m_az\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname_id_kg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mid_kg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname_id_kg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id_kg'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# make exact name matches\n", + "lfw_name_matches_exact = {}\n", + "for lfw_item in tqdm(lfw_meta):\n", + " lfw_name = lfw_item['name'] # name is transformed original name\n", + " # quickly check if it's in the alphabetized list\n", + " c = lfw_name[0].lower()\n", + " lfwnl = lfw_name.lower()\n", + " for name_id_kg in msceleb_top1m_az[c]:\n", + " name = name_id_kg['name']\n", + " id_kg = name_id_kg['id_kg']\n", + " if lfwnl in name.lower():\n", + " lfw_name_matches_exact[lfw_name] = id_kg\n", + " break\n", + "print(f'found {len(lfw_name_matches_exact)} of {len(lfw_meta)} names using exact matches')" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1949065d12b349ce8bbf28ebd09f1e29", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "matched AJ Cook to A. J. Cook in canonical. Add to matched ids\n", + "matched AJ Lamas to A.J. Lamas in canonical. Add to matched ids\n", + "could not find: Aaron Patterson\n", + "matched Aaron Pena to Aaron Peña in canonical. Add to matched ids\n", + "could not find: Abdel Aziz Al-Hakim\n", + "could not find: Abdel Madi Shabneh\n", + "could not find: Abdel Nasser Assidi\n", + "could not find: Abdul Majeed Shobokshi\n", + "matched Abdulaziz Kamilov to Abdulaziz Komilov in canonical. Add to matched ids\n", + "could not find: Abdullah Nasseef\n", + "could not find: Abdullah al-Attiyah\n", + "could not find: Abdullatif Sener\n", + "could not find: Abner Martinez\n", + "could not find: Aby Har-Even\n", + "could not find: Adam Kennedy\n", + "could not find: Adelina Avila\n", + "could not find: Adisai Bodharamik\n", + "could not find: Adolfo Aguilar Zinser\n", + "could not find: Adoor Gopalakarishnan\n", + "could not find: Adrian Annus\n", + "matched Adrian Fernandez to Adriana Fernández in canonical. Add to matched ids\n", + "could not find: Adrian Nastase\n", + "could not find: Adriana Perez Navarro\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-217-f9d734a428b9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlang\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0midentity\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# for each name's language variation, look for match\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mstrict_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0midentity_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnames_match_strict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlfw_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstrict_match\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mmatched_id_kg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mid_kg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36mnames_match_strict\u001b[0;34m(a, b)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mnames_match_strict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mclean_a\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mclean_b\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_a\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_b\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_a\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclean_b\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclean_b\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclean_a\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36mletter_strip\u001b[0;34m(a, b)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maZ9\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;31m# strip every letter from a that is not in b\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/work/megapixels_dev/megapixels/app/utils/identity_utils.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_strip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maZ9\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;31m# strip every letter from a that is not in b\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mletter_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# make strict name-letter matches\n", + "lfw_name_matches_strict = {}\n", + "for lfw_item in tqdm(lfw_meta):\n", + " lfw_name = lfw_item['name'] # name is transformed original name\n", + " if lfw_name in lfw_name_matches_exact.keys():\n", + " continue\n", + " \n", + " matched_id_kg = None\n", + " for id_kg, identity in identities_tmp.items():\n", + " # for each msceleb identity, look for match\n", + " for lang, name in identity['names'].items():\n", + " # for each name's language variation, look for match\n", + " strict_match = identity_utils.names_match_strict(lfw_name, name)\n", + " if strict_match:\n", + " matched_id_kg = id_kg\n", + " matched_lang = lang\n", + " matched_name = name\n", + " break\n", + " if matched_id_kg:\n", + " print(f'matched {lfw_name} to {matched_name} in {matched_lang}. Add to matched ids')\n", + " lfw_name_matches_strict[lfw_name] = matched_id_kg\n", + " break\n", + " if not matched_id_kg:\n", + " print(f'could not find: {lfw_name}')\n", + "print(f'found {len(lfw_name_matches_strict)} of {len(lfw_meta)} names using exact matches')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make fuzzy name matches\n" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d822a21cc63e4c5c9fe9bb637f5455dd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, description='1st loop', max=5749, style=ProgressStyle(description_width='i…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found: Aaron Eckhart@ca\n", + "Found: Aaron Guiel@en\n", + "Found: Aaron Peirsol@ca\n", + "Found: Aaron Sorkin@ca\n", + "Found: Aaron Tippin@de\n", + "Found: Abba Eban@cs\n", + "Found: Abbas Kiarostami@ca\n", + "Found: Abdoulaye Wade@ca\n", + "Found: Abdul Rahman Lestaluhu@id\n", + "Found: Abdullah Cabir@tr\n", + "Found: Abdullah Ahmad Badawi@da\n", + "Found: Abdullah Gulam Rasoul@en\n", + "Found: Abel Aguilar@cs\n", + "Found: Abel Pacheco de la Espriella@es\n", + "Found: Abid Hamid Mahmud al-Tikriti@nl\n", + "Found: Abraham Foxman@cs\n", + "Found: Adam Ant@cs\n", + "Found: Adam Freier@en\n", + "Found: Adam Herbert@en\n", + "Found: Adam Mair@de\n", + "Found: Adam Richards@en\n", + "Found: Adam Sandler@ca\n", + "Found: George Adam Scott@en\n", + "Found: Adel Al-Jubeir@fr\n", + "Found: Adolfo Rodriguez Saa@id\n", + "Found: Adrian McPherson@en\n", + "Found: Adrian Murrell@en\n", + "Found: Adriana Lima@ca\n", + "Found: Adrien Brody@ca\n", + "Found: Afton Smith@cs\n", + "Found: Agbani Darego@de\n", + "Found: Agnelo Queiroz@en\n", + "Found: Agnes Bruckner@de\n", + "Found: Ahmed Ahmedou@de\n", + "Found: Ahmed Chalabi@en\n", + "Found: Mahmood Ahmed Ghazi@en\n", + "Found: Ahmet Necdet Sezer@ca\n", + "Found: Ai Sugiyama@da\n", + "Found: Aidan Quinn@ca\n", + "Found: Aileen Riggin Soule@fr\n", + "Found: Aishwarya Rai Bachchan@en\n", + "Found: Ajit Agarkar@en\n", + "Found: Akbar Al Baker@en\n", + "Found: Akbar Hashemi Rafsanjani@da\n", + "Found: Akhmed Zakayev@en\n", + "Found: Akiko Morigami@da\n", + "Found: Al Cardenas@en\n", + "Found: Vidal Davis@en\n", + "Found: Al Gore III@en\n", + "Found: Al Leiter@de\n", + "Found: Al Pacino@ca\n", + "Found: Al Sharpton@de\n", + "Found: Alain Cervantes@en\n", + "Found: Alain Ducasse@de\n", + "Found: Alan Ball jr.@nl\n", + "Found: Alan Dershowitz@da\n", + "Found: Alan Greenspan@de\n", + "Found: Alan Mulally@de\n", + "Found: Alan Trammell@de\n", + "Found: Alan Zemaitis@en\n", + "Found: Alanis Morissette@ca\n", + "Found: Alanna Ubach@de\n", + "Found: Alastair Campbell@de\n", + "Found: Alastair Johnston@en\n", + "Found: Albert Costa Balboa@es\n", + "Found: Albert Pujols@da\n", + "Found: Alberto Acosta@ca\n", + "Found: Alberto Fujimori@ca\n", + "Found: Alberto Sordi@ca\n", + "Found: Aldo Paredes@en\n", + "Found: Alec Baldwin@ca\n", + "Found: Alejandro Atchugarry@de\n", + "Found: Alejandro Fernandez Almendras@sl\n", + "Found: Alejandro Lembo@de\n", + "Found: Alejandro Lerner@en\n", + "Found: Alejandro Toledo@en\n", + "Found: Alek Wek@de\n", + "Found: Alessandro Nesta@ca\n", + "Found: Alex Barros@de\n", + "Found: Alex Cabrera@en\n", + "Found: Alex Ferguson@en\n", + "Found: Alex Holmes@en\n", + "Found: Alex Kingston@cs\n", + "Found: Alex Penelas@en\n", + "Found: Alex Popovici@es\n", + "Found: Alex Sink@en\n", + "Found: Alex Wallau@en\n", + "Found: Alex Zanardi@ca\n", + "Found: Alexa Vega@da\n", + "Found: Alexander Downer@de\n", + "Found: Alexander Losyukov@en\n", + "Found: Alexander Lukashenko@en\n", + "Found: Alexander Payne@cs\n", + "Found: Alexandra Pelosi@en\n", + "Found: Alexandra Stevenson@de\n", + "Found: Alexandre Daigle@cs\n", + "Found: Alexandre Despatie@ca\n", + "Found: Alexandre Herchcovitch@en\n", + "Found: Alexandre Vinokourov@fr\n", + "Found: Alexis Bledel@ca\n", + "Found: Alfonso Portillo@en\n", + "Found: Alfonso Soriano@en\n", + "Found: James Alfred Ford@en\n", + "Found: Alfred Santell@en\n", + "Found: Alfredo Moreno@en\n", + "Found: Ali Abbas Al-Hilfi@en\n", + "Found: Ali Abdullah Saleh@da\n", + "Found: Ali Ahmeti@de\n", + "Found: Prince Ali bin Hussein@en\n", + "Found: Ali Fallahian@de\n", + "Found: Ali Hammoud@en\n", + "Found: Ali Khamenei@ca\n", + "Found: Alicia Hollowell@en\n", + "Found: Alicia Keys@ca\n", + "Found: Alicia Molik@de\n", + "Found: Alicia Silverstone@ca\n", + "Found: Alicia Witt@ca\n", + "Found: Alimzhan Tokhtakhounov@pt\n", + "Found: Alina Kabaeva@en\n", + "Found: Alison Krauss@ca\n", + "Found: Alison Lohman@de\n", + "Found: Alistair Macdonald@en\n", + "Found: Allan Houston@ca\n", + "Found: Allan Kemakeza@de\n", + "Found: Allan Wagner Tizón@de\n", + "Found: Allen Iverson@ca\n", + "Found: Allison Janney@da\n", + "Found: Ally Sheedy@ca\n", + "Found: Allyson Felix@ca\n", + "Found: Alma Powell@de\n", + "Found: Alonzo Mourning@ca\n", + "Found: Aly Wagner@de\n", + "Found: Alyson Hannigan@ca\n", + "Found: Amanda Beard@de\n", + "Found: Amanda Bynes@ca\n", + "Found: Amanda Coetzer@de\n", + "Found: Amanda Marshall@de\n", + "Found: Amber Frey@en\n", + "Found: Amber Tamblyn@de\n", + "Found: Ambrose Lee@en\n", + "Found: Amelia Vega@en\n", + "Found: Amelie Mauresmo@ms\n", + "Found: Amr Moussa@ca\n", + "Found: Amram Mitzna@de\n", + "Found: Amy Brenneman@da\n", + "Found: Amy Cotton@en\n", + "Found: Amy Pascal@de\n", + "Found: Amy Redford@de\n", + "Found: Amy Smart@da\n", + "Found: Amy Yasbeck@de\n", + "Found: Ana Guevara@de\n", + "Found: Ananías Maidana Palacios@es\n", + "Found: Anastasia Kelesidou@de\n", + "Found: Anastasia Myskina@en\n", + "Found: Anatoliy Kinakh@en\n", + "Found: Anders Fogh Rasmussen@ca\n", + "Found: Andre Agassi@ca\n", + "Found: Andre Lange@et\n", + "Found: J. Andre Smith@en\n", + "Found: Andrea Bocelli@ca\n", + "Found: Andrea De Cruz@en\n", + "Found: Andrea Yates@en\n", + "Found: Andreas Vinciguerra@de\n", + "Found: Andrei Konchalovsky@en\n", + "Found: Andrei Mikhnevich@en\n", + "Found: Andrei Nikolishin@en\n", + "Found: Andrew Bernard@en\n", + "Found: Andrew Caldecott@en\n", + "Found: Andrew Cuomo@ca\n", + "Found: Andrew Fastow@de\n", + "Found: Andrew Firestone@en\n", + "Found: Andrew Gilligan@en\n", + "Found: Andrew Jarecki@de\n", + "Found: Andrew Luster@de\n", + "Found: Andrew Niccol@cs\n", + "Found: Andy Benes@en\n", + "Found: Andy Dickens@en\n", + "Found: DJ Andy Garcia@en\n", + "Found: Andy Griffith@ca\n", + "Found: Andy Griggs@en\n", + "Found: Andy Lau@cs\n", + "Found: Andy Northey@en\n", + "Found: Sandy Perez Aguila@en\n", + "Found: Andy Roddick@ca\n", + "Found: Andy Rooney@da\n", + "Found: Andy Warhol@ca\n", + "Found: Angela Bassett@ca\n", + "Found: Angela Lansbury@ca\n", + "Found: Angela Merkel@ca\n", + "Found: Angelina Jolie@ca\n", + "Found: Angie Martinez@en\n", + "Found: Anita DeFrantz@de\n", + "Found: Ann Landers@da\n", + "Found: Ann Morgan Guilbert@en\n", + "Found: Ann Veneman@de\n", + "Found: Anna Chicherova@en\n", + "Found: Anna Faris@ca\n", + "Found: Susanna Jones@en\n", + "Found: Anna Kournikova@da\n", + "Found: Anna Nicole Smith@ca\n", + "Found: Anne Donovan@de\n", + "Found: Anne Heche@ca\n", + "Found: Anne Krueger@fr\n", + "Found: Anne McLellan@en\n", + "Found: Annette Bening@ca\n", + "Found: Annette Lu@de\n", + "Found: Annie Machon@de\n", + "Found: Antanas Valionis@de\n", + "Found: Anthony Fauci@de\n", + "Found: Anthony Garotinho@en\n", + "Found: Anthony Hopkins@ca\n", + "Found: Anthony LaPaglia@da\n", + "Found: Anthony Principi@de\n", + "Found: Antje Buschschulte@de\n", + "Found: Anton Balasingham@en\n", + "Found: Antonio Banderas@ca\n", + "Found: Antonio Cassano@ca\n", + "Found: Antonio Catania@de\n", + "Found: Antonio Palocci@de\n", + "Found: Antonio Trillanes IV@fil\n", + "Found: Antony Leung@en\n", + "Found: Antwun Echols@en\n", + "Found: Anwar Ibrahim@da\n", + "Found: Aretha Franklin@ca\n", + "Found: Ari Bousbib@en\n", + "Found: Ari Fleischer@de\n", + "Found: Arianna Huffington@ca\n", + "Found: Arie Haan@de\n", + "Found: Ariel Sharon@ca\n", + "Found: Arif Mardin@de\n", + "Found: Arlen Specter@ca\n", + "Found: Armando Carrillo@en\n", + "Found: Arminio Fraga@en\n", + "Found: Arnold Palmer@da\n", + "Found: Arnold Schwarzenegger@ca\n", + "Found: Rolfe Arnold Scott-James@en\n", + "Found: Aron Ralston@cs\n", + "Found: Stuart Cooper@en\n", + "Found: Stuart Howe@en\n", + "Found: Arthur Johnson@it\n", + "Found: John Arthur Martinez@en\n", + "Found: Arturo Gatti@ca\n", + "Found: Asa Hutchinson@de\n", + "Found: Ashanti Douglas@nl\n", + "Found: Ashley Judd@ca\n", + "Found: Ashley Olsen@ca\n", + "Found: Ashley Postell@en\n", + "Found: Ashraf Ghani Ahmadzai@es\n", + "Found: Ashton Kutcher@ca\n", + "Found: Asif Ali Zardari@ca\n", + "Found: Askar Akayev@en\n", + "Found: Astou Ndiaye-Diatta@en\n", + "Found: Premiership of Atal Bihari Vajpayee@en\n", + "Found: Atom Egoyan@da\n", + "Found: Atsushi Satou@id\n", + "Found: Audrey Lacroix@en\n", + "Found: Audrey Sauret@en\n", + "Found: Augusto Pinochet Ugarte@ca\n", + "Found: Augusto Roa Bastos@de\n", + "Found: Aung San Suu Kyi@ca\n", + "Found: Austin Kearns@en\n", + "Found: Avril Lavigne@ca\n", + "Found: Azmi Bishara@ca\n", + "Found: Azra Akin@id\n", + "Found: Babe Ruth@ca\n", + "Found: Barbara Bach@cs\n", + "Found: Barbara Becker-Cantarino@en\n", + "Found: Barbara Bodine@en\n", + "Found: Barbara Boxer@cs\n", + "Found: Barbara Brezigar@cs\n", + "Found: Barbara Robertson@en\n", + "Found: Barbara Walters@de\n", + "Found: Barbra Streisand@ca\n", + "Found: Barry Alvarez@en\n", + "Found: Barry Bonds@da\n", + "Found: Barry Collier@en\n", + "Found: Barry Diller@de\n", + "Found: Barry Forde@ca\n", + "Found: Barry Hinson@en\n", + "Found: Barry Switzer@de\n", + "Found: Barry Williamson@en\n", + "Found: Barry Zito@de\n", + "Found: Bart Freundlich@de\n", + "Found: Bart Hendricks@en\n", + "Found: Bartosz Kizierowski@de\n", + "Found: Barzan Al-Tikriti@fr\n", + "Found: Basdeo Panday@de\n", + "Found: Baz Luhrmann@ca\n", + "Found: Beatriz Merino Lucero@pl\n", + "Found: Bela Karolyi@ms\n", + "Found: Ben Affleck@ca\n", + "Found: Torben Betts@en\n", + "Found: Ben Braun@en\n", + "Found: Ben Broussard@en\n", + "Found: Ben Cahoon@en\n", + "Found: Reuben Davis@en\n", + "Found: Ben Kingsley@ca\n", + "Found: Ben Lee Tyler@en\n", + "Found: Ben Steinbauer@en\n", + "Found: Benazir Bhutto@ca\n", + "Found: Benedita da Silva@en\n", + "Found: Benicio Del Toro@fi\n", + "Found: Benito Santiago@en\n", + "Found: Benjamin Bratt@cs\n", + "Found: Benjamin Franklin Bailey@en\n", + "Found: Benjamin McKenzie@ca\n", + "Found: Benjamin Netanyahu@da\n", + "Found: Bernadette Peters@ca\n", + "Found: Bernard Ebbers@de\n", + "Found: Bernard Giraudeau@de\n", + "Found: Bernard Kerik@en\n", + "Found: Bernard Landry@de\n", + "Found: Bernard Law@fr\n", + "Found: Bernard Lord@en\n", + "Found: Bernardo Segura@de\n", + "Found: Bertie Ahern@ca\n", + "Found: Bertrand Bonello@de\n", + "Found: A. Elizabeth Jones@en\n", + "Found: Bettina Rheims@cs\n", + "Found: Betty Williams@en\n", + "Found: Bianca Jagger@da\n", + "Found: Bijan Namdar Zangeneh@de\n", + "Found: Bill Belichick@da\n", + "Found: Bill Butler@ca\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found: Bill Callahan@en\n", + "Found: Bill Cartwright@en\n", + "Found: Bill Clancy@en\n", + "Found: Bill Clinton@ca\n", + "Found: Bill Curry@en\n", + "Found: Bill Doba@en\n", + "Found: Bill Elliott@pt\n", + "Found: Bill Fennelly@en\n", + "Found: Bill Frist@de\n", + "Found: Bill Gates@ca\n", + "Found: Bill Grahame@en\n", + "Found: Bill Guerin@de\n", + "Found: Bill Herrion@en\n", + "Found: Bill Hughes@en\n", + "Found: Bill Kollar@en\n", + "Found: Bill Kong@es\n", + "Found: Bill Mauldin@de\n", + "Found: Bill McBride@en\n", + "Found: Bill Nelson@da\n", + "Found: Bill Parcells@de\n", + "Found: Bill Parsons@en\n", + "Found: Bill Paxton@ca\n", + "Found: Bill Self@de\n", + "Found: Bill Sizemore@en\n", + "Found: Bill Stapleton@en\n", + "Found: Bill Steinke@en\n", + "Found: Bill Walton@de\n", + "Found: Billy Andrade@da\n", + "Found: Billy Beane@de\n", + "Found: Billy Bob Thornton@ca\n", + "Found: Billy Boyd@en\n", + "Found: Billy Crawford@de\n", + "Found: Billy Crystal@ca\n", + "Found: Billy Donovan@en\n", + "Found: Billy Gilman@en\n", + "Found: Billy Joel@ca\n", + "Found: Bing Crosby@ca\n", + "Found: Binyamin Ben-Eliezer@en\n", + "Found: Bison Dele@de\n", + "Found: Bixente Lizarazu@ca\n", + "Found: Blas Ople@de\n", + "Found: Blythe Danner@ca\n", + "Found: Blythe Hartley@de\n", + "Found: Bo Pelini@en\n", + "Found: Bo Ryan@en\n", + "Found: Bob Alper@en\n", + "Found: Bob Beauprez@de\n", + "Found: Bob Bowlsby@en\n", + "Found: Bob Dole@ca\n", + "Found: Bob Ferguson@da\n", + "Found: Bob Geldof@ca\n", + "Found: Bob Graham@en\n", + "Found: Bob Guccione@cs\n", + "Found: Bob Hayes@cs\n", + "Found: Bob Holden@de\n", + "Found: Bob Hope@ca\n", + "Found: Bob Huggins@en\n", + "Found: Bob Iger@en\n", + "Found: Bob Krueger@en\n", + "Found: Bob Menendez@da\n", + "Found: Bob Newhart@de\n", + "Found: Bob Stoops@en\n", + "Found: Bob Taft@de\n", + "Found: Bobby Bowden@de\n", + "Found: Bobby Kielty@en\n", + "Found: Bobby Robson@ca\n", + "Found: Bode Miller@ca\n", + "Found: Bonnie Fuller@en\n", + "Found: Bonnie Hunt@ca\n", + "Found: Nella Maria Bonora@de\n", + "Found: Boris Berezovsky@en\n", + "Found: Boris Henry@cs\n", + "Found: Boris Jordan@en\n", + "Found: Boris Trajkovski@ca\n", + "Found: Boris Yeltsin@en\n", + "Found: Brad Banks@en\n", + "Found: Brad Brownell@en\n", + "Found: Brad Garrett@da\n", + "Found: Brad Gushue@de\n", + "Found: Brad Miller@en\n", + "Found: Brad Pitt@ca\n", + "Found: Brad Wilk@cs\n", + "Found: Brajesh Mishra@en\n", + "Found: Brandon Boyd@da\n", + "Found: Brandon Hammond@en\n", + "Found: Brandon Inge@de\n", + "Found: Brandon Jones@en\n", + "Found: Brandon Knight@de\n", + "Found: Brandon Larson@en\n", + "Found: Brandon Lloyd@en\n", + "Found: Brandon Webb@pl\n", + "Found: Branko Crvenkovski@ca\n", + "Found: Brendan Fraser@ca\n", + "Found: Brendan Gaughan@en\n", + "Found: Brendan Hansen@en\n", + "Found: H. Brent Coles@en\n", + "Found: Brett Hawke@en\n", + "Found: Brett Hull@cs\n", + "Found: Brian Billick@de\n", + "Found: Brian Campbell Vickery@de\n", + "Found: Brian Cashman@en\n", + "Found: Brian Clemens@de\n", + "Found: Brian Cook@en\n", + "Found: Brian Cowen@ca\n", + "Found: Brian De Palma@ca\n", + "Found: Brian Gregory@en\n", + "Found: Brian Griese@en\n", + "Found: Brian Heidik@en\n", + "Found: Brian Henson@en\n", + "Found: Brian Kerr@de\n", + "Found: Brian Lara@de\n", + "Found: Brian Mulroney@ca\n", + "Found: Brian Olson@en\n", + "Found: Brian Scalabrine@ca\n", + "Found: Brian Schneider@en\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-141-5351e70c6afa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m# first, grep all rows of the original TSV file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mlfw_name_clean\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfw_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mmsceleb_row\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_msceleb_top1m\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitertuples\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlfw_name_clean\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmsceleb_row\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname_lang\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'Found: {msceleb_row.name_lang}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# compare this this to master identity\n", + "for lfw_item in tqdm(lfw_meta, desc='1st loop'):\n", + " \n", + " # for each LFW name, look for match\n", + " lfw_name = lfw_item['name']\n", + " matched_id = None\n", + " \n", + " for id_kg, identity in identities_tmp.items():\n", + " # for each msceleb identity, look for match\n", + " for lang, name in identity['names'].items():\n", + " # for each name's language variation, look for match\n", + " if not len(name) > 0:\n", + " print('no name')\n", + " continue\n", + " strict_match = identity_utils.names_match_strict(lfw_name, name)\n", + " if strict_match:\n", + " #print(f'Strict matched \"{lfw_name}\" to \"{name}\"')\n", + " matched_id = id_kg\n", + " matched_lang = lang\n", + " matched_name = name\n", + " break\n", + " if matched_id:\n", + " matched_lang = lang\n", + " matched_name = name\n", + " print(f'OK. Found match: {lfw_name} == {matched_name} in lang: {matched_lang}')\n", + " pbar_ids.clear()\n", + " pbar_ids.close()\n", + " break\n", + " if not matched_id:\n", + " print(f'ERROR: could not find {lfw_name}')\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['id_kg',\n", - " 'ms_name_ Marcus Mojigoh',\n", - " 'ms_name_ Nyallau Anak Badak',\n", - " 'ms_name_ Teng Boon Soon',\n", - " 'ms_name_ Tiki Anak Lafe',\n", - " 'ms_name_ Yong Khoon Seng',\n", - " 'ms_name_Bousou P',\n", - " 'ms_name_N',\n", - " 'ms_name_af',\n", - " 'ms_name_am',\n", - " 'ms_name_ar',\n", - " 'ms_name_az',\n", - " 'ms_name_be',\n", - " 'ms_name_bg',\n", - " 'ms_name_bm',\n", - " 'ms_name_bn',\n", - " 'ms_name_bo',\n", - " 'ms_name_br',\n", - " 'ms_name_bs',\n", - " 'ms_name_ca',\n", - " 'ms_name_ceb',\n", - " 'ms_name_ck',\n", - " 'ms_name_co',\n", - " 'ms_name_cr',\n", - " 'ms_name_cs',\n", - " 'ms_name_cy',\n", - " 'ms_name_da',\n", - " 'ms_name_de',\n", - " 'ms_name_destiny',\n", - " 'ms_name_dz',\n", - " 'ms_name_el',\n", - " 'ms_name_en',\n", - " 'ms_name_en-GB',\n", - " 'ms_name_en-US',\n", - " 'ms_name_eo',\n", - " 'ms_name_es',\n", - " 'ms_name_es-419',\n", - " 'ms_name_et',\n", - " 'ms_name_eu',\n", - " 'ms_name_evleaks',\n", - " 'ms_name_fa',\n", - " 'ms_name_fi',\n", - " 'ms_name_fil',\n", - " 'ms_name_fo',\n", - " 'ms_name_fr',\n", - " 'ms_name_fr-CA',\n", - " 'ms_name_fy',\n", - " 'ms_name_ga',\n", - " 'ms_name_gd',\n", - " 'ms_name_gl',\n", - " 'ms_name_gn',\n", - " 'ms_name_gu',\n", - " 'ms_name_ha',\n", - " 'ms_name_hi',\n", - " 'ms_name_hr',\n", - " 'ms_name_ht',\n", - " 'ms_name_hu',\n", - " 'ms_name_hu\\r\\nm.03zytg\\tΑστέριος\"',\n", - " 'ms_name_hy',\n", - " 'ms_name_id',\n", - " 'ms_name_ig',\n", - " 'ms_name_is',\n", - " 'ms_name_it',\n", - " 'ms_name_iw',\n", - " 'ms_name_ja',\n", - " 'ms_name_ka',\n", - " 'ms_name_kk',\n", - " 'ms_name_kl',\n", - " 'ms_name_km',\n", - " 'ms_name_kn',\n", - " 'ms_name_ko',\n", - " 'ms_name_ku',\n", - " 'ms_name_ky',\n", - " 'ms_name_la',\n", - " 'ms_name_lb',\n", - " 'ms_name_lo',\n", - " 'ms_name_lt',\n", - " 'ms_name_lv',\n", - " 'ms_name_mg',\n", - " 'ms_name_mi',\n", - " 'ms_name_mk',\n", - " 'ms_name_ml',\n", - " 'ms_name_mn',\n", - " 'ms_name_mr',\n", - " 'ms_name_ms',\n", - " 'ms_name_mt',\n", - " 'ms_name_my',\n", - " 'ms_name_ne',\n", - " 'ms_name_nl',\n", - " 'ms_name_nn',\n", - " 'ms_name_no',\n", - " 'ms_name_nv',\n", - " 'ms_name_ny',\n", - " 'ms_name_oc',\n", - " 'ms_name_or',\n", - " 'ms_name_pa',\n", - " 'ms_name_pl',\n", - " 'ms_name_ps',\n", - " 'ms_name_pt',\n", - " 'ms_name_pt-PT',\n", - " 'ms_name_ro',\n", - " 'ms_name_ru',\n", - " 'ms_name_rw',\n", - " 'ms_name_sa',\n", - " 'ms_name_sc',\n", - " 'ms_name_se',\n", - " 'ms_name_si',\n", - " 'ms_name_sk',\n", - " 'ms_name_sl',\n", - " 'ms_name_sn',\n", - " 'ms_name_so',\n", - " 'ms_name_sq',\n", - " 'ms_name_sr',\n", - " 'ms_name_st',\n", - " 'ms_name_su',\n", - " 'ms_name_sv',\n", - " 'ms_name_sw',\n", - " 'ms_name_ta',\n", - " 'ms_name_te',\n", - " 'ms_name_tg',\n", - " 'ms_name_th',\n", - " 'ms_name_tr',\n", - " 'ms_name_ug',\n", - " 'ms_name_uk',\n", - " 'ms_name_ur',\n", - " 'ms_name_uz',\n", - " 'ms_name_vi',\n", - " 'ms_name_xh',\n", - " 'ms_name_yi',\n", - " 'ms_name_yo',\n", - " 'ms_name_zh',\n", - " 'ms_name_zh-HK',\n", - " 'ms_name_zh-Hant',\n", - " 'ms_name_zorbla.de',\n", - " 'ms_name_zu']" + "True" ] }, - "execution_count": 95, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "list(df_identities_master.keys())" + "identity_utils.names_match_strict('AJ Cook', 'A.J. Cook')" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "names_match('A.J. Cook', 'cook Aj', as_float=True, compound_score=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PubFig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add pubfig data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Face Scrub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add facescrub" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## UMD Faces" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add umd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CASIA Webface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add CASIA Webface" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IMDB Wiki" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add imdb-wiki" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## IMDB-Face" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add imdb face" ] }, { @@ -625,7 +1516,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.1" } }, "nbformat": 4, |
