# %% Destination of the merged identity master list (CSV)
fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master.csv'

# %% MS Celeb Top 1M: load the (knowledge-graph id, name) table
# The file is read as tab-separated text without a header row, so both column
# names are supplied here.
# NOTE(review): a later cell's output lists a column whose name contains
# '\r\n' followed by what looks like a whole extra record. That suggests a
# stray double quote lets the default CSV quoting swallow a line break --
# confirm against the raw file before trusting per-identity name counts.
fp_msceleb_top1m = '/data_store_hdd/datasets/people/msceleb/downloads/Top1M_MidList.Name.tsv'
df_msceleb_top1m = pd.read_csv(
    fp_msceleb_top1m,
    sep='\t',
    header=None,
    names=['id_kg', 'name'],
    encoding='utf-8',
)

# One group per knowledge-graph id; n_groups is reused as the progress-bar
# total by the cells that iterate over the groups.
df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')
n_groups = df_msceleb_top1m_groups.ngroups
print(f'{n_groups} groups')
df_msceleb_top1m.head(2)
# %% Row-wise records: one {'id_kg': ..., 'name': ...} dict per TSV row
mseleb_top1m_records = df_msceleb_top1m.to_dict('records')

# %% Language-tag mapping table
# NOTE(review): this table is not referenced anywhere else in the notebook, and
# the original literal ended in a bare 'es' entry, which is a SyntaxError.
# Mapping 'es' to itself is the most conservative completion -- confirm the
# intended target tags before using it.
abbrev_mappings = {
    'en-US': 'en',
    'en-GB': 'en',
    'es-419': 'es-419',
    'es': 'es',
}

# %%
def split_name_lang(name_lang):
    '''Split a "name@lang" string into its name and language tag.

    Only the last '@' is treated as the separator, so names that themselves
    contain '@' are kept intact.

    :param name_lang: (str) name, optionally followed by '@' and a language tag
    :returns: (dict) {'name': ..., 'lang': ...}; 'lang' is '' when no '@' is present
    '''
    if '@' in name_lang:
        name, _, lang = name_lang.rpartition('@')
    else:
        name, lang = name_lang, ''
    return {'name': name, 'lang': lang}

# %% Sanity check: expected {'name': 'r@destiny', 'lang': 'en'}
split_name_lang('r@destiny@en')

# %% Collect every spelling per identity as {id_kg: {lang: name}}
# A later spelling with the same language tag overwrites the earlier one.
msceleb_identities = {}
for record in tqdm(mseleb_top1m_records):
    parsed = split_name_lang(record['name'])
    msceleb_identities.setdefault(record['id_kg'], {})[parsed['lang']] = parsed['name']

# %% Small sample (first 10 identities) for quick inspection
import itertools
msceleb_identities_sm = dict(itertools.islice(msceleb_identities.items(), 0, 10))

# %% Print every spelling of the sampled identities that have an English name
# TODO: de-duplicate names that use the same spelling for multiple languages;
# this cell only prints them so the duplicates can be eyeballed.
for id_kg, name_langs in msceleb_identities_sm.items():
    if 'en' in name_langs:
        for lang, name in name_langs.items():
            print(name, lang)
# %% Diagnostic: report language tags that are not plain two-letter codes
# Inspects the text between the first and second '@' of each name and prints
# each distinct finding once. Names without any '@' are printed verbatim.
messages = []
for _, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):
    for raw_name in msceleb_group['name']:
        if '@' not in raw_name:
            print(raw_name)
            continue
        # split('@') on a string that contains '@' always yields >= 2 parts,
        # so no "only one split" case can occur here
        tag = raw_name.split('@')[1]
        if len(tag) != 2:
            msg = f'n2 split is long: {tag}'
            if msg not in messages:
                print(msg)
                messages.append(msg)

# %% Flatten language variations into named columns: one record per identity
# Uses split_name_lang (defined earlier in this notebook) so that only the last
# '@' separates the language tag; splitting on the first '@' turned names that
# contain '@' into bogus column names such as 'ms_name_destiny'.
identities = []
for id_kg, msceleb_group in tqdm(df_msceleb_top1m_groups, total=n_groups):
    identity = {'id_kg': id_kg.replace('m.', '/m/')}
    for raw_name in msceleb_group['name']:
        parsed = split_name_lang(raw_name)
        # names without a language tag default to 'en'
        lang = parsed['lang'] or 'en'
        # a later spelling with the same tag overwrites the earlier one
        identity[f'ms_name_{lang}'] = parsed['name']
    identities.append(identity)

# %%
print(identities[0:10])

# %%
def save_identity_master(identities, fp_out=fp_master_identities):
    '''Write the identity records to a CSV file.

    :param identities: (list) dicts whose keys become the CSV columns
    :param fp_out: (str) destination path; defaults to the master list path
    '''
    df_identities_master = pd.DataFrame(identities)
    df_identities_master.index.name = 'id'
    # honour the caller's path; previously the default path was always used
    df_identities_master.to_csv(fp_out)

# %% [markdown]
# ### Add image count data for MS Celeb

# %% Load the clean-list lines and collect file names per identity
fp_msceleb_clean = '/data_store_hdd/datasets/people/msceleb/downloads/MS-Celeb-1M_clean_list.txt'
with open(fp_msceleb_clean, 'r') as fp:
    msceleb_lines = fp.readlines()

msceleb_files = {}
for line in msceleb_lines:
    entry = line.strip()  # drop the trailing newline kept by readlines()
    if not entry:
        continue  # tolerate blank lines
    # split on the first '/' only; a line without any '/' still raises ValueError
    id_kg, fname = entry.split('/', 1)
    id_kg = id_kg.replace('m.', '/m/')
    msceleb_files.setdefault(id_kg, []).append(fname)

# add count (0 for identities that have no files in the clean list)
for identity in identities:
    identity['msceleb_count'] = len(msceleb_files.get(identity['id_kg'], []))

# %% save (takes 30 seconds)
save_identity_master(identities)  # encoding='utf-16' ??
# %% List the columns of the saved master list
# The DataFrame built inside save_identity_master() is local to that function,
# so it is not available here on a fresh kernel. Load the CSV that was written
# instead; its index column is named 'id' by save_identity_master().
# low_memory=False reads the file in one pass rather than in typed chunks.
df_identities_master = pd.read_csv(fp_master_identities, index_col='id', low_memory=False)
list(df_identities_master.columns)