{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Identity Master List\n",
    "\n",
    "- [x] MS Celeb 1M\n",
    "- [ ] UMD Faces\n",
    "- [ ] FaceScrub\n",
    "- [ ] LFW\n",
    "- [ ] PubFig\n",
    "- [ ] PubFig83\n",
    "- [ ] VGG Face\n",
    "- [ ] VGG Face2\n",
    "- [ ] IJB-C\n",
    "- [ ] CASIA Webface\n",
    "- [ ] IMDB-Face\n",
    "- [ ] IMDB-Wiki"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "from os.path import join\n",
    "from glob import glob\n",
    "from pathlib import Path\n",
    "import requests\n",
    "import json\n",
    "from pprint import pprint\n",
    "from multiprocessing.pool import ThreadPool\n",
    "import threading\n",
    "import urllib.request\n",
    "import difflib\n",
    "import unidecode\n",
    "\n",
    "import slugify\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import pandas as pd\n",
    "from scipy.io import loadmat\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import sys\n",
    "sys.path.append('/work/megapixels_dev/megapixels')\n",
    "from app.utils import api_utils, identity_utils\n",
    "from app.settings import app_cfg\n",
    "from app.settings import types"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MS Celeb Top 1M\n",
    "\n",
    "- add column for each spelling of name\n",
    "- convert kg id to standard google format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "fp_master_identities = '/data_store_hdd/apps/megapixels/metadata/identities_master_02.csv'\n",
    "# MS-Celeb download directory; the MS-Celeb files used below are resolved relative to it\n",
    "dir_msceleb_dloads = '/data_store_hdd/datasets/people/msceleb/downloads/'\n",
    "fp_msceleb_clean_txt = join(dir_msceleb_dloads, 'MS-Celeb-1M_clean_list.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tab-separated file without a header row: column 0 -> id_kg, column 1 -> name_lang\n",
    "fp_msceleb_top1m = join(dir_msceleb_dloads, 'Top1M_MidList.Name.tsv')\n",
    "df_msceleb_top1m = pd.read_csv(fp_msceleb_top1m, delimiter='\\t', header=None, encoding='utf-8', names=['id_kg', 'name_lang'])\n",
    "# One group per id_kg (the same id appears once per name spelling / language)\n",
    "df_msceleb_top1m_groups = df_msceleb_top1m.groupby('id_kg')\n",
    "n_groups = df_msceleb_top1m_groups.ngroups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "346759995bbe45bebb81afbfb9a21853",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=3481186), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Bucket every (name, id_kg) pair under the lowercased first character of its name.\n",
    "# The dict is keyed by first character; it is not sorted.\n",
    "# NOTE(review): split_name_lang is not defined in any cell above this one; it has to be\n",
    "# defined (or imported) before this cell for Restart & Run All to work -- TODO confirm.\n",
    "msceleb_top1m_az = {}\n",
    "for msceleb_row in tqdm(df_msceleb_top1m.itertuples(), total=len(df_msceleb_top1m)):\n",
    "  name = split_name_lang(msceleb_row.name_lang)['name']\n",
    "  if not name:\n",
    "    continue  # an empty name has no first character to bucket on\n",
    "  first_char = name[0].lower()\n",
    "  msceleb_top1m_az.setdefault(first_char, []).append({'name': name, 'id_kg': msceleb_row.id_kg})"
   ]
  },
  { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | id_kg | \n", "name_lang | \n", "
|---|---|---|
| 0 | \n", "m.01008l47 | \n", "Patrick Cummins@en | \n", "
| 1 | \n", "m.01008l47 | \n", "Patrick Cummins@pt | \n", "
| 2 | \n", "m.01008l96 | \n", "Mohamed Guessous@en | \n", "
| 3 | \n", "m.01008l96 | \n", "Mohamed Guessous@fr | \n", "
| 4 | \n", "m.01008l96 | \n", "محمد جسوس@ar | \n", "