{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# IJB-C Check MS Celeb MIDs\n", "\n", "Look up each IJB-C subject name in the MS Celeb name list to find its knowledge graph ID (MID), then save the matched and unmatched names as CSV.\n", "\n", "- read in MS Celeb master file\n", "- read in MS Celeb clean file (currently disabled: its path is commented out below)\n", "- read in IJB-C name list" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "from os.path import join\n", "import math\n", "from glob import glob\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from pathlib import Path\n", "from tqdm import tqdm_notebook as tqdm\n", "\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels/')\n", "from app.settings import app_cfg as cfg\n", "from app.utils import identity_utils" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "a = '\"Arnold Rüütel\"@ca'" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "b = a.split('\"')[1]" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9545454545454546" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "fp_msceleb_full = '/data_store/datasets/people/msceleb/dataset/Top1M_MidList.Name.csv'\n", "#fp_msceleb_clean = '/data_store/datasets/people/msceleb/dataset/MS-Celeb-1M_clean_list.txt'\n", "fp_ijbc = '/data_store/datasets/people/ijb_c/downloads/ijbc_subject_names.csv'" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "df_msceleb_full = pd.read_csv(fp_msceleb_full, sep=',', names=[\"kg_id\", 'name_msceleb'], quotechar = ',', skipinitialspace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "records_msceleb = df_msceleb_full.to_dict('records')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {},
"outputs": [], "source": [ "#df_msceleb_full.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "df_ijbc = pd.read_csv(fp_ijbc)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "records_ijbc = df_ijbc.to_dict('records')" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [], "source": [ "#df_ijbc.head()" ] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [], "source": [ "# pre-compute msceleb data\n", "for i, msceleb_item in enumerate(records_msceleb.copy()):\n", " splits = msceleb_item['name_msceleb'].split('\"')\n", " msceleb_name = splits[1]\n", " try:\n", " lang = splits[2][1:]\n", " except Exception as e:\n", " lang = ''\n", " records_msceleb[i]['name_clean'] = msceleb_name\n", " records_msceleb[i]['lang'] = lang\n", " records_msceleb[i]['name_stripped_str'] = identity_utils.sanitize_name(msceleb_name, as_str=True)\n", " records_msceleb[i]['name_stripped_arr'] = identity_utils.sanitize_name(msceleb_name, as_str=False)" ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'kg_id': 'm.01008lp2',\n", " 'name_msceleb': '\"Caio Henrique Siqueira Sanchez\"@en',\n", " 'name_clean': 'Caio Henrique Siqueira Sanchez',\n", " 'name_stripped': ['caio', 'henrique', 'siqueira', 'sanchez'],\n", " 'name_stripped_str': 'caio henrique siqueira sanchez',\n", " 'name_stripped_arr': ['caio', 'henrique', 'siqueira', 'sanchez'],\n", " 'lang': 'en'}" ] }, "execution_count": 160, "metadata": {}, "output_type": "execute_result" } ], "source": [ "records_msceleb[10]" ] }, { "cell_type": "code", "execution_count": 165, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1fce6cb2b3eb4cf294a794a27209b312", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=3531), HTML(value='')))" ] }, 
"metadata": {}, "output_type": "display_data" } ], "source": [ "found_names = []\n", "unfound_names = []\n", "\n", "for ijbc_item in tqdm(records_ijbc):\n", " ijbc_name = ijbc_item.get('SUBJECT_NAME')\n", " \n", " # ensure name does not exist\n", " if ijbc_name in found_names or ijbc_name in unfound_names:\n", " continue\n", "\n", " # init loop vars\n", " found = False\n", " item_match = None\n", " \n", " # first look through all msceleb records for simple match\n", " for msceleb_item in records_msceleb:\n", " msceleb_name_clean = msceleb_item['name_clean']\n", " if ijbc_name.lower() in msceleb_name_clean.lower():\n", " item_match = msceleb_item\n", " break\n", " \n", " # if not, do more aggressive/cpu intensive matching\n", " if not item_match:\n", " ijbc_name_stripped = identity_utils.sanitize_name(ijbc_name, as_str=False)\n", " for msceleb_item in records_msceleb:\n", " msceleb_name_stripped = msceleb_item['name_stripped_arr']\n", " score = identity_utils.names_match(ijbc_name_stripped, msceleb_name_stripped, as_float=True, name_a_pre=True, name_b_pre=True)\n", " if score > 0.9:\n", " item_match = msceleb_item\n", " break\n", " \n", " # default match obj\n", " match_obj = {'name_ijb_c': ijbc_name, 'kg_id': '', 'name_msceleb': '', 'lang_msceleb': ''}\n", " \n", " if item_match:\n", " # append matched info from msceleb\n", " match_obj['name_msceleb'] = item_match['name_msceleb'] # keep orig msceleb name\n", " match_obj['lang_msceleb'] = item_match['lang']\n", " match_obj['kg_id'] = item_match.get('kg_id').replace('m.', '/m/')\n", " found_names.append(match_obj)\n", " else:\n", " # default obj\n", " unfound_names.append(match_obj)" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3299 232\n" ] } ], "source": [ "print(len(found_names), len(unfound_names))" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [], "source": [ "from pprint import pprint" ] 
}, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Raffaele Bonanni',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'King Abdullah II',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'George HW Bush',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Edmund Stoiber',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'JK Rowling',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ashton B. Carter',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Linah K. Mohohlo',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': \"Alhaji Yar'Adua\",\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Hesham Mohamed Qandil',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Stephen Harper',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Martin Dempsey',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Horst Teltschik',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Andri Piebalgu',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Donald Tusk',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Richard Nixon',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Zulu Araujo',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Eliana 
Calmon',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Edu Guedes',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Milene Uehara',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Celso Zucatelli',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jose Roberto Arruda',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Igor Slyunyayev',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Yevhenia Tymoshenko',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ernest Bower',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Seo Joo-hyun',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Doreen Lorenzo',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Patrick Leahy',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Maria Soledad Alvear Valenzuela',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mario Kreutzberger',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ken Salazar',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Tim Allen',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mark A. 
Welsh III',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Alessandro Molon',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Omobola Johnson',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jon Hamm',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Tammy Baldwin',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Rajiv Shah',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Tedros Adhanom Ghebreyesus',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jerry Garcia',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Arlette Chabot',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Pedro Aznar',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Prakash Raj',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Alex Jones',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Gloria Álvarez',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Lakshmi Rai',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Waseem Abbas',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Alicia Castro',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Aminata Traoré',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Paola Taverna',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Sanjay Kapoor',\n", " 
'name_msceleb': ''},\n", " {'kg_id': '', 'lang_msceleb': '', 'name_ijb_c': 'AJ Lee', 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Andressa Urach',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Baby Margaretha',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Bob Garcia',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Dalila Nesci',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Dirk Müller',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mahesh Babu',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Maram al-Masri',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Minh Tran Huy',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Andressa Soares',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Barbara Lezzi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Eesha Koppikhar',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Gabriel Jesus',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Gayle San',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Kaajal Oza Vaidya',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Marcelo Freixo',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Óscar Santos',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Sam Smith',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 
'lang_msceleb': '',\n", " 'name_ijb_c': 'Naomi Klein',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Aditi Arya',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Alberto Garzón',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Anna Ráckevei',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Eleonora Menicucci',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Helvy Tiana Rosa',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Houda-Imane Faraoun',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Karen Paola',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Laura Bottici',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Laura Castelli',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mihai Voicu',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Patti Smith',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Puan Maharani',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Savitha Sastry',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Sharon la Hechicera',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Bahram Moshiri',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Luz Salgado',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Nouria Benghabrit-Remaoun',\n", " 'name_msceleb': ''},\n", " {'kg_id': 
'',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Sant Asaram Ji Bapu',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Selin Sayek Böke',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Angelina Love',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Anna Aaron',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Anna Kendrick',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Carla Ruocco',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Deepak Chopra',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Giulia Grillo',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Maine Mendoza',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'MC Pedrinho',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mònica Oltra',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Rahul Gandhi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Saad Lamjarred',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Song So-hee',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Sơn Tùng M-TP',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Susi Pudjiastuti',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Fabiana Dadone',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Giulia Di Vita',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 
'lang_msceleb': '',\n", " 'name_ijb_c': 'Juan Carlos De Martin',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Luigi Di Maio',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Money Boy',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Retno Marsudi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Alejandro Valverde',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Alek Skarlatos',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Alexa Clay',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Alice Zeniter',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Aminta Granera',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Andrés Palomino',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Anita Anand',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ann Harding',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Anne Bouverot',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Antoine Compagnon',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Antoine Westermann',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Axel Kahn',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ayman Odeh',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Bastien Vivès',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 
'lang_msceleb': '',\n", " 'name_ijb_c': 'Betty Cantrell',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Blake Griffin',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Blossom Chukwujekwu',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Camilla Toulmin',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Carlos Reichenbach',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Cristina Pedroche',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Cristóbal Cobo',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Céline Curiol',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Daniel Bilalian',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Daniel R. 
Russel',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'David Kobia',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Dominique Manotti',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Dragan Bender',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Edgar Morin',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Eka Zguladze',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Erdős Virág',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ernestina Naadu Mills',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Fiona Wood',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Franck Pavloff',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Franck Thilliez',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Frank Fabra',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Fred Swaniker',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Gabe Zichermann',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Gherardo Colombo',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Gilles Verdez',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Gizele Thakral',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Grady Jarrett',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Helen Arney',\n", 
" 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Hessa Al Jaber',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Hiromi Uehara',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Houlin Zhao',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Hyvin Jepkemoi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Inguna Sudraba',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Iványi Gábor',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jackie Kay',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jamshyd Godrej',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jay Vinchi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jayati Ghosh',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jeffrey Tucker',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Jens Lehmann',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Josh Fox',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Josh Smith',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'João Sayad',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Juan Carlos Monedero',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Kalyan Varma',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Kate Brown',\n", " 'name_msceleb': ''},\n", " 
{'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Kate Clinton',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Kawlo Iyun Pacidal',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Kelly Benoit-Bird',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Khem Veasna',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Laila Al-Arian',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Leila Chudori',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Lhadon Tethong',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Lil Kesh',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Maguy Bou Ghosn',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Makase Nyaphisi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Malek Chebel',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mandana Karimi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Manuela Carmena',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Marc Laménie',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Marina Ruy Barbosa',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Marion Montaigne',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Marisa Matias',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mark Steyn',\n", " 'name_msceleb': ''},\n", " 
{'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mauricio Rodas',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mawra Hocane',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Melissa Gira Grant',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mhairi Black',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Michel Bauwens',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Michel Bussi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Morgan Marquis-Boire',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Mrinal Kulkarni',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Naomi Shelton',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Natalio Botana',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Nguyễn Thị Kim Ngân',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Nina Tandon',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Nina Turner',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Nyle DiMarco',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Okyeame Kwame',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Pamela Samuelson',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Papis Loveday',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Patrick Pelloux',\n", " 'name_msceleb': ''},\n", " 
{'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Piya Sorcar',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Rajkummar Rao',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Rami Ranger',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Raul Krauthausen',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Richard Nguema',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Robert Lefkowitz',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ronnie Ash',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ruby Yadav',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ryan Crocker',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Ryu Jun-yeol',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Samiksha Bhatnagar',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Shantell Martin',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Shiho Yano',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Shirin Sharmin Chaudhury',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Spencer Stone',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Takaaki Kajita',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Tavis Smiley',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Tope Folarin',\n", " 'name_msceleb': ''},\n", " {'kg_id': 
'',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Virginia Raggi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Walidah Imarisha',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Wes Schweitzer',\n", " 'name_msceleb': ''},\n", " {'kg_id': '', 'lang_msceleb': '', 'name_ijb_c': 'Win Tin', 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Wutt Hmone Shwe Yi',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Yassine Brahim',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Yemi Alade',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Young Guru',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Zakhar May',\n", " 'name_msceleb': ''},\n", " {'kg_id': '',\n", " 'lang_msceleb': '',\n", " 'name_ijb_c': 'Zoë Keating',\n", " 'name_msceleb': ''}]\n" ] } ], "source": [ "pprint(unfound_names)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Save CSV" ] },
{ "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [], "source": [
 "# turn the collected match dicts into tables, one row per IJB-C name\n",
 "df_ijbc_found = pd.DataFrame(found_names)\n",
 "df_ijbc_unfound = pd.DataFrame(unfound_names)"
] },
{ "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [
 "# output locations for the matched / unmatched name tables\n",
 "fp_found = '/data_store/datasets/people/ijb_c/processed/ijb_c_msceleb_found.csv'\n",
 "fp_unfound = '/data_store/datasets/people/ijb_c/processed/ijb_c_msceleb_unfound.csv'"
] },
{ "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [], "source": [
 "# write both tables without the pandas index column\n",
 "for df_out, fp_out in ((df_ijbc_found, fp_found), (df_ijbc_unfound, fp_unfound)):\n",
 "    df_out.to_csv(fp_out, index=False)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name":
"megapixels", "language": "python", "name": "megapixels" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }