{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Add identity ID to index" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from os.path import join\n", "from pathlib import Path\n", "import difflib\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# names\n", "DATA_STORE_NAS = '/data_store_nas/'\n", "dir_dataset = 'datasets/people/lfw/metadata'" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "# split records into index and uuids\n", "fp_records = join(DATA_STORE_NAS, dir_dataset, 'records.csv')\n", "fp_index = join(DATA_STORE_NAS, dir_dataset, 'index.csv')\n", "fp_uuids = join(DATA_STORE_NAS, dir_dataset, 'uuids.csv')\n", "fp_identities = join(DATA_STORE_NAS, dir_dataset, 'identities.csv')\n", "fp_files = join(DATA_STORE_NAS, dir_dataset, 'files.csv')\n", "# load\n", "df_records = pd.read_csv(fp_records).set_index('index')\n", "df_index = pd.read_csv(fp_index).set_index('index')\n", "df_uuids = pd.read_csv(fp_uuids).set_index('index')\n", "df_files = pd.read_csv(fp_files).set_index('index')\n", "df_identities = pd.read_csv(fp_identities)" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdiruuid
index
0jpgAJ_Cook_0001550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...AJ_Cookf03fd921-2d56-4e83-8115-f658d6a72287
1jpgAJ_Lamas_000146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...AJ_Lamas0c96c5bb-dbd1-4584-bd68-af11664b98bb
2jpgAaron_Eckhart_0001b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...Aaron_Eckhart8221e75c-9537-4a4f-9693-483b445244b4
3jpgAaron_Guiel_0001156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...Aaron_Guiela2955610-ed5e-433c-bdd4-e3a72ff44736
4jpgAaron_Patterson_000134dfe798220b53aac910e5e39705770d212cdfbe4be8a4...Aaron_Patterson1d0782e9-ed16-4550-b1e9-d9c03eef6181
\n", "
" ], "text/plain": [ " ext fn \\\n", "index \n", "0 jpg AJ_Cook_0001 \n", "1 jpg AJ_Lamas_0001 \n", "2 jpg Aaron_Eckhart_0001 \n", "3 jpg Aaron_Guiel_0001 \n", "4 jpg Aaron_Patterson_0001 \n", "\n", " sha256 subdir \\\n", "index \n", "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n", "\n", " uuid \n", "index \n", "0 f03fd921-2d56-4e83-8115-f658d6a72287 \n", "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb \n", "2 8221e75c-9537-4a4f-9693-483b445244b4 \n", "3 a2955610-ed5e-433c-bdd4-e3a72ff44736 \n", "4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 " ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_records.head()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sha256
index
0550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...
146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...
\n", "
" ], "text/plain": [ " sha256\n", "index \n", "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...\n", "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193..." ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_index.head(2)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uuid
index
0f03fd921-2d56-4e83-8115-f658d6a72287
10c96c5bb-dbd1-4584-bd68-af11664b98bb
\n", "
" ], "text/plain": [ " uuid\n", "index \n", "0 f03fd921-2d56-4e83-8115-f658d6a72287\n", "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_uuids.head(2)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsubdir
index
0jpgAJ_Cook_0001AJ_Cook
1jpgAJ_Lamas_0001AJ_Lamas
\n", "
" ], "text/plain": [ " ext fn subdir\n", "index \n", "0 jpg AJ_Cook_0001 AJ_Cook\n", "1 jpg AJ_Lamas_0001 AJ_Lamas" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_files.head(2)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexnamedescriptiongenderimagesindex_image
00A. J. CookCanadian actressf10
11AJ LamasAmerican actorm11
\n", "
" ], "text/plain": [ " index name description gender images index_image\n", "0 0 A. J. Cook Canadian actress f 1 0\n", "1 1 AJ Lamas American actor m 1 1" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_identities.head(2)" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sha256identity
index
0550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...-1
146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...-1
\n", "
" ], "text/plain": [ " sha256 identity\n", "index \n", "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... -1\n", "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... -1" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# associate each file with an identity\n", "df_index['identity'] = [-1] * len(df_index)\n", "df_index.head(2)" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [], "source": [ "# add index column to identities\n", "#df_identities.index.name = 'index'\n", "#df_identities.to_csv(fp_identities)" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6d5c1afd241142a99cd654b161ce003e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=13233), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "for row in tqdm(df_index.itertuples(), total=len(df_index)):\n", " record_index = row.Index\n", " sha256 = row.sha256\n", " row_record_match = df_records.loc[(df_records['sha256'] == sha256)]\n", " name = row_record_match.subdir.values[0]\n", " name = name.replace('_',' ')\n", " row_identity_match = df_identities.loc[(df_identities['name_orig'] == name)]\n", " identity_index = row_identity_match.index.values[0]\n", " df_index.at[row.Index, 'identity'] = identity_index" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [], "source": [ "df_index.to_csv(fp_index_new)" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [], "source": [ "# make a clean index separate from files" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'AJ Lamas'" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#df_identies = pd.read_csv('identities.csv')\n", "df_identities.iloc[1]['name']" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 2 3 4\n" ] } ], "source": [ "a = [1,2,3,4]\n", "\n", "print(*a)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }