{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Add identity ID to index"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from os.path import join\n",
"from pathlib import Path\n",
"import difflib\n",
"\n",
"from tqdm import tqdm_notebook as tqdm\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# names\n",
"DATA_STORE_NAS = '/data_store_nas/'\n",
"dir_dataset = 'datasets/people/lfw/metadata'"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"# split records into index and uuids\n",
"fp_records = join(DATA_STORE_NAS, dir_dataset, 'records.csv')\n",
"fp_index = join(DATA_STORE_NAS, dir_dataset, 'index.csv')\n",
"fp_uuids = join(DATA_STORE_NAS, dir_dataset, 'uuids.csv')\n",
"fp_identities = join(DATA_STORE_NAS, dir_dataset, 'identities.csv')\n",
"fp_files = join(DATA_STORE_NAS, dir_dataset, 'files.csv')\n",
"# load\n",
"df_records = pd.read_csv(fp_records).set_index('index')\n",
"df_index = pd.read_csv(fp_index).set_index('index')\n",
"df_uuids = pd.read_csv(fp_uuids).set_index('index')\n",
"df_files = pd.read_csv(fp_files).set_index('index')\n",
"df_identities = pd.read_csv(fp_identities)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
" uuid | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" AJ_Cook_0001 | \n",
" 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... | \n",
" AJ_Cook | \n",
" f03fd921-2d56-4e83-8115-f658d6a72287 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" AJ_Lamas_0001 | \n",
" 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... | \n",
" AJ_Lamas | \n",
" 0c96c5bb-dbd1-4584-bd68-af11664b98bb | \n",
"
\n",
" \n",
" | 2 | \n",
" jpg | \n",
" Aaron_Eckhart_0001 | \n",
" b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... | \n",
" Aaron_Eckhart | \n",
" 8221e75c-9537-4a4f-9693-483b445244b4 | \n",
"
\n",
" \n",
" | 3 | \n",
" jpg | \n",
" Aaron_Guiel_0001 | \n",
" 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... | \n",
" Aaron_Guiel | \n",
" a2955610-ed5e-433c-bdd4-e3a72ff44736 | \n",
"
\n",
" \n",
" | 4 | \n",
" jpg | \n",
" Aaron_Patterson_0001 | \n",
" 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... | \n",
" Aaron_Patterson | \n",
" 1d0782e9-ed16-4550-b1e9-d9c03eef6181 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn \\\n",
"index \n",
"0 jpg AJ_Cook_0001 \n",
"1 jpg AJ_Lamas_0001 \n",
"2 jpg Aaron_Eckhart_0001 \n",
"3 jpg Aaron_Guiel_0001 \n",
"4 jpg Aaron_Patterson_0001 \n",
"\n",
" sha256 subdir \\\n",
"index \n",
"0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n",
"1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n",
"2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n",
"3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n",
"4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n",
"\n",
" uuid \n",
"index \n",
"0 f03fd921-2d56-4e83-8115-f658d6a72287 \n",
"1 0c96c5bb-dbd1-4584-bd68-af11664b98bb \n",
"2 8221e75c-9537-4a4f-9693-483b445244b4 \n",
"3 a2955610-ed5e-433c-bdd4-e3a72ff44736 \n",
"4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 "
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_records.head()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sha256 | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... | \n",
"
\n",
" \n",
" | 1 | \n",
" 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sha256\n",
"index \n",
"0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...\n",
"1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193..."
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_index.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" uuid | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" f03fd921-2d56-4e83-8115-f658d6a72287 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0c96c5bb-dbd1-4584-bd68-af11664b98bb | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" uuid\n",
"index \n",
"0 f03fd921-2d56-4e83-8115-f658d6a72287\n",
"1 0c96c5bb-dbd1-4584-bd68-af11664b98bb"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_uuids.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" subdir | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" AJ_Cook_0001 | \n",
" AJ_Cook | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" AJ_Lamas_0001 | \n",
" AJ_Lamas | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn subdir\n",
"index \n",
"0 jpg AJ_Cook_0001 AJ_Cook\n",
"1 jpg AJ_Lamas_0001 AJ_Lamas"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_files.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" name | \n",
" description | \n",
" gender | \n",
" images | \n",
" index_image | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" A. J. Cook | \n",
" Canadian actress | \n",
" f | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" AJ Lamas | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index name description gender images index_image\n",
"0 0 A. J. Cook Canadian actress f 1 0\n",
"1 1 AJ Lamas American actor m 1 1"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_identities.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sha256 | \n",
" identity | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... | \n",
" -1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... | \n",
" -1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sha256 identity\n",
"index \n",
"0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... -1\n",
"1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... -1"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# associate each file with an identity\n",
"df_index['identity'] = [-1] * len(df_index)\n",
"df_index.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"# add index column to identities\n",
"#df_identities.index.name = 'index'\n",
"#df_identities.to_csv(fp_identities)"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6d5c1afd241142a99cd654b161ce003e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=13233), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"for row in tqdm(df_index.itertuples(), total=len(df_index)):\n",
" record_index = row.Index\n",
" sha256 = row.sha256\n",
" row_record_match = df_records.loc[(df_records['sha256'] == sha256)]\n",
" name = row_record_match.subdir.values[0]\n",
" name = name.replace('_',' ')\n",
" row_identity_match = df_identities.loc[(df_identities['name_orig'] == name)]\n",
" identity_index = row_identity_match.index.values[0]\n",
" df_index.at[row.Index, 'identity'] = identity_index"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"df_index.to_csv(fp_index_new)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"# make a clean index separate from files"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'AJ Lamas'"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#df_identies = pd.read_csv('identities.csv')\n",
"df_identities.iloc[1]['name']"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 2 3 4\n"
]
}
],
"source": [
"a = [1,2,3,4]\n",
"\n",
"print(*a)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:megapixels]",
"language": "python",
"name": "conda-env-megapixels-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}