{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
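"# Add identity ID to index\n",
"\n",
"Loads the identity table from `identities_old.csv`, derives a `subdir` value for each identity from `name_orig` (spaces replaced with underscores), and writes the result to `identity_lookup.csv`."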
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from os.path import join\n",
"from pathlib import Path\n",
"import difflib\n",
"\n",
"from tqdm import tqdm_notebook as tqdm\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# dataset locations: storage root and the metadata directory relative to that root\n",
"DATA_STORE = '/data_store_ssd/'\n",
"dir_dataset = 'datasets/people/lfw/metadata'"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Input: the existing identity table. Output: where the augmented table is written.\n",
"dir_metadata = join(DATA_STORE, dir_dataset)\n",
"fp_identity_in = join(dir_metadata, 'identities_old.csv')\n",
"fp_identity_out = join(dir_metadata, 'identity_lookup.csv')\n",
"\n",
"# Read the CSV, then use its `index` column as the DataFrame index.\n",
"df_identity_raw = pd.read_csv(fp_identity_in)\n",
"df_identity = df_identity_raw.set_index('index')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" name_orig | \n",
" description | \n",
" gender | \n",
" images | \n",
" image_index | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A. J. Cook | \n",
" AJ Cook | \n",
" Canadian actress | \n",
" f | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" Aaron Eckhart | \n",
" Aaron Eckhart | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" Aaron Guiel | \n",
" Aaron Guiel | \n",
" Professional baseball player | \n",
" m | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" Aaron Patterson | \n",
" Aaron Patterson | \n",
" Author | \n",
" m | \n",
" 1 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name name_orig description gender \\\n",
"index \n",
"0 A. J. Cook AJ Cook Canadian actress f \n",
"1 AJ Lamas AJ Lamas American actor m \n",
"2 Aaron Eckhart Aaron Eckhart American actor m \n",
"3 Aaron Guiel Aaron Guiel Professional baseball player m \n",
"4 Aaron Patterson Aaron Patterson Author m \n",
"\n",
" images image_index \n",
"index \n",
"0 1 0 \n",
"1 1 1 \n",
"2 1 2 \n",
"3 1 3 \n",
"4 1 4 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the loaded table.\n",
"df_identity.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" name_orig | \n",
" description | \n",
" gender | \n",
" images | \n",
" image_index | \n",
" subdir | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A. J. Cook | \n",
" AJ Cook | \n",
" Canadian actress | \n",
" f | \n",
" 1 | \n",
" 0 | \n",
" | \n",
"
\n",
" \n",
" | 1 | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" 1 | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name name_orig description gender images image_index \\\n",
"index \n",
"0 A. J. Cook AJ Cook Canadian actress f 1 0 \n",
"1 AJ Lamas AJ Lamas American actor m 1 1 \n",
"\n",
" subdir \n",
"index \n",
"0 \n",
"1 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create the `subdir` column filled with empty strings; it is populated in a later cell.\n",
"df_identity['subdir'] = ''\n",
"df_identity.head(n=2)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ece5c11b90954b25b1f1e28fc2fe6b55",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=5749), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Derive each identity's `subdir` from `name_orig` by replacing spaces with underscores.\n",
"# A vectorised string replacement is used instead of a row-by-row loop; `regex=False`\n",
"# makes the space a literal match. Missing names stay missing instead of raising.\n",
"df_identity['subdir'] = df_identity['name_orig'].str.replace(' ', '_', regex=False)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" name_orig | \n",
" description | \n",
" gender | \n",
" images | \n",
" image_index | \n",
" subdir | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" A. J. Cook | \n",
" AJ Cook | \n",
" Canadian actress | \n",
" f | \n",
" 1 | \n",
" 0 | \n",
" AJ_Cook | \n",
"
\n",
" \n",
" | 1 | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" 1 | \n",
" AJ_Lamas | \n",
"
\n",
" \n",
" | 2 | \n",
" Aaron Eckhart | \n",
" Aaron Eckhart | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" 2 | \n",
" Aaron_Eckhart | \n",
"
\n",
" \n",
" | 3 | \n",
" Aaron Guiel | \n",
" Aaron Guiel | \n",
" Professional baseball player | \n",
" m | \n",
" 1 | \n",
" 3 | \n",
" Aaron_Guiel | \n",
"
\n",
" \n",
" | 4 | \n",
" Aaron Patterson | \n",
" Aaron Patterson | \n",
" Author | \n",
" m | \n",
" 1 | \n",
" 4 | \n",
" Aaron_Patterson | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name name_orig description gender \\\n",
"index \n",
"0 A. J. Cook AJ Cook Canadian actress f \n",
"1 AJ Lamas AJ Lamas American actor m \n",
"2 Aaron Eckhart Aaron Eckhart American actor m \n",
"3 Aaron Guiel Aaron Guiel Professional baseball player m \n",
"4 Aaron Patterson Aaron Patterson Author m \n",
"\n",
" images image_index subdir \n",
"index \n",
"0 1 0 AJ_Cook \n",
"1 1 1 AJ_Lamas \n",
"2 1 2 Aaron_Eckhart \n",
"3 1 3 Aaron_Guiel \n",
"4 1 4 Aaron_Patterson "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the first rows to confirm that `subdir` has been filled in.\n",
"df_identity.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Write the table, including its index, to the output path.\n",
"df_identity.to_csv(fp_identity_out, index=True)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"# make a clean index separate from files"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'AJ Lamas'"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check a single record: the `name` value of the second row.\n",
"# NOTE(review): this cell used to read from `df_identities`, which no cell in this notebook\n",
"# defines (NameError on a fresh kernel). `df_identity` is the table loaded earlier; confirm\n",
"# that no separate table read from 'identities.csv' was intended here.\n",
"df_identity.iloc[1]['name']"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 2 3 4\n"
]
}
],
"source": [
"# Scratch check: print the list items on one line, separated by single spaces.\n",
"a = [1, 2, 3, 4]\n",
"\n",
"print(' '.join(map(str, a)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:megapixels]",
"language": "python",
"name": "conda-env-megapixels-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}