{
"cells": [
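{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Append identity index to SHA256 CSV\n",
"\n",
"Adds an `identity` column to the VGG Face2 `index.csv` (one SHA256 per file). Each file's class id is taken from its `subdir` in `files.csv`, mapped to the matching row index in `identities.csv`, and the result is written to `index_new.csv`."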
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from os.path import join\n",
"from pathlib import Path\n",
"import difflib\n",
"\n",
"from tqdm import tqdm_notebook as tqdm\n",
"import pandas as pd\n",
"import dask.dataframe as dd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# names\n",
"DATA_STORE_NAS = '/data_store_nas/'\n",
"dir_dataset = 'datasets/people/vgg_face2/metadata'"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sha256 | \n",
" identity | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... | \n",
" -1 | \n",
"
\n",
" \n",
" | 1 | \n",
" e360f93613baa68cede6731d2603873cdabd3993841cfd... | \n",
" -1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sha256 identity\n",
"index \n",
"0 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... -1\n",
"1 e360f93613baa68cede6731d2603873cdabd3993841cfd... -1"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# associate each file with an identity\n",
"fp_index = join(DATA_STORE_NAS, dir_dataset, 'index.csv')\n",
"df_index = pd.read_csv(fp_index).set_index('index')\n",
"df_index['identity'] = [-1] * len(df_index)\n",
"df_index.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3311286\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" subdir | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" 0168_01 | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 2 | \n",
" jpg | \n",
" 0213_01 | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 3 | \n",
" jpg | \n",
" 0010_01 | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 4 | \n",
" jpg | \n",
" 0115_01 | \n",
" test/n006211 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn subdir\n",
"index \n",
"0 jpg 0089_01 test/n006211\n",
"1 jpg 0168_01 test/n006211\n",
"2 jpg 0213_01 test/n006211\n",
"3 jpg 0010_01 test/n006211\n",
"4 jpg 0115_01 test/n006211"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get file info\n",
"fp_files = join(DATA_STORE_NAS, dir_dataset, 'files.csv')\n",
"df_files = pd.read_csv(fp_files).set_index('index')\n",
"print(len(df_files))\n",
"df_files.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9131\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" class_id | \n",
" description | \n",
" gender | \n",
" images | \n",
" name | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" n000001 | \n",
" Dalai Lama | \n",
" m | \n",
" 424 | \n",
" 14th Dalai Lama | \n",
"
\n",
" \n",
" | 1 | \n",
" n000002 | \n",
" American singer-songwriter | \n",
" f | \n",
" 315 | \n",
" A Fine Frenzy | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" class_id description gender images name\n",
"index \n",
"0 n000001 Dalai Lama m 424 14th Dalai Lama\n",
"1 n000002 American singer-songwriter f 315 A Fine Frenzy"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fp_identities = join(DATA_STORE_NAS, dir_dataset, 'identities.csv')\n",
"df_identities = dd.read_csv(fp_identities).set_index('index')\n",
"print(len(df_identities))\n",
"df_identities.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"class_ids = {}\n",
"for row in df_identities.itertuples():\n",
" class_id = row.class_id\n",
" index = row.Index\n",
" if class_id not in class_ids.keys():\n",
" class_ids[class_id] = index"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ac332e87bee54d53a0e29efbdfa86d65",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=3311286), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for row in tqdm(df_index.itertuples(), total=len(df_index)):\n",
" file_index = row.Index\n",
" file_row = df_files.iloc[file_index]\n",
" subdir = file_row['subdir']\n",
" class_id = subdir.split('/')[1]\n",
" identity_index = class_ids[class_id]\n",
" df_index.at[row.Index, 'identity'] = identity_index"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"fp_index_new = join(DATA_STORE_NAS, dir_dataset, 'index_new.csv')\n",
"df_index.to_csv(fp_index_new)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sha256 | \n",
" identity | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... | \n",
" 6123 | \n",
"
\n",
" \n",
" | 1 | \n",
" e360f93613baa68cede6731d2603873cdabd3993841cfd... | \n",
" 6123 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c... | \n",
" 6123 | \n",
"
\n",
" \n",
" | 3 | \n",
" 577ce218e4a61e612942c55fd172cac4b48becacbfc708... | \n",
" 6123 | \n",
"
\n",
" \n",
" | 4 | \n",
" b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f... | \n",
" 6123 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sha256 identity\n",
"index \n",
"0 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... 6123\n",
"1 e360f93613baa68cede6731d2603873cdabd3993841cfd... 6123\n",
"2 3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c... 6123\n",
"3 577ce218e4a61e612942c55fd172cac4b48becacbfc708... 6123\n",
"4 b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f... 6123"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_index.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:megapixels]",
"language": "python",
"name": "conda-env-megapixels-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}