{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Append UUID to SHA256 CSV"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [],
"source": [
"from os.path import join\n",
"from pathlib import Path\n",
"import difflib\n",
"\n",
"from tqdm import tqdm_notebook as tqdm\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/data_store_nas/datasets/people/vgg_face2/metadata/records_uuid.csv True\n",
"/data_store_nas/datasets/people/vgg_face2/metadata/records.csv True\n"
]
}
],
"source": [
"# names\n",
"DATA_STORE_NAS = '/data_store_nas/'\n",
"dir_dataset = 'datasets/people/vgg_face2/metadata'\n",
"fp_records_uuids = join(DATA_STORE_NAS, dir_dataset, 'records_uuid.csv')\n",
"fp_uuids_new = join(DATA_STORE_NAS, dir_dataset, 'uuids.csv')\n",
"# record\n",
"fp_records = join(DATA_STORE_NAS, dir_datset, 'records.csv')\n",
"fp_records_new = join(DATA_STORE_NAS, dir_datset, 'records_new.csv')\n",
"print(fp_uuids, Path(fp_uuids).is_file())\n",
"print(fp_records, Path(fp_records).is_file())"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
"def similarity(a, b):\n",
" seq = difflib.SequenceMatcher(a=a.lower(), b=b.lower())\n",
" return seq.ratio()"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"df_records = pd.read_csv(fp_records)\n",
"df_records_uuids = pd.read_csv(fp_records_uuids)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
" uuid | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... | \n",
" test/n006211 | \n",
" 88ac6abd-6039-442b-b31f-2db8d575363a | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" 0168_01 | \n",
" e360f93613baa68cede6731d2603873cdabd3993841cfd... | \n",
" test/n006211 | \n",
" 73acbc00-2cb5-4260-8db3-b88ca7c29c72 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn sha256 \\\n",
"0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n",
"1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n",
"\n",
" subdir uuid \n",
"0 test/n006211 88ac6abd-6039-442b-b31f-2db8d575363a \n",
"1 test/n006211 73acbc00-2cb5-4260-8db3-b88ca7c29c72 "
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_uuids.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" 0168_01 | \n",
" e360f93613baa68cede6731d2603873cdabd3993841cfd... | \n",
" test/n006211 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn sha256 \\\n",
"0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n",
"1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n",
"\n",
" subdir \n",
"0 test/n006211 \n",
"1 test/n006211 "
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_records.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"# fix the records and save to new csv\n",
"df_records['index'] = [''] * len(df_records)"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"for idx, row in tqdm(df_records.iterrows(), total=len(df_records)):\n",
" df_records.at[idx, 'index'] = idx"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" 0168_01 | \n",
" e360f93613baa68cede6731d2603873cdabd3993841cfd... | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 2 | \n",
" jpg | \n",
" 0213_01 | \n",
" 3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c... | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 3 | \n",
" jpg | \n",
" 0010_01 | \n",
" 577ce218e4a61e612942c55fd172cac4b48becacbfc708... | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 4 | \n",
" jpg | \n",
" 0115_01 | \n",
" b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f... | \n",
" test/n006211 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn sha256 \\\n",
"index \n",
"0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n",
"1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n",
"2 jpg 0213_01 3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c... \n",
"3 jpg 0010_01 577ce218e4a61e612942c55fd172cac4b48becacbfc708... \n",
"4 jpg 0115_01 b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f... \n",
"\n",
" subdir \n",
"index \n",
"0 test/n006211 \n",
"1 test/n006211 \n",
"2 test/n006211 \n",
"3 test/n006211 \n",
"4 test/n006211 "
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#df_records.set_index('index')\n",
"#df_records.head()\n",
"df_records.to_csv(fp_records_new)"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"df_files = df_records.copy()\n",
"fp_files = join(DATA_STORE_NAS, dir_datset, 'files.csv')"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"df_files = df_files.drop(['sha256'], axis=1)\n",
"df_files.to_csv(fp_files)"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"# make another CSV just for the sha\n",
"df_sha256s = df_records.copy()\n",
"fp_sha256s = join(DATA_STORE_NAS, dir_datset, 'sha256s.csv')\n",
"df_sha256s = df_sha256s.drop(['ext', 'fn', 'subdir'], axis=1)\n",
"df_sha256s.to_csv(fp_sha256s)\n"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
" uuid | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... | \n",
" test/n006211 | \n",
" 88ac6abd-6039-442b-b31f-2db8d575363a | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" 0168_01 | \n",
" e360f93613baa68cede6731d2603873cdabd3993841cfd... | \n",
" test/n006211 | \n",
" 73acbc00-2cb5-4260-8db3-b88ca7c29c72 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn sha256 \\\n",
"0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n",
"1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n",
"\n",
" subdir uuid \n",
"0 test/n006211 88ac6abd-6039-442b-b31f-2db8d575363a \n",
"1 test/n006211 73acbc00-2cb5-4260-8db3-b88ca7c29c72 "
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create another CSV just for the UUIDs\n",
"df_records_uuids.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
"df_uuids = df_records_uuids.copy()"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"df_uuids = df_uuids.drop(['subdir', 'fn', 'ext', 'sha256'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" uuid | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 88ac6abd-6039-442b-b31f-2db8d575363a | \n",
"
\n",
" \n",
" | 1 | \n",
" 73acbc00-2cb5-4260-8db3-b88ca7c29c72 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" uuid\n",
"0 88ac6abd-6039-442b-b31f-2db8d575363a\n",
"1 73acbc00-2cb5-4260-8db3-b88ca7c29c72"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_uuids.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"df_uuids.index.name = 'index'\n",
"df_uuids.to_csv(fp_uuids_new)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for idx, row in tqdm(df_records_uuids[:2].iterrows(), total=len(df_records_uuids)):\n",
" sha256 = row['sha256']\n",
" row_match = df_records.loc[(df_records['subdir'] == subdir)]\n",
" df_rois.at[idx, 'idx'] = int(row_match.index[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Change ROI to use index"
]
},
{
"cell_type": "code",
"execution_count": 372,
"metadata": {},
"outputs": [],
"source": [
"fp_rois = join(DATA_STORE_NAS, dir_datset, 'rois.csv')\n",
"df_rois = pd.read_csv(fp_rois)"
]
},
{
"cell_type": "code",
"execution_count": 373,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" 0168_01 | \n",
" e360f93613baa68cede6731d2603873cdabd3993841cfd... | \n",
" test/n006211 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn sha256 \\\n",
"index \n",
"0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n",
"1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n",
"\n",
" subdir \n",
"index \n",
"0 test/n006211 \n",
"1 test/n006211 "
]
},
"execution_count": 373,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_records.index.name = 'index'\n",
"df_records.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 374,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" ext | \n",
" fn | \n",
" h | \n",
" image_height | \n",
" image_width | \n",
" subdir | \n",
" w | \n",
" x | \n",
" y | \n",
" index_new | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" 0.473333 | \n",
" 304 | \n",
" 214 | \n",
" test/n006211 | \n",
" 0.668246 | \n",
" 0.279621 | \n",
" 0.28 | \n",
" -1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" jpg | \n",
" 0089_01 | \n",
" 0.326667 | \n",
" 304 | \n",
" 214 | \n",
" test/n006211 | \n",
" 0.464455 | \n",
" -0.156398 | \n",
" 0.12 | \n",
" -1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 ext fn h image_height image_width \\\n",
"index \n",
"0 0 jpg 0089_01 0.473333 304 214 \n",
"1 1 jpg 0089_01 0.326667 304 214 \n",
"\n",
" subdir w x y index_new \n",
"index \n",
"0 test/n006211 0.668246 0.279621 0.28 -1 \n",
"1 test/n006211 0.464455 -0.156398 0.12 -1 "
]
},
"execution_count": 374,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_rois['index_new'] = [-1] * len(df_rois)\n",
"df_rois.index.name = 'index'\n",
"df_rois.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 375,
"metadata": {},
"outputs": [],
"source": [
"df_records_copy = df_records.copy()"
]
},
{
"cell_type": "code",
"execution_count": 376,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9131\n"
]
}
],
"source": [
"df_records_subdirs = df_records_copy.groupby('subdir')\n",
"print(len(df_records_subdirs))"
]
},
{
"cell_type": "code",
"execution_count": 377,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9131\n"
]
}
],
"source": [
"roi_subdir_groups = df_rois.groupby('subdir')\n",
"print(len(roi_subdir_groups))"
]
},
{
"cell_type": "code",
"execution_count": 387,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "module 'pandas' has no attribute 'index'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mhelp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m: module 'pandas' has no attribute 'index'"
]
}
],
"source": [
"help(pd.index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 390,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"81817\n"
]
}
],
"source": [
"print(row.Index)"
]
},
{
"cell_type": "code",
"execution_count": 392,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "91680fc6bee04ce087a60be57ab5a58c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=9131), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for subdir, record_group in tqdm(df_records_subdirs, total=df_records_subdirs.ngroups):\n",
" #print(name) # 'test/n00001'\n",
" roi_group = roi_subdir_groups.get_group(subdir)\n",
"# print(type(roi_group))\n",
" # for every item in the roi_group, get index from record group\n",
" for row in roi_group.itertuples():\n",
" #row_match = record_group.loc[record_group['fn'] == row.fn]\n",
" # get the index from record group where it matches this fn\n",
" #print(len(record_group))\n",
" #record_group.where('fn',row.fn)\n",
" row_match = record_group.loc[(record_group['fn'] == row.fn)]\n",
" df_rois.at[row.Index, 'index_new'] = int(row_match.index[0])\n",
" #record_group[record_group['fn'].str.match(fn)]\n",
" \n",
" #print(int(row_match.index[0]))\n",
" #print('subdir: {}, fn: {}, index: {}'.format(row.subdir, row.fn, master_index))\n",
" \n",
" # NB avoid using iterrows() is very slow. use iteritems\n",
" #print(roi_row['fn'])\n",
" #print(row.at['subdir', 0])"
]
},
{
"cell_type": "code",
"execution_count": 411,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" ext | \n",
" fn | \n",
" h | \n",
" image_height | \n",
" image_width | \n",
" subdir | \n",
" w | \n",
" x | \n",
" y | \n",
" index_new | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" 0.473333 | \n",
" 304 | \n",
" 214 | \n",
" test/n006211 | \n",
" 0.668246 | \n",
" 0.279621 | \n",
" 0.28 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" jpg | \n",
" 0089_01 | \n",
" 0.326667 | \n",
" 304 | \n",
" 214 | \n",
" test/n006211 | \n",
" 0.464455 | \n",
" -0.156398 | \n",
" 0.12 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 ext fn h image_height image_width \\\n",
"index \n",
"0 0 jpg 0089_01 0.473333 304 214 \n",
"1 1 jpg 0089_01 0.326667 304 214 \n",
"\n",
" subdir w x y index_new \n",
"index \n",
"0 test/n006211 0.668246 0.279621 0.28 0 \n",
"1 test/n006211 0.464455 -0.156398 0.12 0 "
]
},
"execution_count": 411,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_rois_new = df_rois.copy()\n",
"df_rois_new.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 413,
"metadata": {},
"outputs": [],
"source": [
"df_rois_new = df_rois_new.drop(df_rois_new.columns[df_rois_new.columns.str.contains('unnamed',case = False)],axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 422,
"metadata": {},
"outputs": [],
"source": [
"df_rois_new = df_rois_new.set_index('index_new')"
]
},
{
"cell_type": "code",
"execution_count": 423,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ext', 'fn', 'h', 'image_height', 'image_width', 'subdir', 'w', 'x', 'y']"
]
},
"execution_count": 423,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(df_rois_new.columns.values)"
]
},
{
"cell_type": "code",
"execution_count": 425,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" h | \n",
" image_height | \n",
" image_width | \n",
" subdir | \n",
" w | \n",
" x | \n",
" y | \n",
"
\n",
" \n",
" | index_new | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" 0.473333 | \n",
" 304 | \n",
" 214 | \n",
" test/n006211 | \n",
" 0.668246 | \n",
" 0.279621 | \n",
" 0.280000 | \n",
"
\n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" 0.326667 | \n",
" 304 | \n",
" 214 | \n",
" test/n006211 | \n",
" 0.464455 | \n",
" -0.156398 | \n",
" 0.120000 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" 0168_01 | \n",
" 0.393333 | \n",
" 471 | \n",
" 419 | \n",
" test/n006211 | \n",
" 0.443609 | \n",
" 0.263158 | \n",
" 0.273333 | \n",
"
\n",
" \n",
" | 2 | \n",
" jpg | \n",
" 0213_01 | \n",
" 0.462745 | \n",
" 408 | \n",
" 480 | \n",
" test/n006211 | \n",
" 0.393333 | \n",
" 0.246667 | \n",
" 0.082353 | \n",
"
\n",
" \n",
" | 4 | \n",
" jpg | \n",
" 0115_01 | \n",
" 0.438662 | \n",
" 360 | \n",
" 401 | \n",
" test/n006211 | \n",
" 0.393333 | \n",
" 0.286667 | \n",
" 0.245353 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn h image_height image_width subdir \\\n",
"index_new \n",
"0 jpg 0089_01 0.473333 304 214 test/n006211 \n",
"0 jpg 0089_01 0.326667 304 214 test/n006211 \n",
"1 jpg 0168_01 0.393333 471 419 test/n006211 \n",
"2 jpg 0213_01 0.462745 408 480 test/n006211 \n",
"4 jpg 0115_01 0.438662 360 401 test/n006211 \n",
"\n",
" w x y \n",
"index_new \n",
"0 0.668246 0.279621 0.280000 \n",
"0 0.464455 -0.156398 0.120000 \n",
"1 0.443609 0.263158 0.273333 \n",
"2 0.393333 0.246667 0.082353 \n",
"4 0.393333 0.286667 0.245353 "
]
},
"execution_count": 425,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_rois_new.head()"
]
},
{
"cell_type": "code",
"execution_count": 426,
"metadata": {},
"outputs": [],
"source": [
"fp_rois_new = join(DATA_STORE_NAS, dir_dataset, 'rois_new.csv')\n",
"df_rois_new.to_csv(fp_rois_new)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fix identity meta"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 458,
"metadata": {},
"outputs": [],
"source": [
"df_identity_meta = pd.read_csv(fp_identity_meta)\n",
"df_files = pd.read_csv(fp_files).set_index('index')"
]
},
{
"cell_type": "code",
"execution_count": 459,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" subdir | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" test/n006211 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" 0168_01 | \n",
" test/n006211 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn subdir\n",
"index \n",
"0 jpg 0089_01 test/n006211\n",
"1 jpg 0168_01 test/n006211"
]
},
"execution_count": 459,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_files.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 460,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" class_id | \n",
" description | \n",
" gender | \n",
" images | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" n009279 | \n",
" Former soccer player | \n",
" f | \n",
" 365 | \n",
" Noriko Baba | \n",
"
\n",
" \n",
" | 1 | \n",
" n009278 | \n",
" Japanese singer-songwriter | \n",
" f | \n",
" 181 | \n",
" Hiromi Satō | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" class_id description gender images name\n",
"0 n009279 Former soccer player f 365 Noriko Baba\n",
"1 n009278 Japanese singer-songwriter f 181 Hiromi Satō"
]
},
"execution_count": 460,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_identity_meta.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 461,
"metadata": {},
"outputs": [],
"source": [
"# create a lookup table of ids\n",
"class_ids = {}\n",
"for row in df_files.itertuples():\n",
" class_id = row.subdir.split('/')[1]\n",
" if class_id not in class_ids.keys():\n",
" class_ids[class_id] = row.Index"
]
},
{
"cell_type": "code",
"execution_count": 463,
"metadata": {},
"outputs": [],
"source": [
"df_identity_meta['index_new'] = [-1] * len(df_identity_meta)"
]
},
{
"cell_type": "code",
"execution_count": 464,
"metadata": {},
"outputs": [],
"source": [
"# add col to identity for 'index_new'\n",
"for row in df_identity_meta.itertuples():\n",
" df_identity_meta.at[row.Index, 'index_new'] = class_ids[row.class_id]\n",
"# iterate through"
]
},
{
"cell_type": "code",
"execution_count": 465,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" class_id | \n",
" description | \n",
" gender | \n",
" images | \n",
" name | \n",
" index_new | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" n009279 | \n",
" Former soccer player | \n",
" f | \n",
" 365 | \n",
" Noriko Baba | \n",
" 1808008 | \n",
"
\n",
" \n",
" | 1 | \n",
" n009278 | \n",
" Japanese singer-songwriter | \n",
" f | \n",
" 181 | \n",
" Hiromi Satō | \n",
" 943052 | \n",
"
\n",
" \n",
" | 2 | \n",
" n009277 | \n",
" Japanese fashion model | \n",
" m | \n",
" 409 | \n",
" Ranko Kanbe | \n",
" 595852 | \n",
"
\n",
" \n",
" | 3 | \n",
" n009276 | \n",
" Japanese musician | \n",
" f | \n",
" 177 | \n",
" Yurie Matsui | \n",
" 2922103 | \n",
"
\n",
" \n",
" | 4 | \n",
" n009275 | \n",
" Japanese idol | \n",
" f | \n",
" 501 | \n",
" Karin Miyamoto | \n",
" 1388262 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" class_id description gender images name \\\n",
"0 n009279 Former soccer player f 365 Noriko Baba \n",
"1 n009278 Japanese singer-songwriter f 181 Hiromi Satō \n",
"2 n009277 Japanese fashion model m 409 Ranko Kanbe \n",
"3 n009276 Japanese musician f 177 Yurie Matsui \n",
"4 n009275 Japanese idol f 501 Karin Miyamoto \n",
"\n",
" index_new \n",
"0 1808008 \n",
"1 943052 \n",
"2 595852 \n",
"3 2922103 \n",
"4 1388262 "
]
},
"execution_count": 465,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_identity_meta.set_index('index_new')\n",
"df_identity_meta.head()"
]
},
{
"cell_type": "code",
"execution_count": 466,
"metadata": {},
"outputs": [],
"source": [
"fp_identity_meta_new = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')\n",
"df_identity_meta.to_csv(fp_identity_meta_new)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for idx, row in tqdm(df_records.iterrows(), total=len(df_rois)):\n",
" subdir = row['subdir']\n",
" fn = row['fn']\n",
" row_match_subdir = df_records_copy.loc[(df_records_copy['subdir'] == subdir)]\n",
" for idx_subdir, row_subdir in row_match_subdir.iterrows(): \n",
" row_match_fn = row_match.loc[(row_match['fn'] == fn)]\n",
" df_rois.at[idx, 'index'] = \n",
" int(row_match.index[0])\n",
" df_records.drop(df.index[2])"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "04ad99a7cba9443ebee5b26a1c4cddf1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=3325795), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"Exception ignored in: \n",
"Traceback (most recent call last):\n",
" File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 931, in __del__\n",
" self.close()\n",
" File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 1133, in close\n",
" self._decr_instances(self)\n",
" File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 496, in _decr_instances\n",
" cls.monitor.exit()\n",
" File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_monitor.py\", line 52, in exit\n",
" self.join()\n",
" File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/threading.py\", line 1053, in join\n",
" raise RuntimeError(\"cannot join current thread\")\n",
"RuntimeError: cannot join current thread\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0msubdir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mfn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fn'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mrow_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_records\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_records\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0msubdir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mrow_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow_match\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_match\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fn'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mdf_rois\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'index'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_match\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, other, axis)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1282\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrstate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1283\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mna_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1285\u001b[0m raise TypeError('Could not compare {typ} type with Series'\n",
"\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mna_op\u001b[0;34m(x, y)\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1142\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_object_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1143\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_comp_method_OBJECT_ARRAY\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1145\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_datetimelike_v_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36m_comp_method_OBJECT_ARRAY\u001b[0;34m(op, x, y)\u001b[0m\n\u001b[1;32m 1120\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvec_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1121\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1122\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscalar_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1123\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1124\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"for idx, row in tqdm(df_rois.iterrows(), total=len(df_rois)):\n",
" subdir = row['subdir']\n",
" fn = row['fn']\n",
" row_match_subdir = df_records_copy.loc[(df_records_copy['subdir'] == subdir)]\n",
" for idx_subdir, row_subdir in row_match_subdir.iterrows(): \n",
" row_match_fn = row_match.loc[(row_match['fn'] == fn)]\n",
" df_rois.at[idx, 'index'] = int(row_match.index[0])\n",
" df_records.drop(df.index[2])"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"subdir = 'test/n006211'\n",
"rows_records = df_records.loc[df_records['subdir'] == subdir]\n",
"rows_rois = df_rois.loc[df_rois['subdir'] == subdir ]"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" ext | \n",
" fn | \n",
" h | \n",
" image_height | \n",
" image_width | \n",
" subdir | \n",
" w | \n",
" x | \n",
" y | \n",
" index | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" jpg | \n",
" 0089_01 | \n",
" 0.473333 | \n",
" 304 | \n",
" 214 | \n",
" test/n006211 | \n",
" 0.668246 | \n",
" 0.279621 | \n",
" 0.280000 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" jpg | \n",
" 0089_01 | \n",
" 0.326667 | \n",
" 304 | \n",
" 214 | \n",
" test/n006211 | \n",
" 0.464455 | \n",
" -0.156398 | \n",
" 0.120000 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" jpg | \n",
" 0168_01 | \n",
" 0.393333 | \n",
" 471 | \n",
" 419 | \n",
" test/n006211 | \n",
" 0.443609 | \n",
" 0.263158 | \n",
" 0.273333 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" jpg | \n",
" 0213_01 | \n",
" 0.462745 | \n",
" 408 | \n",
" 480 | \n",
" test/n006211 | \n",
" 0.393333 | \n",
" 0.246667 | \n",
" 0.082353 | \n",
" 2 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" jpg | \n",
" 0115_01 | \n",
" 0.438662 | \n",
" 360 | \n",
" 401 | \n",
" test/n006211 | \n",
" 0.393333 | \n",
" 0.286667 | \n",
" 0.245353 | \n",
" 4 | \n",
"
\n",
" \n",
" | 5 | \n",
" 5 | \n",
" jpg | \n",
" 0511_02 | \n",
" 0.393333 | \n",
" 417 | \n",
" 415 | \n",
" test/n006211 | \n",
" 0.395973 | \n",
" 0.265101 | \n",
" 0.233333 | \n",
" 6 | \n",
"
\n",
" \n",
" | 6 | \n",
" 6 | \n",
" jpg | \n",
" 0032_01 | \n",
" 0.393333 | \n",
" 143 | \n",
" 125 | \n",
" test/n006211 | \n",
" 0.450382 | \n",
" 0.255725 | \n",
" 0.313333 | \n",
" 7 | \n",
"
\n",
" \n",
" | 7 | \n",
" 7 | \n",
" jpg | \n",
" 0502_01 | \n",
" 0.326667 | \n",
" 288 | \n",
" 252 | \n",
" test/n006211 | \n",
" 0.377863 | \n",
" 0.603053 | \n",
" 0.583333 | \n",
" 8 | \n",
"
\n",
" \n",
" | 8 | \n",
" 8 | \n",
" jpg | \n",
" 0502_01 | \n",
" 0.393333 | \n",
" 288 | \n",
" 252 | \n",
" test/n006211 | \n",
" 0.450382 | \n",
" 0.301527 | \n",
" 0.313333 | \n",
" 8 | \n",
"
\n",
" \n",
" | 9 | \n",
" 9 | \n",
" jpg | \n",
" 0201_01 | \n",
" 0.393333 | \n",
" 187 | \n",
" 164 | \n",
" test/n006211 | \n",
" 0.448669 | \n",
" 0.243346 | \n",
" 0.313333 | \n",
" 9 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 ext fn h image_height image_width \\\n",
"0 0 jpg 0089_01 0.473333 304 214 \n",
"1 1 jpg 0089_01 0.326667 304 214 \n",
"2 2 jpg 0168_01 0.393333 471 419 \n",
"3 3 jpg 0213_01 0.462745 408 480 \n",
"4 4 jpg 0115_01 0.438662 360 401 \n",
"5 5 jpg 0511_02 0.393333 417 415 \n",
"6 6 jpg 0032_01 0.393333 143 125 \n",
"7 7 jpg 0502_01 0.326667 288 252 \n",
"8 8 jpg 0502_01 0.393333 288 252 \n",
"9 9 jpg 0201_01 0.393333 187 164 \n",
"\n",
" subdir w x y index \n",
"0 test/n006211 0.668246 0.279621 0.280000 0 \n",
"1 test/n006211 0.464455 -0.156398 0.120000 0 \n",
"2 test/n006211 0.443609 0.263158 0.273333 1 \n",
"3 test/n006211 0.393333 0.246667 0.082353 2 \n",
"4 test/n006211 0.393333 0.286667 0.245353 4 \n",
"5 test/n006211 0.395973 0.265101 0.233333 6 \n",
"6 test/n006211 0.450382 0.255725 0.313333 7 \n",
"7 test/n006211 0.377863 0.603053 0.583333 8 \n",
"8 test/n006211 0.450382 0.301527 0.313333 8 \n",
"9 test/n006211 0.448669 0.243346 0.313333 9 "
]
},
"execution_count": 181,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_rois.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"df_meta.to_csv(fp_out)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"fp_meta = join(dir_dataset, 'identity_meta_kg_clean.csv')\n",
"df = pd.read_csv(fp_meta)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"descs = []\n",
"for idx, row in df.iterrows():\n",
" descs.append(row['description'])"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"descs = set(descs)"
]
},
{
"cell_type": "code",
"execution_count": 472,
"metadata": {},
"outputs": [],
"source": [
"fp_identity_meta = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')"
]
},
{
"cell_type": "code",
"execution_count": 473,
"metadata": {},
"outputs": [],
"source": [
"df_identity_meta = pd.read_csv(fp_identity_meta)"
]
},
{
"cell_type": "code",
"execution_count": 474,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" class_id | \n",
" description | \n",
" gender | \n",
" images | \n",
" name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" n000001 | \n",
" Dalai Lama | \n",
" m | \n",
" 424 | \n",
" 14th Dalai Lama | \n",
"
\n",
" \n",
" | 1 | \n",
" n000002 | \n",
" American singer-songwriter | \n",
" f | \n",
" 315 | \n",
" A Fine Frenzy | \n",
"
\n",
" \n",
" | 2 | \n",
" n000003 | \n",
" British writer | \n",
" m | \n",
" 205 | \n",
" A. A. Gill | \n",
"
\n",
" \n",
" | 3 | \n",
" n000004 | \n",
" Canadian-Irish actor | \n",
" m | \n",
" 387 | \n",
" AJ Buckley | \n",
"
\n",
" \n",
" | 4 | \n",
" n000005 | \n",
" Baseball catcher | \n",
" m | \n",
" 229 | \n",
" AJ Pierzynski | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" class_id description gender images name\n",
"0 n000001 Dalai Lama m 424 14th Dalai Lama\n",
"1 n000002 American singer-songwriter f 315 A Fine Frenzy\n",
"2 n000003 British writer m 205 A. A. Gill\n",
"3 n000004 Canadian-Irish actor m 387 AJ Buckley\n",
"4 n000005 Baseball catcher m 229 AJ Pierzynski"
]
},
"execution_count": 474,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_identity_meta.head()"
]
},
{
"cell_type": "code",
"execution_count": 475,
"metadata": {},
"outputs": [],
"source": [
"df_identity_meta.index.name = 'index'"
]
},
{
"cell_type": "code",
"execution_count": 476,
"metadata": {},
"outputs": [],
"source": [
"df_identity_meta.to_csv(fp_identity_meta)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:megapixels]",
"language": "python",
"name": "conda-env-megapixels-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}