{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Append UUID to SHA256 CSV" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [], "source": [ "from os.path import join\n", "from pathlib import Path\n", "import difflib\n", "\n", "from tqdm import tqdm_notebook as tqdm\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/data_store_nas/datasets/people/vgg_face2/metadata/records_uuid.csv True\n", "/data_store_nas/datasets/people/vgg_face2/metadata/records.csv True\n" ] } ], "source": [ "# names\n", "DATA_STORE_NAS = '/data_store_nas/'\n", "dir_dataset = 'datasets/people/vgg_face2/metadata'\n", "fp_records_uuids = join(DATA_STORE_NAS, dir_dataset, 'records_uuid.csv')\n", "fp_uuids_new = join(DATA_STORE_NAS, dir_dataset, 'uuids.csv')\n", "# record\n", "fp_records = join(DATA_STORE_NAS, dir_datset, 'records.csv')\n", "fp_records_new = join(DATA_STORE_NAS, dir_datset, 'records_new.csv')\n", "print(fp_uuids, Path(fp_uuids).is_file())\n", "print(fp_records, Path(fp_records).is_file())" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [], "source": [ "def similarity(a, b):\n", " seq = difflib.SequenceMatcher(a=a.lower(), b=b.lower())\n", " return seq.ratio()" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [], "source": [ "df_records = pd.read_csv(fp_records)\n", "df_records_uuids = pd.read_csv(fp_records_uuids)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdiruuid
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n00621188ac6abd-6039-442b-b31f-2db8d575363a
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n00621173acbc00-2cb5-4260-8db3-b88ca7c29c72
\n", "
" ], "text/plain": [ " ext fn sha256 \\\n", "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", "\n", " subdir uuid \n", "0 test/n006211 88ac6abd-6039-442b-b31f-2db8d575363a \n", "1 test/n006211 73acbc00-2cb5-4260-8db3-b88ca7c29c72 " ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_uuids.head(2)" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdir
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n006211
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n006211
\n", "
" ], "text/plain": [ " ext fn sha256 \\\n", "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", "\n", " subdir \n", "0 test/n006211 \n", "1 test/n006211 " ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_records.head(2)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "# fix the records and save to new csv\n", "df_records['index'] = [''] * len(df_records)" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "for idx, row in tqdm(df_records.iterrows(), total=len(df_records)):\n", " df_records.at[idx, 'index'] = idx" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdir
index
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n006211
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n006211
2jpg0213_013920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c...test/n006211
3jpg0010_01577ce218e4a61e612942c55fd172cac4b48becacbfc708...test/n006211
4jpg0115_01b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f...test/n006211
\n", "
" ], "text/plain": [ " ext fn sha256 \\\n", "index \n", "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", "2 jpg 0213_01 3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c... \n", "3 jpg 0010_01 577ce218e4a61e612942c55fd172cac4b48becacbfc708... \n", "4 jpg 0115_01 b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f... \n", "\n", " subdir \n", "index \n", "0 test/n006211 \n", "1 test/n006211 \n", "2 test/n006211 \n", "3 test/n006211 \n", "4 test/n006211 " ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#df_records.set_index('index')\n", "#df_records.head()\n", "df_records.to_csv(fp_records_new)" ] }, { "cell_type": "code", "execution_count": 131, "metadata": {}, "outputs": [], "source": [ "df_files = df_records.copy()\n", "fp_files = join(DATA_STORE_NAS, dir_datset, 'files.csv')" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [], "source": [ "df_files = df_files.drop(['sha256'], axis=1)\n", "df_files.to_csv(fp_files)" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [], "source": [ "# make another CSV just for the sha\n", "df_sha256s = df_records.copy()\n", "fp_sha256s = join(DATA_STORE_NAS, dir_datset, 'sha256s.csv')\n", "df_sha256s = df_sha256s.drop(['ext', 'fn', 'subdir'], axis=1)\n", "df_sha256s.to_csv(fp_sha256s)\n" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdiruuid
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n00621188ac6abd-6039-442b-b31f-2db8d575363a
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n00621173acbc00-2cb5-4260-8db3-b88ca7c29c72
\n", "
" ], "text/plain": [ " ext fn sha256 \\\n", "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", "\n", " subdir uuid \n", "0 test/n006211 88ac6abd-6039-442b-b31f-2db8d575363a \n", "1 test/n006211 73acbc00-2cb5-4260-8db3-b88ca7c29c72 " ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create another CSV just for the UUIDs\n", "df_records_uuids.head(2)" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [], "source": [ "df_uuids = df_records_uuids.copy()" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [], "source": [ "df_uuids = df_uuids.drop(['subdir', 'fn', 'ext', 'sha256'], axis=1)" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uuid
088ac6abd-6039-442b-b31f-2db8d575363a
173acbc00-2cb5-4260-8db3-b88ca7c29c72
\n", "
" ], "text/plain": [ " uuid\n", "0 88ac6abd-6039-442b-b31f-2db8d575363a\n", "1 73acbc00-2cb5-4260-8db3-b88ca7c29c72" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_uuids.head(2)" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "df_uuids.index.name = 'index'\n", "df_uuids.to_csv(fp_uuids_new)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for idx, row in tqdm(df_records_uuids[:2].iterrows(), total=len(df_records_uuids)):\n", " sha256 = row['sha256']\n", " row_match = df_records.loc[(df_records['subdir'] == subdir)]\n", " df_rois.at[idx, 'idx'] = int(row_match.index[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Change ROI to use index" ] }, { "cell_type": "code", "execution_count": 372, "metadata": {}, "outputs": [], "source": [ "fp_rois = join(DATA_STORE_NAS, dir_datset, 'rois.csv')\n", "df_rois = pd.read_csv(fp_rois)" ] }, { "cell_type": "code", "execution_count": 373, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdir
index
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n006211
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n006211
\n", "
" ], "text/plain": [ " ext fn sha256 \\\n", "index \n", "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", "\n", " subdir \n", "index \n", "0 test/n006211 \n", "1 test/n006211 " ] }, "execution_count": 373, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_records.index.name = 'index'\n", "df_records.head(2)" ] }, { "cell_type": "code", "execution_count": 374, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0extfnhimage_heightimage_widthsubdirwxyindex_new
index
00jpg0089_010.473333304214test/n0062110.6682460.2796210.28-1
11jpg0089_010.326667304214test/n0062110.464455-0.1563980.12-1
\n", "
" ], "text/plain": [ " Unnamed: 0 ext fn h image_height image_width \\\n", "index \n", "0 0 jpg 0089_01 0.473333 304 214 \n", "1 1 jpg 0089_01 0.326667 304 214 \n", "\n", " subdir w x y index_new \n", "index \n", "0 test/n006211 0.668246 0.279621 0.28 -1 \n", "1 test/n006211 0.464455 -0.156398 0.12 -1 " ] }, "execution_count": 374, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rois['index_new'] = [-1] * len(df_rois)\n", "df_rois.index.name = 'index'\n", "df_rois.head(2)" ] }, { "cell_type": "code", "execution_count": 375, "metadata": {}, "outputs": [], "source": [ "df_records_copy = df_records.copy()" ] }, { "cell_type": "code", "execution_count": 376, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9131\n" ] } ], "source": [ "df_records_subdirs = df_records_copy.groupby('subdir')\n", "print(len(df_records_subdirs))" ] }, { "cell_type": "code", "execution_count": 377, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9131\n" ] } ], "source": [ "roi_subdir_groups = df_rois.groupby('subdir')\n", "print(len(roi_subdir_groups))" ] }, { "cell_type": "code", "execution_count": 387, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "module 'pandas' has no attribute 'index'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mhelp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m: module 'pandas' has no attribute 'index'" ] } ], "source": [ "help(pd.index)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": 
[], "source": [] }, { "cell_type": "code", "execution_count": 390, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "81817\n" ] } ], "source": [ "print(row.Index)" ] }, { "cell_type": "code", "execution_count": 392, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "91680fc6bee04ce087a60be57ab5a58c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=9131), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "for subdir, record_group in tqdm(df_records_subdirs, total=df_records_subdirs.ngroups):\n", " #print(name) # 'test/n00001'\n", " roi_group = roi_subdir_groups.get_group(subdir)\n", "# print(type(roi_group))\n", " # for every item in the roi_group, get index from record group\n", " for row in roi_group.itertuples():\n", " #row_match = record_group.loc[record_group['fn'] == row.fn]\n", " # get the index from record group where it matches this fn\n", " #print(len(record_group))\n", " #record_group.where('fn',row.fn)\n", " row_match = record_group.loc[(record_group['fn'] == row.fn)]\n", " df_rois.at[row.Index, 'index_new'] = int(row_match.index[0])\n", " #record_group[record_group['fn'].str.match(fn)]\n", " \n", " #print(int(row_match.index[0]))\n", " #print('subdir: {}, fn: {}, index: {}'.format(row.subdir, row.fn, master_index))\n", " \n", " # NB avoid using iterrows() is very slow. use iteritems\n", " #print(roi_row['fn'])\n", " #print(row.at['subdir', 0])" ] }, { "cell_type": "code", "execution_count": 411, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0extfnhimage_heightimage_widthsubdirwxyindex_new
index
00jpg0089_010.473333304214test/n0062110.6682460.2796210.280
11jpg0089_010.326667304214test/n0062110.464455-0.1563980.120
\n", "
" ], "text/plain": [ " Unnamed: 0 ext fn h image_height image_width \\\n", "index \n", "0 0 jpg 0089_01 0.473333 304 214 \n", "1 1 jpg 0089_01 0.326667 304 214 \n", "\n", " subdir w x y index_new \n", "index \n", "0 test/n006211 0.668246 0.279621 0.28 0 \n", "1 test/n006211 0.464455 -0.156398 0.12 0 " ] }, "execution_count": 411, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rois_new = df_rois.copy()\n", "df_rois_new.head(2)" ] }, { "cell_type": "code", "execution_count": 413, "metadata": {}, "outputs": [], "source": [ "df_rois_new = df_rois_new.drop(df_rois_new.columns[df_rois_new.columns.str.contains('unnamed',case = False)],axis = 1)" ] }, { "cell_type": "code", "execution_count": 422, "metadata": {}, "outputs": [], "source": [ "df_rois_new = df_rois_new.set_index('index_new')" ] }, { "cell_type": "code", "execution_count": 423, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['ext', 'fn', 'h', 'image_height', 'image_width', 'subdir', 'w', 'x', 'y']" ] }, "execution_count": 423, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(df_rois_new.columns.values)" ] }, { "cell_type": "code", "execution_count": 425, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnhimage_heightimage_widthsubdirwxy
index_new
0jpg0089_010.473333304214test/n0062110.6682460.2796210.280000
0jpg0089_010.326667304214test/n0062110.464455-0.1563980.120000
1jpg0168_010.393333471419test/n0062110.4436090.2631580.273333
2jpg0213_010.462745408480test/n0062110.3933330.2466670.082353
4jpg0115_010.438662360401test/n0062110.3933330.2866670.245353
\n", "
" ], "text/plain": [ " ext fn h image_height image_width subdir \\\n", "index_new \n", "0 jpg 0089_01 0.473333 304 214 test/n006211 \n", "0 jpg 0089_01 0.326667 304 214 test/n006211 \n", "1 jpg 0168_01 0.393333 471 419 test/n006211 \n", "2 jpg 0213_01 0.462745 408 480 test/n006211 \n", "4 jpg 0115_01 0.438662 360 401 test/n006211 \n", "\n", " w x y \n", "index_new \n", "0 0.668246 0.279621 0.280000 \n", "0 0.464455 -0.156398 0.120000 \n", "1 0.443609 0.263158 0.273333 \n", "2 0.393333 0.246667 0.082353 \n", "4 0.393333 0.286667 0.245353 " ] }, "execution_count": 425, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rois_new.head()" ] }, { "cell_type": "code", "execution_count": 426, "metadata": {}, "outputs": [], "source": [ "fp_rois_new = join(DATA_STORE_NAS, dir_dataset, 'rois_new.csv')\n", "df_rois_new.to_csv(fp_rois_new)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fix identity meta" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 458, "metadata": {}, "outputs": [], "source": [ "df_identity_meta = pd.read_csv(fp_identity_meta)\n", "df_files = pd.read_csv(fp_files).set_index('index')" ] }, { "cell_type": "code", "execution_count": 459, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsubdir
index
0jpg0089_01test/n006211
1jpg0168_01test/n006211
\n", "
" ], "text/plain": [ " ext fn subdir\n", "index \n", "0 jpg 0089_01 test/n006211\n", "1 jpg 0168_01 test/n006211" ] }, "execution_count": 459, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_files.head(2)" ] }, { "cell_type": "code", "execution_count": 460, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
class_iddescriptiongenderimagesname
0n009279Former soccer playerf365Noriko Baba
1n009278Japanese singer-songwriterf181Hiromi Satō
\n", "
" ], "text/plain": [ " class_id description gender images name\n", "0 n009279 Former soccer player f 365 Noriko Baba\n", "1 n009278 Japanese singer-songwriter f 181 Hiromi Satō" ] }, "execution_count": 460, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_identity_meta.head(2)" ] }, { "cell_type": "code", "execution_count": 461, "metadata": {}, "outputs": [], "source": [ "# create a lookup table of ids\n", "class_ids = {}\n", "for row in df_files.itertuples():\n", " class_id = row.subdir.split('/')[1]\n", " if class_id not in class_ids.keys():\n", " class_ids[class_id] = row.Index" ] }, { "cell_type": "code", "execution_count": 463, "metadata": {}, "outputs": [], "source": [ "df_identity_meta['index_new'] = [-1] * len(df_identity_meta)" ] }, { "cell_type": "code", "execution_count": 464, "metadata": {}, "outputs": [], "source": [ "# add col to identity for 'index_new'\n", "for row in df_identity_meta.itertuples():\n", " df_identity_meta.at[row.Index, 'index_new'] = class_ids[row.class_id]\n", "# iterate through" ] }, { "cell_type": "code", "execution_count": 465, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
class_iddescriptiongenderimagesnameindex_new
0n009279Former soccer playerf365Noriko Baba1808008
1n009278Japanese singer-songwriterf181Hiromi Satō943052
2n009277Japanese fashion modelm409Ranko Kanbe595852
3n009276Japanese musicianf177Yurie Matsui2922103
4n009275Japanese idolf501Karin Miyamoto1388262
\n", "
" ], "text/plain": [ " class_id description gender images name \\\n", "0 n009279 Former soccer player f 365 Noriko Baba \n", "1 n009278 Japanese singer-songwriter f 181 Hiromi Satō \n", "2 n009277 Japanese fashion model m 409 Ranko Kanbe \n", "3 n009276 Japanese musician f 177 Yurie Matsui \n", "4 n009275 Japanese idol f 501 Karin Miyamoto \n", "\n", " index_new \n", "0 1808008 \n", "1 943052 \n", "2 595852 \n", "3 2922103 \n", "4 1388262 " ] }, "execution_count": 465, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_identity_meta.set_index('index_new')\n", "df_identity_meta.head()" ] }, { "cell_type": "code", "execution_count": 466, "metadata": {}, "outputs": [], "source": [ "fp_identity_meta_new = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')\n", "df_identity_meta.to_csv(fp_identity_meta_new)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for idx, row in tqdm(df_records.iterrows(), total=len(df_rois)):\n", " subdir = row['subdir']\n", " fn = row['fn']\n", " row_match_subdir = df_records_copy.loc[(df_records_copy['subdir'] == subdir)]\n", " for idx_subdir, row_subdir in row_match_subdir.iterrows(): \n", " row_match_fn = row_match.loc[(row_match['fn'] == fn)]\n", " df_rois.at[idx, 'index'] = \n", " int(row_match.index[0])\n", " df_records.drop(df.index[2])" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "04ad99a7cba9443ebee5b26a1c4cddf1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, max=3325795), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Exception ignored in: \n", "Traceback (most recent call last):\n", " File 
\"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 931, in __del__\n", " self.close()\n", " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 1133, in close\n", " self._decr_instances(self)\n", " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 496, in _decr_instances\n", " cls.monitor.exit()\n", " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_monitor.py\", line 52, in exit\n", " self.join()\n", " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/threading.py\", line 1053, in join\n", " raise RuntimeError(\"cannot join current thread\")\n", "RuntimeError: cannot join current thread\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0msubdir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mfn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fn'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mrow_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_records\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_records\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0msubdir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m 
\u001b[0mrow_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow_match\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_match\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fn'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mdf_rois\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'index'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_match\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, other, axis)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1282\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrstate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1283\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mna_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1285\u001b[0m raise TypeError('Could not compare {typ} type with 
Series'\n", "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mna_op\u001b[0;34m(x, y)\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1142\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_object_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1143\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_comp_method_OBJECT_ARRAY\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1145\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_datetimelike_v_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36m_comp_method_OBJECT_ARRAY\u001b[0;34m(op, x, y)\u001b[0m\n\u001b[1;32m 1120\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvec_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1121\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1122\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mlibops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscalar_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1123\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1124\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "for idx, row in tqdm(df_rois.iterrows(), total=len(df_rois)):\n", " subdir = row['subdir']\n", " fn = row['fn']\n", " row_match_subdir = df_records_copy.loc[(df_records_copy['subdir'] == subdir)]\n", " for idx_subdir, row_subdir in row_match_subdir.iterrows(): \n", " row_match_fn = row_match.loc[(row_match['fn'] == fn)]\n", " df_rois.at[idx, 'index'] = int(row_match.index[0])\n", " df_records.drop(df.index[2])" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [], "source": [ "subdir = 'test/n006211'\n", "rows_records = df_records.loc[df_records['subdir'] == subdir]\n", "rows_rois = df_rois.loc[df_rois['subdir'] == subdir ]" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0extfnhimage_heightimage_widthsubdirwxyindex
00jpg0089_010.473333304214test/n0062110.6682460.2796210.2800000
11jpg0089_010.326667304214test/n0062110.464455-0.1563980.1200000
22jpg0168_010.393333471419test/n0062110.4436090.2631580.2733331
33jpg0213_010.462745408480test/n0062110.3933330.2466670.0823532
44jpg0115_010.438662360401test/n0062110.3933330.2866670.2453534
55jpg0511_020.393333417415test/n0062110.3959730.2651010.2333336
66jpg0032_010.393333143125test/n0062110.4503820.2557250.3133337
77jpg0502_010.326667288252test/n0062110.3778630.6030530.5833338
88jpg0502_010.393333288252test/n0062110.4503820.3015270.3133338
99jpg0201_010.393333187164test/n0062110.4486690.2433460.3133339
\n", "
" ], "text/plain": [ " Unnamed: 0 ext fn h image_height image_width \\\n", "0 0 jpg 0089_01 0.473333 304 214 \n", "1 1 jpg 0089_01 0.326667 304 214 \n", "2 2 jpg 0168_01 0.393333 471 419 \n", "3 3 jpg 0213_01 0.462745 408 480 \n", "4 4 jpg 0115_01 0.438662 360 401 \n", "5 5 jpg 0511_02 0.393333 417 415 \n", "6 6 jpg 0032_01 0.393333 143 125 \n", "7 7 jpg 0502_01 0.326667 288 252 \n", "8 8 jpg 0502_01 0.393333 288 252 \n", "9 9 jpg 0201_01 0.393333 187 164 \n", "\n", " subdir w x y index \n", "0 test/n006211 0.668246 0.279621 0.280000 0 \n", "1 test/n006211 0.464455 -0.156398 0.120000 0 \n", "2 test/n006211 0.443609 0.263158 0.273333 1 \n", "3 test/n006211 0.393333 0.246667 0.082353 2 \n", "4 test/n006211 0.393333 0.286667 0.245353 4 \n", "5 test/n006211 0.395973 0.265101 0.233333 6 \n", "6 test/n006211 0.450382 0.255725 0.313333 7 \n", "7 test/n006211 0.377863 0.603053 0.583333 8 \n", "8 test/n006211 0.450382 0.301527 0.313333 8 \n", "9 test/n006211 0.448669 0.243346 0.313333 9 " ] }, "execution_count": 181, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rois.head(10)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "df_meta.to_csv(fp_out)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# Anchor the path at DATA_STORE_NAS like the other metadata paths;\n", "# dir_dataset on its own is resolved against the current working directory.\n", "fp_meta = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_kg_clean.csv')\n", "df = pd.read_csv(fp_meta)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# Take the column directly instead of appending row by row via iterrows().\n", "descs = df['description'].tolist()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "descs = set(descs)" ] }, { "cell_type": "code", "execution_count": 472, "metadata": {}, "outputs": [], "source": [ "fp_identity_meta = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')" ] }, { "cell_type": "code", "execution_count": 473, "metadata": {}, "outputs": [], "source": [ 
"df_identity_meta = pd.read_csv(fp_identity_meta)" ] }, { "cell_type": "code", "execution_count": 474, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
class_iddescriptiongenderimagesname
0n000001Dalai Lamam42414th Dalai Lama
1n000002American singer-songwriterf315A Fine Frenzy
2n000003British writerm205A. A. Gill
3n000004Canadian-Irish actorm387AJ Buckley
4n000005Baseball catcherm229AJ Pierzynski
\n", "
" ], "text/plain": [ " class_id description gender images name\n", "0 n000001 Dalai Lama m 424 14th Dalai Lama\n", "1 n000002 American singer-songwriter f 315 A Fine Frenzy\n", "2 n000003 British writer m 205 A. A. Gill\n", "3 n000004 Canadian-Irish actor m 387 AJ Buckley\n", "4 n000005 Baseball catcher m 229 AJ Pierzynski" ] }, "execution_count": 474, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_identity_meta.head()" ] }, { "cell_type": "code", "execution_count": 475, "metadata": {}, "outputs": [], "source": [ "df_identity_meta.index.name = 'index'" ] }, { "cell_type": "code", "execution_count": 476, "metadata": {}, "outputs": [], "source": [ "# The frame is written back to the path it was read from. Emit the row index only\n", "# when no 'index' column exists yet, so re-running does not add a duplicate column.\n", "df_identity_meta.to_csv(fp_identity_meta, index='index' not in df_identity_meta.columns)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }