From a5bdab8e798fcdc7885cfdabb0e5dd8076fa1d40 Mon Sep 17 00:00:00 2001 From: adamhrv Date: Tue, 12 Feb 2019 15:18:46 +0100 Subject: reorder nbs --- .../identity/vgg_face2_clean_meta_kg.ipynb | 2020 ++++++++++++++++++++ 1 file changed, 2020 insertions(+) create mode 100644 megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb (limited to 'megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb') diff --git a/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb b/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb new file mode 100644 index 00000000..91ca1626 --- /dev/null +++ b/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb @@ -0,0 +1,2020 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Append UUID to SHA256 CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [], + "source": [ + "from os.path import join\n", + "from pathlib import Path\n", + "import difflib\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/data_store_nas/datasets/people/vgg_face2/metadata/records_uuid.csv True\n", + "/data_store_nas/datasets/people/vgg_face2/metadata/records.csv True\n" + ] + } + ], + "source": [ + "# names\n", + "DATA_STORE_NAS = '/data_store_nas/'\n", + "dir_dataset = 'datasets/people/vgg_face2/metadata'\n", + "fp_records_uuids = join(DATA_STORE_NAS, dir_dataset, 'records_uuid.csv')\n", + "fp_uuids_new = join(DATA_STORE_NAS, dir_dataset, 'uuids.csv')\n", + "# record\n", + "fp_records = join(DATA_STORE_NAS, dir_datset, 'records.csv')\n", + "fp_records_new = join(DATA_STORE_NAS, dir_datset, 'records_new.csv')\n", + "print(fp_uuids, Path(fp_uuids).is_file())\n", + "print(fp_records, Path(fp_records).is_file())" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [], + "source": [ + "def similarity(a, b):\n", + " seq = difflib.SequenceMatcher(a=a.lower(), b=b.lower())\n", + " return seq.ratio()" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "df_records = pd.read_csv(fp_records)\n", + "df_records_uuids = pd.read_csv(fp_records_uuids)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsha256subdiruuid
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n00621188ac6abd-6039-442b-b31f-2db8d575363a
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n00621173acbc00-2cb5-4260-8db3-b88ca7c29c72
\n", + "
" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "\n", + " subdir uuid \n", + "0 test/n006211 88ac6abd-6039-442b-b31f-2db8d575363a \n", + "1 test/n006211 73acbc00-2cb5-4260-8db3-b88ca7c29c72 " + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_uuids.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsha256subdir
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n006211
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n006211
\n", + "
" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "\n", + " subdir \n", + "0 test/n006211 \n", + "1 test/n006211 " + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_records.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "# fix the records and save to new csv\n", + "df_records['index'] = [''] * len(df_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "for idx, row in tqdm(df_records.iterrows(), total=len(df_records)):\n", + " df_records.at[idx, 'index'] = idx" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsha256subdir
index
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n006211
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n006211
2jpg0213_013920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c...test/n006211
3jpg0010_01577ce218e4a61e612942c55fd172cac4b48becacbfc708...test/n006211
4jpg0115_01b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f...test/n006211
\n", + "
" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "index \n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "2 jpg 0213_01 3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c... \n", + "3 jpg 0010_01 577ce218e4a61e612942c55fd172cac4b48becacbfc708... \n", + "4 jpg 0115_01 b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f... \n", + "\n", + " subdir \n", + "index \n", + "0 test/n006211 \n", + "1 test/n006211 \n", + "2 test/n006211 \n", + "3 test/n006211 \n", + "4 test/n006211 " + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#df_records.set_index('index')\n", + "#df_records.head()\n", + "df_records.to_csv(fp_records_new)" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "df_files = df_records.copy()\n", + "fp_files = join(DATA_STORE_NAS, dir_datset, 'files.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "df_files = df_files.drop(['sha256'], axis=1)\n", + "df_files.to_csv(fp_files)" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [], + "source": [ + "# make another CSV just for the sha\n", + "df_sha256s = df_records.copy()\n", + "fp_sha256s = join(DATA_STORE_NAS, dir_datset, 'sha256s.csv')\n", + "df_sha256s = df_sha256s.drop(['ext', 'fn', 'subdir'], axis=1)\n", + "df_sha256s.to_csv(fp_sha256s)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsha256subdiruuid
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n00621188ac6abd-6039-442b-b31f-2db8d575363a
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n00621173acbc00-2cb5-4260-8db3-b88ca7c29c72
\n", + "
" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "\n", + " subdir uuid \n", + "0 test/n006211 88ac6abd-6039-442b-b31f-2db8d575363a \n", + "1 test/n006211 73acbc00-2cb5-4260-8db3-b88ca7c29c72 " + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create another CSV just for the UUIDs\n", + "df_records_uuids.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "df_uuids = df_records_uuids.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "df_uuids = df_uuids.drop(['subdir', 'fn', 'ext', 'sha256'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
uuid
088ac6abd-6039-442b-b31f-2db8d575363a
173acbc00-2cb5-4260-8db3-b88ca7c29c72
\n", + "
" + ], + "text/plain": [ + " uuid\n", + "0 88ac6abd-6039-442b-b31f-2db8d575363a\n", + "1 73acbc00-2cb5-4260-8db3-b88ca7c29c72" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_uuids.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [], + "source": [ + "df_uuids.index.name = 'index'\n", + "df_uuids.to_csv(fp_uuids_new)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for idx, row in tqdm(df_records_uuids[:2].iterrows(), total=len(df_records_uuids)):\n", + " sha256 = row['sha256']\n", + " row_match = df_records.loc[(df_records['subdir'] == subdir)]\n", + " df_rois.at[idx, 'idx'] = int(row_match.index[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Change ROI to use index" + ] + }, + { + "cell_type": "code", + "execution_count": 372, + "metadata": {}, + "outputs": [], + "source": [ + "fp_rois = join(DATA_STORE_NAS, dir_datset, 'rois.csv')\n", + "df_rois = pd.read_csv(fp_rois)" + ] + }, + { + "cell_type": "code", + "execution_count": 373, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsha256subdir
index
0jpg0089_01a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...test/n006211
1jpg0168_01e360f93613baa68cede6731d2603873cdabd3993841cfd...test/n006211
\n", + "
" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "index \n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "\n", + " subdir \n", + "index \n", + "0 test/n006211 \n", + "1 test/n006211 " + ] + }, + "execution_count": 373, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_records.index.name = 'index'\n", + "df_records.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 374, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0extfnhimage_heightimage_widthsubdirwxyindex_new
index
00jpg0089_010.473333304214test/n0062110.6682460.2796210.28-1
11jpg0089_010.326667304214test/n0062110.464455-0.1563980.12-1
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 ext fn h image_height image_width \\\n", + "index \n", + "0 0 jpg 0089_01 0.473333 304 214 \n", + "1 1 jpg 0089_01 0.326667 304 214 \n", + "\n", + " subdir w x y index_new \n", + "index \n", + "0 test/n006211 0.668246 0.279621 0.28 -1 \n", + "1 test/n006211 0.464455 -0.156398 0.12 -1 " + ] + }, + "execution_count": 374, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rois['index_new'] = [-1] * len(df_rois)\n", + "df_rois.index.name = 'index'\n", + "df_rois.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 375, + "metadata": {}, + "outputs": [], + "source": [ + "df_records_copy = df_records.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 376, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9131\n" + ] + } + ], + "source": [ + "df_records_subdirs = df_records_copy.groupby('subdir')\n", + "print(len(df_records_subdirs))" + ] + }, + { + "cell_type": "code", + "execution_count": 377, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9131\n" + ] + } + ], + "source": [ + "roi_subdir_groups = df_rois.groupby('subdir')\n", + "print(len(roi_subdir_groups))" + ] + }, + { + "cell_type": "code", + "execution_count": 387, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'pandas' has no attribute 'index'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mhelp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: module 'pandas' has no attribute 'index'" + ] + } + ], + "source": [ + "help(pd.index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 390, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "81817\n" + ] + } + ], + "source": [ + "print(row.Index)" + ] + }, + { + "cell_type": "code", + "execution_count": 392, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91680fc6bee04ce087a60be57ab5a58c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=9131), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for subdir, record_group in tqdm(df_records_subdirs, total=df_records_subdirs.ngroups):\n", + " #print(name) # 'test/n00001'\n", + " roi_group = roi_subdir_groups.get_group(subdir)\n", + "# print(type(roi_group))\n", + " # for every item in the roi_group, get index from record group\n", + " for row in roi_group.itertuples():\n", + " #row_match = record_group.loc[record_group['fn'] == row.fn]\n", + " # get the index from record group where it matches this fn\n", + " #print(len(record_group))\n", + " #record_group.where('fn',row.fn)\n", + " row_match = record_group.loc[(record_group['fn'] == row.fn)]\n", + " df_rois.at[row.Index, 'index_new'] = int(row_match.index[0])\n", + " #record_group[record_group['fn'].str.match(fn)]\n", + " \n", + " #print(int(row_match.index[0]))\n", + " #print('subdir: {}, fn: {}, index: {}'.format(row.subdir, row.fn, master_index))\n", + " \n", + " # NB avoid using iterrows() is very slow. use iteritems\n", + " #print(roi_row['fn'])\n", + " #print(row.at['subdir', 0])" + ] + }, + { + "cell_type": "code", + "execution_count": 411, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0extfnhimage_heightimage_widthsubdirwxyindex_new
index
00jpg0089_010.473333304214test/n0062110.6682460.2796210.280
11jpg0089_010.326667304214test/n0062110.464455-0.1563980.120
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 ext fn h image_height image_width \\\n", + "index \n", + "0 0 jpg 0089_01 0.473333 304 214 \n", + "1 1 jpg 0089_01 0.326667 304 214 \n", + "\n", + " subdir w x y index_new \n", + "index \n", + "0 test/n006211 0.668246 0.279621 0.28 0 \n", + "1 test/n006211 0.464455 -0.156398 0.12 0 " + ] + }, + "execution_count": 411, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rois_new = df_rois.copy()\n", + "df_rois_new.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 413, + "metadata": {}, + "outputs": [], + "source": [ + "df_rois_new = df_rois_new.drop(df_rois_new.columns[df_rois_new.columns.str.contains('unnamed',case = False)],axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 422, + "metadata": {}, + "outputs": [], + "source": [ + "df_rois_new = df_rois_new.set_index('index_new')" + ] + }, + { + "cell_type": "code", + "execution_count": 423, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ext', 'fn', 'h', 'image_height', 'image_width', 'subdir', 'w', 'x', 'y']" + ] + }, + "execution_count": 423, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(df_rois_new.columns.values)" + ] + }, + { + "cell_type": "code", + "execution_count": 425, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnhimage_heightimage_widthsubdirwxy
index_new
0jpg0089_010.473333304214test/n0062110.6682460.2796210.280000
0jpg0089_010.326667304214test/n0062110.464455-0.1563980.120000
1jpg0168_010.393333471419test/n0062110.4436090.2631580.273333
2jpg0213_010.462745408480test/n0062110.3933330.2466670.082353
4jpg0115_010.438662360401test/n0062110.3933330.2866670.245353
\n", + "
" + ], + "text/plain": [ + " ext fn h image_height image_width subdir \\\n", + "index_new \n", + "0 jpg 0089_01 0.473333 304 214 test/n006211 \n", + "0 jpg 0089_01 0.326667 304 214 test/n006211 \n", + "1 jpg 0168_01 0.393333 471 419 test/n006211 \n", + "2 jpg 0213_01 0.462745 408 480 test/n006211 \n", + "4 jpg 0115_01 0.438662 360 401 test/n006211 \n", + "\n", + " w x y \n", + "index_new \n", + "0 0.668246 0.279621 0.280000 \n", + "0 0.464455 -0.156398 0.120000 \n", + "1 0.443609 0.263158 0.273333 \n", + "2 0.393333 0.246667 0.082353 \n", + "4 0.393333 0.286667 0.245353 " + ] + }, + "execution_count": 425, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rois_new.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 426, + "metadata": {}, + "outputs": [], + "source": [ + "fp_rois_new = join(DATA_STORE_NAS, dir_dataset, 'rois_new.csv')\n", + "df_rois_new.to_csv(fp_rois_new)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fix identity meta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 458, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta = pd.read_csv(fp_identity_meta)\n", + "df_files = pd.read_csv(fp_files).set_index('index')" + ] + }, + { + "cell_type": "code", + "execution_count": 459, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsubdir
index
0jpg0089_01test/n006211
1jpg0168_01test/n006211
\n", + "
" + ], + "text/plain": [ + " ext fn subdir\n", + "index \n", + "0 jpg 0089_01 test/n006211\n", + "1 jpg 0168_01 test/n006211" + ] + }, + "execution_count": 459, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_files.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 460, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
class_iddescriptiongenderimagesname
0n009279Former soccer playerf365Noriko Baba
1n009278Japanese singer-songwriterf181Hiromi Satō
\n", + "
" + ], + "text/plain": [ + " class_id description gender images name\n", + "0 n009279 Former soccer player f 365 Noriko Baba\n", + "1 n009278 Japanese singer-songwriter f 181 Hiromi Satō" + ] + }, + "execution_count": 460, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_identity_meta.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 461, + "metadata": {}, + "outputs": [], + "source": [ + "# create a lookup table of ids\n", + "class_ids = {}\n", + "for row in df_files.itertuples():\n", + " class_id = row.subdir.split('/')[1]\n", + " if class_id not in class_ids.keys():\n", + " class_ids[class_id] = row.Index" + ] + }, + { + "cell_type": "code", + "execution_count": 463, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta['index_new'] = [-1] * len(df_identity_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 464, + "metadata": {}, + "outputs": [], + "source": [ + "# add col to identity for 'index_new'\n", + "for row in df_identity_meta.itertuples():\n", + " df_identity_meta.at[row.Index, 'index_new'] = class_ids[row.class_id]\n", + "# iterate through" + ] + }, + { + "cell_type": "code", + "execution_count": 465, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
class_iddescriptiongenderimagesnameindex_new
0n009279Former soccer playerf365Noriko Baba1808008
1n009278Japanese singer-songwriterf181Hiromi Satō943052
2n009277Japanese fashion modelm409Ranko Kanbe595852
3n009276Japanese musicianf177Yurie Matsui2922103
4n009275Japanese idolf501Karin Miyamoto1388262
\n", + "
" + ], + "text/plain": [ + " class_id description gender images name \\\n", + "0 n009279 Former soccer player f 365 Noriko Baba \n", + "1 n009278 Japanese singer-songwriter f 181 Hiromi Satō \n", + "2 n009277 Japanese fashion model m 409 Ranko Kanbe \n", + "3 n009276 Japanese musician f 177 Yurie Matsui \n", + "4 n009275 Japanese idol f 501 Karin Miyamoto \n", + "\n", + " index_new \n", + "0 1808008 \n", + "1 943052 \n", + "2 595852 \n", + "3 2922103 \n", + "4 1388262 " + ] + }, + "execution_count": 465, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_identity_meta.set_index('index_new')\n", + "df_identity_meta.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 466, + "metadata": {}, + "outputs": [], + "source": [ + "fp_identity_meta_new = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')\n", + "df_identity_meta.to_csv(fp_identity_meta_new)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for idx, row in tqdm(df_records.iterrows(), total=len(df_rois)):\n", + " subdir = row['subdir']\n", + " fn = row['fn']\n", + " row_match_subdir = df_records_copy.loc[(df_records_copy['subdir'] == subdir)]\n", + " for idx_subdir, row_subdir in row_match_subdir.iterrows(): \n", + " row_match_fn = row_match.loc[(row_match['fn'] == fn)]\n", + " df_rois.at[idx, 'index'] = \n", + " int(row_match.index[0])\n", + " df_records.drop(df.index[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "04ad99a7cba9443ebee5b26a1c4cddf1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3325795), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 931, in __del__\n", + " self.close()\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 1133, in close\n", + " self._decr_instances(self)\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 496, in _decr_instances\n", + " cls.monitor.exit()\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_monitor.py\", line 52, in exit\n", + " self.join()\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/threading.py\", line 1053, in join\n", + " raise RuntimeError(\"cannot join current thread\")\n", + "RuntimeError: cannot join current thread\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0msubdir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mfn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fn'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mrow_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_records\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_records\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0msubdir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mrow_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow_match\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_match\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fn'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mdf_rois\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'index'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_match\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, other, axis)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1282\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrstate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1283\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mna_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1285\u001b[0m raise TypeError('Could not compare {typ} type with Series'\n", + "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mna_op\u001b[0;34m(x, y)\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1142\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_object_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1143\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_comp_method_OBJECT_ARRAY\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1145\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_datetimelike_v_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36m_comp_method_OBJECT_ARRAY\u001b[0;34m(op, x, y)\u001b[0m\n\u001b[1;32m 1120\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvec_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1121\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1122\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscalar_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1123\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1124\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for idx, row in tqdm(df_rois.iterrows(), total=len(df_rois)):\n", + " subdir = row['subdir']\n", + " fn = row['fn']\n", + " row_match_subdir = df_records_copy.loc[(df_records_copy['subdir'] == subdir)]\n", + " for idx_subdir, row_subdir in row_match_subdir.iterrows(): \n", + " row_match_fn = row_match.loc[(row_match['fn'] == fn)]\n", + " df_rois.at[idx, 'index'] = int(row_match.index[0])\n", + " df_records.drop(df.index[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [], + "source": [ + "subdir = 'test/n006211'\n", + "rows_records = df_records.loc[df_records['subdir'] == subdir]\n", + "rows_rois = df_rois.loc[df_rois['subdir'] == subdir ]" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0extfnhimage_heightimage_widthsubdirwxyindex
00jpg0089_010.473333304214test/n0062110.6682460.2796210.2800000
11jpg0089_010.326667304214test/n0062110.464455-0.1563980.1200000
22jpg0168_010.393333471419test/n0062110.4436090.2631580.2733331
33jpg0213_010.462745408480test/n0062110.3933330.2466670.0823532
44jpg0115_010.438662360401test/n0062110.3933330.2866670.2453534
55jpg0511_020.393333417415test/n0062110.3959730.2651010.2333336
66jpg0032_010.393333143125test/n0062110.4503820.2557250.3133337
77jpg0502_010.326667288252test/n0062110.3778630.6030530.5833338
88jpg0502_010.393333288252test/n0062110.4503820.3015270.3133338
99jpg0201_010.393333187164test/n0062110.4486690.2433460.3133339
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 ext fn h image_height image_width \\\n", + "0 0 jpg 0089_01 0.473333 304 214 \n", + "1 1 jpg 0089_01 0.326667 304 214 \n", + "2 2 jpg 0168_01 0.393333 471 419 \n", + "3 3 jpg 0213_01 0.462745 408 480 \n", + "4 4 jpg 0115_01 0.438662 360 401 \n", + "5 5 jpg 0511_02 0.393333 417 415 \n", + "6 6 jpg 0032_01 0.393333 143 125 \n", + "7 7 jpg 0502_01 0.326667 288 252 \n", + "8 8 jpg 0502_01 0.393333 288 252 \n", + "9 9 jpg 0201_01 0.393333 187 164 \n", + "\n", + " subdir w x y index \n", + "0 test/n006211 0.668246 0.279621 0.280000 0 \n", + "1 test/n006211 0.464455 -0.156398 0.120000 0 \n", + "2 test/n006211 0.443609 0.263158 0.273333 1 \n", + "3 test/n006211 0.393333 0.246667 0.082353 2 \n", + "4 test/n006211 0.393333 0.286667 0.245353 4 \n", + "5 test/n006211 0.395973 0.265101 0.233333 6 \n", + "6 test/n006211 0.450382 0.255725 0.313333 7 \n", + "7 test/n006211 0.377863 0.603053 0.583333 8 \n", + "8 test/n006211 0.450382 0.301527 0.313333 8 \n", + "9 test/n006211 0.448669 0.243346 0.313333 9 " + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rois.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "df_meta.to_csv(fp_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "fp_meta = join(dir_dataset, 'identity_meta_kg_clean.csv')\n", + "df = pd.read_csv(fp_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "descs = []\n", + "for idx, row in df.iterrows():\n", + " descs.append(row['description'])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "descs = set(descs)" + ] + }, + { + "cell_type": "code", + "execution_count": 472, + "metadata": {}, + "outputs": [], + "source": [ + "fp_identity_meta = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 473, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta = pd.read_csv(fp_identity_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 474, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
class_iddescriptiongenderimagesname
0n000001Dalai Lamam42414th Dalai Lama
1n000002American singer-songwriterf315A Fine Frenzy
2n000003British writerm205A. A. Gill
3n000004Canadian-Irish actorm387AJ Buckley
4n000005Baseball catcherm229AJ Pierzynski
\n", + "
" + ], + "text/plain": [ + " class_id description gender images name\n", + "0 n000001 Dalai Lama m 424 14th Dalai Lama\n", + "1 n000002 American singer-songwriter f 315 A Fine Frenzy\n", + "2 n000003 British writer m 205 A. A. Gill\n", + "3 n000004 Canadian-Irish actor m 387 AJ Buckley\n", + "4 n000005 Baseball catcher m 229 AJ Pierzynski" + ] + }, + "execution_count": 474, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_identity_meta.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 475, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta.index.name = 'index'" + ] + }, + { + "cell_type": "code", + "execution_count": 476, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta.to_csv(fp_identity_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- cgit v1.2.3-70-g09d2