diff options
Diffstat (limited to 'megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb | 2020 |
1 files changed, 2020 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb b/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb new file mode 100644 index 00000000..91ca1626 --- /dev/null +++ b/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb @@ -0,0 +1,2020 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Append UUID to SHA256 CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [], + "source": [ + "from os.path import join\n", + "from pathlib import Path\n", + "import difflib\n", + "\n", + "from tqdm import tqdm_notebook as tqdm\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/data_store_nas/datasets/people/vgg_face2/metadata/records_uuid.csv True\n", + "/data_store_nas/datasets/people/vgg_face2/metadata/records.csv True\n" + ] + } + ], + "source": [ + "# names\n", + "DATA_STORE_NAS = '/data_store_nas/'\n", + "dir_dataset = 'datasets/people/vgg_face2/metadata'\n", + "fp_records_uuids = join(DATA_STORE_NAS, dir_dataset, 'records_uuid.csv')\n", + "fp_uuids_new = join(DATA_STORE_NAS, dir_dataset, 'uuids.csv')\n", + "# record\n", + "fp_records = join(DATA_STORE_NAS, dir_datset, 'records.csv')\n", + "fp_records_new = join(DATA_STORE_NAS, dir_datset, 'records_new.csv')\n", + "print(fp_uuids, Path(fp_uuids).is_file())\n", + "print(fp_records, Path(fp_records).is_file())" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [], + "source": [ + "def similarity(a, b):\n", + " seq = difflib.SequenceMatcher(a=a.lower(), b=b.lower())\n", + " return seq.ratio()" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "df_records = pd.read_csv(fp_records)\n", + "df_records_uuids = pd.read_csv(fp_records_uuids)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>sha256</th>\n", + " <th>subdir</th>\n", + " <th>uuid</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n", + " <td>test/n006211</td>\n", + " <td>88ac6abd-6039-442b-b31f-2db8d575363a</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>0168_01</td>\n", + " <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n", + " <td>test/n006211</td>\n", + " <td>73acbc00-2cb5-4260-8db3-b88ca7c29c72</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "\n", + " subdir uuid \n", + "0 test/n006211 88ac6abd-6039-442b-b31f-2db8d575363a \n", + "1 test/n006211 73acbc00-2cb5-4260-8db3-b88ca7c29c72 " + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_uuids.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>sha256</th>\n", + " <th>subdir</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>0168_01</td>\n", + " <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "\n", + " subdir \n", + "0 test/n006211 \n", + "1 test/n006211 " + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_records.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "# fix the records and save to new csv\n", + "df_records['index'] = [''] * len(df_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "for idx, row in tqdm(df_records.iterrows(), total=len(df_records)):\n", + " df_records.at[idx, 'index'] = idx" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>sha256</th>\n", + " <th>subdir</th>\n", + " </tr>\n", + " <tr>\n", + " <th>index</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>0168_01</td>\n", + " <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>jpg</td>\n", + " <td>0213_01</td>\n", + " <td>3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>jpg</td>\n", + " <td>0010_01</td>\n", + " <td>577ce218e4a61e612942c55fd172cac4b48becacbfc708...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>jpg</td>\n", + " <td>0115_01</td>\n", + " <td>b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "index \n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "2 jpg 0213_01 3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c... \n", + "3 jpg 0010_01 577ce218e4a61e612942c55fd172cac4b48becacbfc708... \n", + "4 jpg 0115_01 b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f... \n", + "\n", + " subdir \n", + "index \n", + "0 test/n006211 \n", + "1 test/n006211 \n", + "2 test/n006211 \n", + "3 test/n006211 \n", + "4 test/n006211 " + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#df_records.set_index('index')\n", + "#df_records.head()\n", + "df_records.to_csv(fp_records_new)" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "df_files = df_records.copy()\n", + "fp_files = join(DATA_STORE_NAS, dir_datset, 'files.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "df_files = df_files.drop(['sha256'], axis=1)\n", + "df_files.to_csv(fp_files)" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [], + "source": [ + "# make another CSV just for the sha\n", + "df_sha256s = df_records.copy()\n", + "fp_sha256s = join(DATA_STORE_NAS, dir_datset, 'sha256s.csv')\n", + "df_sha256s = df_sha256s.drop(['ext', 'fn', 'subdir'], axis=1)\n", + "df_sha256s.to_csv(fp_sha256s)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>sha256</th>\n", + " <th>subdir</th>\n", + " <th>uuid</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n", + " <td>test/n006211</td>\n", + " <td>88ac6abd-6039-442b-b31f-2db8d575363a</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>0168_01</td>\n", + " <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n", + " <td>test/n006211</td>\n", + " <td>73acbc00-2cb5-4260-8db3-b88ca7c29c72</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "\n", + " subdir uuid \n", + "0 test/n006211 88ac6abd-6039-442b-b31f-2db8d575363a \n", + "1 test/n006211 73acbc00-2cb5-4260-8db3-b88ca7c29c72 " + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create another CSV just for the UUIDs\n", + "df_records_uuids.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "df_uuids = df_records_uuids.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "df_uuids = df_uuids.drop(['subdir', 'fn', 'ext', 'sha256'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>uuid</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>88ac6abd-6039-442b-b31f-2db8d575363a</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>73acbc00-2cb5-4260-8db3-b88ca7c29c72</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " uuid\n", + "0 88ac6abd-6039-442b-b31f-2db8d575363a\n", + "1 73acbc00-2cb5-4260-8db3-b88ca7c29c72" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_uuids.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [], + "source": [ + "df_uuids.index.name = 'index'\n", + "df_uuids.to_csv(fp_uuids_new)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for idx, row in tqdm(df_records_uuids[:2].iterrows(), total=len(df_records_uuids)):\n", + " sha256 = row['sha256']\n", + " row_match = df_records.loc[(df_records['subdir'] == subdir)]\n", + " df_rois.at[idx, 'idx'] = int(row_match.index[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Change ROI to use index" + ] + }, + { + "cell_type": "code", + "execution_count": 372, + "metadata": {}, + "outputs": [], + "source": [ + "fp_rois = join(DATA_STORE_NAS, dir_datset, 'rois.csv')\n", + "df_rois = pd.read_csv(fp_rois)" + ] + }, + { + "cell_type": "code", + "execution_count": 373, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>sha256</th>\n", + " <th>subdir</th>\n", + " </tr>\n", + " <tr>\n", + " <th>index</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>0168_01</td>\n", + " <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn sha256 \\\n", + "index \n", + "0 jpg 0089_01 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... \n", + "1 jpg 0168_01 e360f93613baa68cede6731d2603873cdabd3993841cfd... \n", + "\n", + " subdir \n", + "index \n", + "0 test/n006211 \n", + "1 test/n006211 " + ] + }, + "execution_count": 373, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_records.index.name = 'index'\n", + "df_records.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 374, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>h</th>\n", + " <th>image_height</th>\n", + " <th>image_width</th>\n", + " <th>subdir</th>\n", + " <th>w</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>index_new</th>\n", + " </tr>\n", + " <tr>\n", + " <th>index</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>0.473333</td>\n", + " <td>304</td>\n", + " <td>214</td>\n", + " <td>test/n006211</td>\n", + " <td>0.668246</td>\n", + " <td>0.279621</td>\n", + " <td>0.28</td>\n", + " <td>-1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>0.326667</td>\n", + " <td>304</td>\n", + " <td>214</td>\n", + " <td>test/n006211</td>\n", + " <td>0.464455</td>\n", + " <td>-0.156398</td>\n", + " <td>0.12</td>\n", + " <td>-1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 ext fn h image_height image_width \\\n", + "index \n", + "0 0 jpg 0089_01 0.473333 304 214 \n", + "1 1 jpg 0089_01 0.326667 304 214 \n", + "\n", + " subdir w x y index_new \n", + "index \n", + "0 test/n006211 0.668246 0.279621 0.28 -1 \n", + "1 test/n006211 0.464455 -0.156398 0.12 -1 " + ] + }, + "execution_count": 374, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rois['index_new'] = [-1] * len(df_rois)\n", + "df_rois.index.name = 'index'\n", + "df_rois.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 375, + "metadata": {}, + "outputs": [], + "source": [ + "df_records_copy = df_records.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 376, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9131\n" + ] + } + ], + "source": [ + "df_records_subdirs = df_records_copy.groupby('subdir')\n", + "print(len(df_records_subdirs))" + ] + }, + { + "cell_type": "code", + "execution_count": 377, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9131\n" + ] + } + ], + "source": [ + "roi_subdir_groups = df_rois.groupby('subdir')\n", + "print(len(roi_subdir_groups))" + ] + }, + { + "cell_type": "code", + "execution_count": 387, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'pandas' has no attribute 'index'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-387-82023aa58c79>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mhelp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: module 'pandas' has no attribute 'index'" + ] + } + ], + "source": [ + "help(pd.index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 390, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "81817\n" + ] + } + ], + "source": [ + "print(row.Index)" + ] + }, + { + "cell_type": "code", + "execution_count": 392, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91680fc6bee04ce087a60be57ab5a58c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=9131), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for subdir, record_group in tqdm(df_records_subdirs, total=df_records_subdirs.ngroups):\n", + " #print(name) # 'test/n00001'\n", + " roi_group = roi_subdir_groups.get_group(subdir)\n", + "# print(type(roi_group))\n", + " # for every item in the roi_group, get index from record group\n", + " for row in roi_group.itertuples():\n", + " #row_match = record_group.loc[record_group['fn'] == row.fn]\n", + " # get the index from record group where it matches this fn\n", + " #print(len(record_group))\n", + " #record_group.where('fn',row.fn)\n", + " row_match = record_group.loc[(record_group['fn'] == row.fn)]\n", + " df_rois.at[row.Index, 'index_new'] = int(row_match.index[0])\n", + " #record_group[record_group['fn'].str.match(fn)]\n", + " \n", + " #print(int(row_match.index[0]))\n", + " #print('subdir: {}, fn: {}, index: {}'.format(row.subdir, row.fn, master_index))\n", + " \n", + " # NB avoid using iterrows() is very slow. use iteritems\n", + " #print(roi_row['fn'])\n", + " #print(row.at['subdir', 0])" + ] + }, + { + "cell_type": "code", + "execution_count": 411, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>h</th>\n", + " <th>image_height</th>\n", + " <th>image_width</th>\n", + " <th>subdir</th>\n", + " <th>w</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>index_new</th>\n", + " </tr>\n", + " <tr>\n", + " <th>index</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>0.473333</td>\n", + " <td>304</td>\n", + " <td>214</td>\n", + " <td>test/n006211</td>\n", + " <td>0.668246</td>\n", + " <td>0.279621</td>\n", + " <td>0.28</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>0.326667</td>\n", + " <td>304</td>\n", + " <td>214</td>\n", + " <td>test/n006211</td>\n", + " <td>0.464455</td>\n", + " <td>-0.156398</td>\n", + " <td>0.12</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 ext fn h image_height image_width \\\n", + "index \n", + "0 0 jpg 0089_01 0.473333 304 214 \n", + "1 1 jpg 0089_01 0.326667 304 214 \n", + "\n", + " subdir w x y index_new \n", + "index \n", + "0 test/n006211 0.668246 0.279621 0.28 0 \n", + "1 test/n006211 0.464455 -0.156398 0.12 0 " + ] + }, + "execution_count": 411, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rois_new = df_rois.copy()\n", + "df_rois_new.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 413, + "metadata": {}, + "outputs": [], + "source": [ + "df_rois_new = df_rois_new.drop(df_rois_new.columns[df_rois_new.columns.str.contains('unnamed',case = False)],axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 422, + "metadata": {}, + "outputs": [], + "source": [ + "df_rois_new = df_rois_new.set_index('index_new')" + ] + }, + { + "cell_type": "code", + "execution_count": 423, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ext', 'fn', 'h', 'image_height', 'image_width', 'subdir', 'w', 'x', 'y']" + ] + }, + "execution_count": 423, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(df_rois_new.columns.values)" + ] + }, + { + "cell_type": "code", + "execution_count": 425, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>h</th>\n", + " <th>image_height</th>\n", + " <th>image_width</th>\n", + " <th>subdir</th>\n", + " <th>w</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " </tr>\n", + " <tr>\n", + " <th>index_new</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>0.473333</td>\n", + " <td>304</td>\n", + " <td>214</td>\n", + " <td>test/n006211</td>\n", + " <td>0.668246</td>\n", + " <td>0.279621</td>\n", + " <td>0.280000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>0.326667</td>\n", + " <td>304</td>\n", + " <td>214</td>\n", + " <td>test/n006211</td>\n", + " <td>0.464455</td>\n", + " <td>-0.156398</td>\n", + " <td>0.120000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>0168_01</td>\n", + " <td>0.393333</td>\n", + " <td>471</td>\n", + " <td>419</td>\n", + " <td>test/n006211</td>\n", + " <td>0.443609</td>\n", + " <td>0.263158</td>\n", + " <td>0.273333</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>jpg</td>\n", + " <td>0213_01</td>\n", + " <td>0.462745</td>\n", + " <td>408</td>\n", + " <td>480</td>\n", + " <td>test/n006211</td>\n", + " <td>0.393333</td>\n", + " <td>0.246667</td>\n", + " <td>0.082353</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>jpg</td>\n", + " <td>0115_01</td>\n", + " <td>0.438662</td>\n", + " <td>360</td>\n", + " <td>401</td>\n", + " <td>test/n006211</td>\n", + " <td>0.393333</td>\n", + " <td>0.286667</td>\n", + " <td>0.245353</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn h image_height image_width subdir \\\n", + "index_new \n", + "0 jpg 0089_01 0.473333 304 214 test/n006211 \n", + "0 jpg 0089_01 0.326667 304 214 test/n006211 \n", + "1 jpg 0168_01 0.393333 471 419 test/n006211 \n", + "2 jpg 0213_01 0.462745 408 480 test/n006211 \n", + "4 jpg 0115_01 0.438662 360 401 test/n006211 \n", + "\n", + " w x y \n", + "index_new \n", + "0 0.668246 0.279621 0.280000 \n", + "0 0.464455 -0.156398 0.120000 \n", + "1 0.443609 0.263158 0.273333 \n", + "2 0.393333 0.246667 0.082353 \n", + "4 0.393333 0.286667 0.245353 " + ] + }, + "execution_count": 425, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rois_new.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 426, + "metadata": {}, + "outputs": [], + "source": [ + "fp_rois_new = join(DATA_STORE_NAS, dir_dataset, 'rois_new.csv')\n", + "df_rois_new.to_csv(fp_rois_new)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fix identity meta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 458, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta = pd.read_csv(fp_identity_meta)\n", + "df_files = pd.read_csv(fp_files).set_index('index')" + ] + }, + { + "cell_type": "code", + "execution_count": 459, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>subdir</th>\n", + " </tr>\n", + " <tr>\n", + " <th>index</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>0168_01</td>\n", + " <td>test/n006211</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn subdir\n", + "index \n", + "0 jpg 0089_01 test/n006211\n", + "1 jpg 0168_01 test/n006211" + ] + }, + "execution_count": 459, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_files.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 460, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>class_id</th>\n", + " <th>description</th>\n", + " <th>gender</th>\n", + " <th>images</th>\n", + " <th>name</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>n009279</td>\n", + " <td>Former soccer player</td>\n", + " <td>f</td>\n", + " <td>365</td>\n", + " <td>Noriko Baba</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>n009278</td>\n", + " <td>Japanese singer-songwriter</td>\n", + " <td>f</td>\n", + " <td>181</td>\n", + " <td>Hiromi Satō</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " class_id description gender images name\n", + "0 n009279 Former soccer player f 365 Noriko Baba\n", + "1 n009278 Japanese singer-songwriter f 181 Hiromi Satō" + ] + }, + "execution_count": 460, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_identity_meta.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 461, + "metadata": {}, + "outputs": [], + "source": [ + "# create a lookup table of ids\n", + "class_ids = {}\n", + "for row in df_files.itertuples():\n", + " class_id = row.subdir.split('/')[1]\n", + " if class_id not in class_ids.keys():\n", + " class_ids[class_id] = row.Index" + ] + }, + { + "cell_type": "code", + "execution_count": 463, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta['index_new'] = [-1] * len(df_identity_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 464, + "metadata": {}, + "outputs": [], + "source": [ + "# add col to identity for 'index_new'\n", + "for row in df_identity_meta.itertuples():\n", + " df_identity_meta.at[row.Index, 'index_new'] = class_ids[row.class_id]\n", + "# iterate through" + ] + }, + { + "cell_type": "code", + "execution_count": 465, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>class_id</th>\n", + " <th>description</th>\n", + " <th>gender</th>\n", + " <th>images</th>\n", + " <th>name</th>\n", + " <th>index_new</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>n009279</td>\n", + " <td>Former soccer player</td>\n", + " <td>f</td>\n", + " <td>365</td>\n", + " <td>Noriko Baba</td>\n", + " <td>1808008</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>n009278</td>\n", + " <td>Japanese singer-songwriter</td>\n", + " <td>f</td>\n", + " <td>181</td>\n", + " <td>Hiromi Satō</td>\n", + " <td>943052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>n009277</td>\n", + " <td>Japanese fashion model</td>\n", + " <td>m</td>\n", + " <td>409</td>\n", + " <td>Ranko Kanbe</td>\n", + " <td>595852</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>n009276</td>\n", + " <td>Japanese musician</td>\n", + " <td>f</td>\n", + " <td>177</td>\n", + " <td>Yurie Matsui</td>\n", + " <td>2922103</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>n009275</td>\n", + " <td>Japanese idol</td>\n", + " <td>f</td>\n", + " <td>501</td>\n", + " <td>Karin Miyamoto</td>\n", + " <td>1388262</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " class_id description gender images name \\\n", + "0 n009279 Former soccer player f 365 Noriko Baba \n", + "1 n009278 Japanese singer-songwriter f 181 Hiromi Satō \n", + "2 n009277 Japanese fashion model m 409 Ranko Kanbe \n", + "3 n009276 Japanese musician f 177 Yurie Matsui \n", + "4 n009275 Japanese idol f 501 Karin Miyamoto \n", + "\n", + " index_new \n", + "0 1808008 \n", + "1 943052 \n", + "2 595852 \n", + "3 2922103 \n", + "4 1388262 " + ] + }, + "execution_count": 465, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_identity_meta.set_index('index_new')\n", + "df_identity_meta.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 466, + "metadata": {}, + "outputs": [], + "source": [ + "fp_identity_meta_new = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')\n", + "df_identity_meta.to_csv(fp_identity_meta_new)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for idx, row in tqdm(df_records.iterrows(), total=len(df_rois)):\n", + " subdir = row['subdir']\n", + " fn = row['fn']\n", + " row_match_subdir = df_records_copy.loc[(df_records_copy['subdir'] == subdir)]\n", + " for idx_subdir, row_subdir in row_match_subdir.iterrows(): \n", + " row_match_fn = row_match.loc[(row_match['fn'] == fn)]\n", + " df_rois.at[idx, 'index'] = \n", + " int(row_match.index[0])\n", + " df_records.drop(df.index[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "04ad99a7cba9443ebee5b26a1c4cddf1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=3325795), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Exception ignored in: <bound method tqdm.__del__ of 0%| | 139/3325795 [01:34<123:32:18, 7.48it/s]>\n", + "Traceback (most recent call last):\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 931, in __del__\n", + " self.close()\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 1133, in close\n", + " self._decr_instances(self)\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_tqdm.py\", line 496, in _decr_instances\n", + " cls.monitor.exit()\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/site-packages/tqdm/_monitor.py\", line 52, in exit\n", + " self.join()\n", + " File \"/home/adam/anaconda3/envs/megapixels/lib/python3.6/threading.py\", line 1053, in join\n", + " raise RuntimeError(\"cannot join current thread\")\n", + "RuntimeError: cannot join current thread\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-187-f9325ab8bb02>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0msubdir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mfn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fn'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mrow_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_records\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_records\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0msubdir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mrow_match\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow_match\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_match\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fn'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mdf_rois\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'index'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_match\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(self, other, axis)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1282\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrstate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'ignore'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1283\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mna_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1284\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1285\u001b[0m raise TypeError('Could not compare {typ} type with Series'\n", + "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36mna_op\u001b[0;34m(x, y)\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1142\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_object_dtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1143\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_comp_method_OBJECT_ARRAY\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1144\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1145\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_datetimelike_v_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/ops.py\u001b[0m in \u001b[0;36m_comp_method_OBJECT_ARRAY\u001b[0;34m(op, x, y)\u001b[0m\n\u001b[1;32m 1120\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvec_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1121\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1122\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscalar_compare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1123\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1124\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for idx, row in tqdm(df_rois.iterrows(), total=len(df_rois)):\n", + " subdir = row['subdir']\n", + " fn = row['fn']\n", + " row_match_subdir = df_records_copy.loc[(df_records_copy['subdir'] == subdir)]\n", + " for idx_subdir, row_subdir in row_match_subdir.iterrows(): \n", + " row_match_fn = row_match.loc[(row_match['fn'] == fn)]\n", + " df_rois.at[idx, 'index'] = int(row_match.index[0])\n", + " df_records.drop(df.index[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [], + "source": [ + "subdir = 'test/n006211'\n", + "rows_records = df_records.loc[df_records['subdir'] == subdir]\n", + "rows_rois = df_rois.loc[df_rois['subdir'] == subdir ]" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>h</th>\n", + " <th>image_height</th>\n", + " <th>image_width</th>\n", + " <th>subdir</th>\n", + " <th>w</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>index</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>0.473333</td>\n", + " <td>304</td>\n", + " <td>214</td>\n", + " <td>test/n006211</td>\n", + " <td>0.668246</td>\n", + " <td>0.279621</td>\n", + " <td>0.280000</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>jpg</td>\n", + " <td>0089_01</td>\n", + " <td>0.326667</td>\n", + " <td>304</td>\n", + " <td>214</td>\n", + " <td>test/n006211</td>\n", + " <td>0.464455</td>\n", + " <td>-0.156398</td>\n", + " <td>0.120000</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>jpg</td>\n", + " <td>0168_01</td>\n", + " <td>0.393333</td>\n", + " <td>471</td>\n", + " <td>419</td>\n", + " <td>test/n006211</td>\n", + " <td>0.443609</td>\n", + " <td>0.263158</td>\n", + " <td>0.273333</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>jpg</td>\n", + " <td>0213_01</td>\n", + " <td>0.462745</td>\n", + " <td>408</td>\n", + " <td>480</td>\n", + " <td>test/n006211</td>\n", + " <td>0.393333</td>\n", + " <td>0.246667</td>\n", + " <td>0.082353</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>jpg</td>\n", + " <td>0115_01</td>\n", + " <td>0.438662</td>\n", + " <td>360</td>\n", + " <td>401</td>\n", + " <td>test/n006211</td>\n", + " <td>0.393333</td>\n", + " <td>0.286667</td>\n", + " <td>0.245353</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>5</td>\n", + " <td>jpg</td>\n", + " <td>0511_02</td>\n", + " <td>0.393333</td>\n", + " <td>417</td>\n", + " <td>415</td>\n", + " <td>test/n006211</td>\n", + " <td>0.395973</td>\n", + " <td>0.265101</td>\n", + " <td>0.233333</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>6</td>\n", + " <td>jpg</td>\n", + " <td>0032_01</td>\n", + " <td>0.393333</td>\n", + " <td>143</td>\n", + " <td>125</td>\n", + " <td>test/n006211</td>\n", + " <td>0.450382</td>\n", + " <td>0.255725</td>\n", + " <td>0.313333</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>7</td>\n", + " <td>jpg</td>\n", + " <td>0502_01</td>\n", + " <td>0.326667</td>\n", + " <td>288</td>\n", + " <td>252</td>\n", + " <td>test/n006211</td>\n", + " <td>0.377863</td>\n", + " <td>0.603053</td>\n", + " <td>0.583333</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>8</td>\n", + " <td>jpg</td>\n", + " <td>0502_01</td>\n", + " <td>0.393333</td>\n", + " <td>288</td>\n", + " <td>252</td>\n", + " <td>test/n006211</td>\n", + " <td>0.450382</td>\n", + " <td>0.301527</td>\n", + " <td>0.313333</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>9</td>\n", + " <td>jpg</td>\n", + " <td>0201_01</td>\n", + " <td>0.393333</td>\n", + " <td>187</td>\n", + " <td>164</td>\n", + " <td>test/n006211</td>\n", + " <td>0.448669</td>\n", + " <td>0.243346</td>\n", + " <td>0.313333</td>\n", + " <td>9</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 ext fn h image_height image_width \\\n", + "0 0 jpg 0089_01 0.473333 304 214 \n", + "1 1 jpg 0089_01 0.326667 304 214 \n", + "2 2 jpg 0168_01 0.393333 471 419 \n", + "3 3 jpg 0213_01 0.462745 408 480 \n", + "4 4 jpg 0115_01 0.438662 360 401 \n", + "5 5 jpg 0511_02 0.393333 417 415 \n", + "6 6 jpg 0032_01 0.393333 143 125 \n", + "7 7 jpg 0502_01 0.326667 288 252 \n", + "8 8 jpg 0502_01 0.393333 288 252 \n", + "9 9 jpg 0201_01 0.393333 187 164 \n", + "\n", + " subdir w x y index \n", + "0 test/n006211 0.668246 0.279621 0.280000 0 \n", + "1 test/n006211 0.464455 -0.156398 0.120000 0 \n", + "2 test/n006211 0.443609 0.263158 0.273333 1 \n", + "3 test/n006211 0.393333 0.246667 0.082353 2 \n", + "4 test/n006211 0.393333 0.286667 0.245353 4 \n", + "5 test/n006211 0.395973 0.265101 0.233333 6 \n", + "6 test/n006211 0.450382 0.255725 0.313333 7 \n", + "7 test/n006211 0.377863 0.603053 0.583333 8 \n", + "8 test/n006211 0.450382 0.301527 0.313333 8 \n", + "9 test/n006211 0.448669 0.243346 0.313333 9 " + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rois.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "df_meta.to_csv(fp_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "fp_meta = join(dir_dataset, 'identity_meta_kg_clean.csv')\n", + "df = pd.read_csv(fp_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "descs = []\n", + "for idx, row in df.iterrows():\n", + " descs.append(row['description'])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "descs = set(descs)" + ] + }, + { + "cell_type": "code", + "execution_count": 472, + "metadata": {}, + "outputs": [], + "source": [ + "fp_identity_meta = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 473, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta = pd.read_csv(fp_identity_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": 474, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>class_id</th>\n", + " <th>description</th>\n", + " <th>gender</th>\n", + " <th>images</th>\n", + " <th>name</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>n000001</td>\n", + " <td>Dalai Lama</td>\n", + " <td>m</td>\n", + " <td>424</td>\n", + " <td>14th Dalai Lama</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>n000002</td>\n", + " <td>American singer-songwriter</td>\n", + " <td>f</td>\n", + " <td>315</td>\n", + " <td>A Fine Frenzy</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>n000003</td>\n", + " <td>British writer</td>\n", + " <td>m</td>\n", + " <td>205</td>\n", + " <td>A. A. Gill</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>n000004</td>\n", + " <td>Canadian-Irish actor</td>\n", + " <td>m</td>\n", + " <td>387</td>\n", + " <td>AJ Buckley</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>n000005</td>\n", + " <td>Baseball catcher</td>\n", + " <td>m</td>\n", + " <td>229</td>\n", + " <td>AJ Pierzynski</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " class_id description gender images name\n", + "0 n000001 Dalai Lama m 424 14th Dalai Lama\n", + "1 n000002 American singer-songwriter f 315 A Fine Frenzy\n", + "2 n000003 British writer m 205 A. A. Gill\n", + "3 n000004 Canadian-Irish actor m 387 AJ Buckley\n", + "4 n000005 Baseball catcher m 229 AJ Pierzynski" + ] + }, + "execution_count": 474, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_identity_meta.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 475, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta.index.name = 'index'" + ] + }, + { + "cell_type": "code", + "execution_count": 476, + "metadata": {}, + "outputs": [], + "source": [ + "df_identity_meta.to_csv(fp_identity_meta)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
