1 files changed, 2020 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb b/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb
new file mode 100644
index 00000000..91ca1626
--- /dev/null
+++ b/megapixels/notebooks/datasets/identity/vgg_face2_clean_meta_kg.ipynb
@@ -0,0 +1,2020 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Append UUID to SHA256 CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 186,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from os.path import join\n",
+    "from pathlib import Path\n",
+    "import difflib\n",
+    "\n",
+    "from tqdm import tqdm_notebook as tqdm\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/data_store_nas/datasets/people/vgg_face2/metadata/records_uuid.csv True\n",
+      "/data_store_nas/datasets/people/vgg_face2/metadata/records.csv True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Locations of the VGG Face2 metadata CSVs on the NAS data store\n",
+    "DATA_STORE_NAS = '/data_store_nas/'\n",
+    "dir_dataset = 'datasets/people/vgg_face2/metadata'\n",
+    "# UUID tables: existing records-with-uuid input, uuid-only output\n",
+    "fp_records_uuids = join(DATA_STORE_NAS, dir_dataset, 'records_uuid.csv')\n",
+    "fp_uuids_new = join(DATA_STORE_NAS, dir_dataset, 'uuids.csv')\n",
+    "# record tables: existing input, re-indexed output\n",
+    "fp_records = join(DATA_STORE_NAS, dir_dataset, 'records.csv')\n",
+    "fp_records_new = join(DATA_STORE_NAS, dir_dataset, 'records_new.csv')\n",
+    "# sanity check: print each input path and whether the file exists\n",
+    "print(fp_records_uuids, Path(fp_records_uuids).is_file())\n",
+    "print(fp_records, Path(fp_records).is_file())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 139,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def similarity(a, b):\n",
+    "  \"\"\"Return the case-insensitive difflib similarity ratio of strings a and b.\"\"\"\n",
+    "  matcher = difflib.SequenceMatcher(None, a.lower(), b.lower())\n",
+    "  return matcher.ratio()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 140,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load the plain records table and the records table that carries a uuid column\n",
+    "df_records, df_records_uuids = map(pd.read_csv, (fp_records, fp_records_uuids))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>sha256</th>\n",
+       "      <th>subdir</th>\n",
+       "      <th>uuid</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>88ac6abd-6039-442b-b31f-2db8d575363a</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0168_01</td>\n",
+       "      <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>73acbc00-2cb5-4260-8db3-b88ca7c29c72</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ext       fn                                             sha256  \\\n",
+       "0  jpg  0089_01  a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...   \n",
+       "1  jpg  0168_01  e360f93613baa68cede6731d2603873cdabd3993841cfd...   \n",
+       "\n",
+       "         subdir                                  uuid  \n",
+       "0  test/n006211  88ac6abd-6039-442b-b31f-2db8d575363a  \n",
+       "1  test/n006211  73acbc00-2cb5-4260-8db3-b88ca7c29c72  "
+      ]
+     },
+     "execution_count": 99,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# preview the records table that carries the uuid column\n",
+    "df_records_uuids.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>sha256</th>\n",
+       "      <th>subdir</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0168_01</td>\n",
+       "      <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ext       fn                                             sha256  \\\n",
+       "0  jpg  0089_01  a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...   \n",
+       "1  jpg  0168_01  e360f93613baa68cede6731d2603873cdabd3993841cfd...   \n",
+       "\n",
+       "         subdir  \n",
+       "0  test/n006211  \n",
+       "1  test/n006211  "
+      ]
+     },
+     "execution_count": 102,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# preview the first two rows of the records table read from fp_records\n",
+    "df_records.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add a placeholder 'index' column of empty strings to the records table\n",
+    "df_records['index'] = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# store every row's index label in the 'index' column with one vectorized\n",
+    "# assignment; a per-row iterrows()/.at loop is very slow on a large table\n",
+    "df_records['index'] = df_records.index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 128,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>sha256</th>\n",
+       "      <th>subdir</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>index</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0168_01</td>\n",
+       "      <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0213_01</td>\n",
+       "      <td>3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0010_01</td>\n",
+       "      <td>577ce218e4a61e612942c55fd172cac4b48becacbfc708...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0115_01</td>\n",
+       "      <td>b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       ext       fn                                             sha256  \\\n",
+       "index                                                                    \n",
+       "0      jpg  0089_01  a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...   \n",
+       "1      jpg  0168_01  e360f93613baa68cede6731d2603873cdabd3993841cfd...   \n",
+       "2      jpg  0213_01  3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c...   \n",
+       "3      jpg  0010_01  577ce218e4a61e612942c55fd172cac4b48becacbfc708...   \n",
+       "4      jpg  0115_01  b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f...   \n",
+       "\n",
+       "             subdir  \n",
+       "index                \n",
+       "0      test/n006211  \n",
+       "1      test/n006211  \n",
+       "2      test/n006211  \n",
+       "3      test/n006211  \n",
+       "4      test/n006211  "
+      ]
+     },
+     "execution_count": 128,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# write the records table to fp_records_new; to_csv keeps the row index\n",
+    "# as the first CSV column by default\n",
+    "df_records.to_csv(fp_records_new)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# start the files table from a copy of the records table\n",
+    "df_files = df_records.copy()\n",
+    "fp_files = join(DATA_STORE_NAS, dir_dataset, 'files.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 133,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# the files table keeps every column except the sha256 hash\n",
+    "df_files = df_files.drop(columns=['sha256'])\n",
+    "df_files.to_csv(fp_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make another CSV that holds only the sha256 column\n",
+    "df_sha256s = df_records.copy()\n",
+    "fp_sha256s = join(DATA_STORE_NAS, dir_dataset, 'sha256s.csv')\n",
+    "df_sha256s = df_sha256s.drop(['ext', 'fn', 'subdir'], axis=1)\n",
+    "df_sha256s.to_csv(fp_sha256s)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 141,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>sha256</th>\n",
+       "      <th>subdir</th>\n",
+       "      <th>uuid</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>88ac6abd-6039-442b-b31f-2db8d575363a</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0168_01</td>\n",
+       "      <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>73acbc00-2cb5-4260-8db3-b88ca7c29c72</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   ext       fn                                             sha256  \\\n",
+       "0  jpg  0089_01  a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...   \n",
+       "1  jpg  0168_01  e360f93613baa68cede6731d2603873cdabd3993841cfd...   \n",
+       "\n",
+       "         subdir                                  uuid  \n",
+       "0  test/n006211  88ac6abd-6039-442b-b31f-2db8d575363a  \n",
+       "1  test/n006211  73acbc00-2cb5-4260-8db3-b88ca7c29c72  "
+      ]
+     },
+     "execution_count": 141,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# create another CSV just for the UUIDs; first preview the source table\n",
+    "df_records_uuids.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 142,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# work on a copy so df_records_uuids itself is not reassigned\n",
+    "df_uuids = df_records_uuids.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 144,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# remove the file and hash columns from the uuid table\n",
+    "df_uuids = df_uuids.drop(columns=['subdir', 'fn', 'ext', 'sha256'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 145,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>uuid</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>88ac6abd-6039-442b-b31f-2db8d575363a</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>73acbc00-2cb5-4260-8db3-b88ca7c29c72</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                   uuid\n",
+       "0  88ac6abd-6039-442b-b31f-2db8d575363a\n",
+       "1  73acbc00-2cb5-4260-8db3-b88ca7c29c72"
+      ]
+     },
+     "execution_count": 145,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# preview the uuid table after the column drop\n",
+    "df_uuids.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 150,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# name the row index 'index', then write the uuid table to fp_uuids_new\n",
+    "df_uuids.index = df_uuids.index.rename('index')\n",
+    "df_uuids.to_csv(fp_uuids_new)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Trial run on the first two uuid records: find the df_records row with the\n",
+    "# same sha256 and store that row's index label in an 'idx' column.\n",
+    "# NOTE(review): the previous version filtered on an undefined `subdir` and wrote\n",
+    "# into `df_rois`, which is only loaded further down; confirm that matching on\n",
+    "# sha256 and writing into df_records_uuids is the intended behaviour.\n",
+    "df_preview = df_records_uuids[:2]\n",
+    "for idx, row in tqdm(df_preview.iterrows(), total=len(df_preview)):\n",
+    "  row_match = df_records.loc[df_records['sha256'] == row['sha256']]\n",
+    "  # index[0] takes the first match and raises IndexError when nothing matches\n",
+    "  df_records_uuids.at[idx, 'idx'] = int(row_match.index[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Change ROI to use index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 372,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load the ROI table from rois.csv in the dataset metadata directory\n",
+    "fp_rois = join(DATA_STORE_NAS, dir_dataset, 'rois.csv')\n",
+    "df_rois = pd.read_csv(fp_rois)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 373,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>sha256</th>\n",
+       "      <th>subdir</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>index</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0168_01</td>\n",
+       "      <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       ext       fn                                             sha256  \\\n",
+       "index                                                                    \n",
+       "0      jpg  0089_01  a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...   \n",
+       "1      jpg  0168_01  e360f93613baa68cede6731d2603873cdabd3993841cfd...   \n",
+       "\n",
+       "             subdir  \n",
+       "index                \n",
+       "0      test/n006211  \n",
+       "1      test/n006211  "
+      ]
+     },
+     "execution_count": 373,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# label the row index of the records table 'index' and preview the result\n",
+    "df_records.index.name = 'index'\n",
+    "df_records.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 374,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>h</th>\n",
+       "      <th>image_height</th>\n",
+       "      <th>image_width</th>\n",
+       "      <th>subdir</th>\n",
+       "      <th>w</th>\n",
+       "      <th>x</th>\n",
+       "      <th>y</th>\n",
+       "      <th>index_new</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>index</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>0.473333</td>\n",
+       "      <td>304</td>\n",
+       "      <td>214</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.668246</td>\n",
+       "      <td>0.279621</td>\n",
+       "      <td>0.28</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>0.326667</td>\n",
+       "      <td>304</td>\n",
+       "      <td>214</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.464455</td>\n",
+       "      <td>-0.156398</td>\n",
+       "      <td>0.12</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       Unnamed: 0  ext       fn         h  image_height  image_width  \\\n",
+       "index                                                                  \n",
+       "0               0  jpg  0089_01  0.473333           304          214   \n",
+       "1               1  jpg  0089_01  0.326667           304          214   \n",
+       "\n",
+       "             subdir         w         x     y  index_new  \n",
+       "index                                                     \n",
+       "0      test/n006211  0.668246  0.279621  0.28         -1  \n",
+       "1      test/n006211  0.464455 -0.156398  0.12         -1  "
+      ]
+     },
+     "execution_count": 374,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# add an 'index_new' column filled with the placeholder value -1,\n",
+    "# name the row index 'index', and preview the table\n",
+    "df_rois['index_new'] = -1\n",
+    "df_rois.index = df_rois.index.rename('index')\n",
+    "df_rois.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 375,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# independent copy of the records table\n",
+    "df_records_copy = df_records.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 376,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "9131\n"
+     ]
+    }
+   ],
+   "source": [
+    "# group the records by their 'subdir' column and print the number of groups\n",
+    "df_records_subdirs = df_records_copy.groupby('subdir')\n",
+    "print(len(df_records_subdirs))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 377,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "9131\n"
+     ]
+    }
+   ],
+   "source": [
+    "# group the ROI rows by their 'subdir' column and print the number of groups\n",
+    "roi_subdir_groups = df_rois.groupby('subdir')\n",
+    "print(len(roi_subdir_groups))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 387,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "module 'pandas' has no attribute 'index'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-387-82023aa58c79>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mhelp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m: module 'pandas' has no attribute 'index'"
+     ]
+    }
+   ],
+   "source": [
+    "# the pandas class is pd.Index; pd.index does not exist (AttributeError)\n",
+    "help(pd.Index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 390,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "81817\n"
+     ]
+    }
+   ],
+   "source": [
+    "# NOTE(review): no earlier cell defines a `row` with an `.Index` attribute; the\n",
+    "# ROI-matching loop in the following cell assigns one, so this debug cell only\n",
+    "# works after that loop has run. Confirm whether it is still needed.\n",
+    "print(row.Index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 392,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "91680fc6bee04ce087a60be57ab5a58c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(IntProgress(value=0, max=9131), HTML(value='')))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# For every ROI row, look up the index label of the record with the same\n",
+    "# subdir and filename (fn) and store it in df_rois['index_new'].\n",
+    "for subdir, record_group in tqdm(df_records_subdirs, total=df_records_subdirs.ngroups):\n",
+    "  roi_group = roi_subdir_groups.get_group(subdir)  # KeyError if the subdir has no ROI rows\n",
+    "  # fn -> record index label for this subdir; setdefault keeps the first\n",
+    "  # occurrence, the same row the former `row_match.index[0]` lookup selected\n",
+    "  fn_to_index = {}\n",
+    "  for record_index, fn in zip(record_group.index, record_group['fn']):\n",
+    "    fn_to_index.setdefault(fn, record_index)\n",
+    "  # one dict lookup per ROI row instead of a boolean-mask scan of the whole\n",
+    "  # group; a filename that is missing from the records raises KeyError\n",
+    "  for row in roi_group.itertuples():\n",
+    "    df_rois.at[row.Index, 'index_new'] = int(fn_to_index[row.fn])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 411,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>h</th>\n",
+       "      <th>image_height</th>\n",
+       "      <th>image_width</th>\n",
+       "      <th>subdir</th>\n",
+       "      <th>w</th>\n",
+       "      <th>x</th>\n",
+       "      <th>y</th>\n",
+       "      <th>index_new</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>index</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>0.473333</td>\n",
+       "      <td>304</td>\n",
+       "      <td>214</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.668246</td>\n",
+       "      <td>0.279621</td>\n",
+       "      <td>0.28</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>0.326667</td>\n",
+       "      <td>304</td>\n",
+       "      <td>214</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.464455</td>\n",
+       "      <td>-0.156398</td>\n",
+       "      <td>0.12</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       Unnamed: 0  ext       fn         h  image_height  image_width  \\\n",
+       "index                                                                  \n",
+       "0               0  jpg  0089_01  0.473333           304          214   \n",
+       "1               1  jpg  0089_01  0.326667           304          214   \n",
+       "\n",
+       "             subdir         w         x     y  index_new  \n",
+       "index                                                     \n",
+       "0      test/n006211  0.668246  0.279621  0.28          0  \n",
+       "1      test/n006211  0.464455 -0.156398  0.12          0  "
+      ]
+     },
+     "execution_count": 411,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# continue on a copy of the ROI table and preview it\n",
+    "df_rois_new = df_rois.copy()\n",
+    "df_rois_new.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 413,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop every column whose name contains 'unnamed' (case-insensitive)\n",
+    "unnamed_columns = df_rois_new.columns[df_rois_new.columns.str.contains('unnamed', case=False)]\n",
+    "df_rois_new = df_rois_new.drop(columns=unnamed_columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 422,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make 'index_new' the row index; re-running this cell raises KeyError because\n",
+    "# set_index removes the column on the first run\n",
+    "df_rois_new = df_rois_new.set_index('index_new')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 423,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ext', 'fn', 'h', 'image_height', 'image_width', 'subdir', 'w', 'x', 'y']"
+      ]
+     },
+     "execution_count": 423,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# column names of the re-indexed ROI table as a plain list\n",
+    "df_rois_new.columns.tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 425,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>h</th>\n",
+       "      <th>image_height</th>\n",
+       "      <th>image_width</th>\n",
+       "      <th>subdir</th>\n",
+       "      <th>w</th>\n",
+       "      <th>x</th>\n",
+       "      <th>y</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>index_new</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>0.473333</td>\n",
+       "      <td>304</td>\n",
+       "      <td>214</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.668246</td>\n",
+       "      <td>0.279621</td>\n",
+       "      <td>0.280000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>0.326667</td>\n",
+       "      <td>304</td>\n",
+       "      <td>214</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.464455</td>\n",
+       "      <td>-0.156398</td>\n",
+       "      <td>0.120000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0168_01</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>471</td>\n",
+       "      <td>419</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.443609</td>\n",
+       "      <td>0.263158</td>\n",
+       "      <td>0.273333</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0213_01</td>\n",
+       "      <td>0.462745</td>\n",
+       "      <td>408</td>\n",
+       "      <td>480</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>0.246667</td>\n",
+       "      <td>0.082353</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0115_01</td>\n",
+       "      <td>0.438662</td>\n",
+       "      <td>360</td>\n",
+       "      <td>401</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>0.286667</td>\n",
+       "      <td>0.245353</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           ext       fn         h  image_height  image_width        subdir  \\\n",
+       "index_new                                                                    \n",
+       "0          jpg  0089_01  0.473333           304          214  test/n006211   \n",
+       "0          jpg  0089_01  0.326667           304          214  test/n006211   \n",
+       "1          jpg  0168_01  0.393333           471          419  test/n006211   \n",
+       "2          jpg  0213_01  0.462745           408          480  test/n006211   \n",
+       "4          jpg  0115_01  0.438662           360          401  test/n006211   \n",
+       "\n",
+       "                  w         x         y  \n",
+       "index_new                                \n",
+       "0          0.668246  0.279621  0.280000  \n",
+       "0          0.464455 -0.156398  0.120000  \n",
+       "1          0.443609  0.263158  0.273333  \n",
+       "2          0.393333  0.246667  0.082353  \n",
+       "4          0.393333  0.286667  0.245353  "
+      ]
+     },
+     "execution_count": 425,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_rois_new.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 426,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_rois_new = join(DATA_STORE_NAS, dir_dataset, 'rois_new.csv')\n",
+    "df_rois_new.to_csv(fp_rois_new)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fix identity meta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 458,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# identity metadata keeps the default integer index; the file list is indexed by its 'index' column\n",
+    "df_identity_meta = pd.read_csv(fp_identity_meta)\n",
+    "df_files = pd.read_csv(fp_files, index_col='index')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 459,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>subdir</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>index</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0168_01</td>\n",
+       "      <td>test/n006211</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       ext       fn        subdir\n",
+       "index                            \n",
+       "0      jpg  0089_01  test/n006211\n",
+       "1      jpg  0168_01  test/n006211"
+      ]
+     },
+     "execution_count": 459,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_files.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 460,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>class_id</th>\n",
+       "      <th>description</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>images</th>\n",
+       "      <th>name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>n009279</td>\n",
+       "      <td>Former soccer player</td>\n",
+       "      <td>f</td>\n",
+       "      <td>365</td>\n",
+       "      <td>Noriko Baba</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>n009278</td>\n",
+       "      <td>Japanese singer-songwriter</td>\n",
+       "      <td>f</td>\n",
+       "      <td>181</td>\n",
+       "      <td>Hiromi Satō</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  class_id                 description gender  images         name\n",
+       "0  n009279        Former soccer player      f     365  Noriko Baba\n",
+       "1  n009278  Japanese singer-songwriter      f     181  Hiromi Satō"
+      ]
+     },
+     "execution_count": 460,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_identity_meta.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 461,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# lookup table: class id (second '/'-separated component of `subdir`) -> index of the first file row with that id\n",
+    "class_ids = {}\n",
+    "for row in df_files.itertuples():\n",
+    "  class_id = row.subdir.split('/')[1]\n",
+    "  # setdefault keeps the first index seen and ignores later rows of the same class\n",
+    "  class_ids.setdefault(class_id, row.Index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 463,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# initialise the new column to -1 on every row (the scalar is broadcast)\n",
+    "df_identity_meta['index_new'] = -1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 464,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fill 'index_new' with the file index stored in class_ids for each row's class_id\n",
+    "# (a class_id missing from class_ids raises KeyError)\n",
+    "df_identity_meta['index_new'] = [class_ids[class_id] for class_id in df_identity_meta['class_id']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 465,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>class_id</th>\n",
+       "      <th>description</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>images</th>\n",
+       "      <th>name</th>\n",
+       "      <th>index_new</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>n009279</td>\n",
+       "      <td>Former soccer player</td>\n",
+       "      <td>f</td>\n",
+       "      <td>365</td>\n",
+       "      <td>Noriko Baba</td>\n",
+       "      <td>1808008</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>n009278</td>\n",
+       "      <td>Japanese singer-songwriter</td>\n",
+       "      <td>f</td>\n",
+       "      <td>181</td>\n",
+       "      <td>Hiromi Satō</td>\n",
+       "      <td>943052</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>n009277</td>\n",
+       "      <td>Japanese fashion model</td>\n",
+       "      <td>m</td>\n",
+       "      <td>409</td>\n",
+       "      <td>Ranko Kanbe</td>\n",
+       "      <td>595852</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>n009276</td>\n",
+       "      <td>Japanese musician</td>\n",
+       "      <td>f</td>\n",
+       "      <td>177</td>\n",
+       "      <td>Yurie Matsui</td>\n",
+       "      <td>2922103</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>n009275</td>\n",
+       "      <td>Japanese idol</td>\n",
+       "      <td>f</td>\n",
+       "      <td>501</td>\n",
+       "      <td>Karin Miyamoto</td>\n",
+       "      <td>1388262</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  class_id                 description gender  images            name  \\\n",
+       "0  n009279        Former soccer player      f     365     Noriko Baba   \n",
+       "1  n009278  Japanese singer-songwriter      f     181     Hiromi Satō   \n",
+       "2  n009277      Japanese fashion model      m     409     Ranko Kanbe   \n",
+       "3  n009276           Japanese musician      f     177    Yurie Matsui   \n",
+       "4  n009275               Japanese idol      f     501  Karin Miyamoto   \n",
+       "\n",
+       "   index_new  \n",
+       "0    1808008  \n",
+       "1     943052  \n",
+       "2     595852  \n",
+       "3    2922103  \n",
+       "4    1388262  "
+      ]
+     },
+     "execution_count": 465,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# set_index returns a new frame rather than modifying in place, so the result\n",
+    "# must be assigned for 'index_new' to actually become the index\n",
+    "df_identity_meta = df_identity_meta.set_index('index_new')\n",
+    "df_identity_meta.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 466,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_identity_meta_new = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')\n",
+    "df_identity_meta.to_csv(fp_identity_meta_new)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Give every ROI row the index of its record, matching on the (subdir, fn) pair.\n",
+    "# One dictionary pass over df_records replaces the per-row boolean scans of the\n",
+    "# whole table, which were too slow to finish on the full data set.\n",
+    "record_index_by_key = {}\n",
+    "for key, record_idx in zip(zip(df_records['subdir'], df_records['fn']), df_records.index):\n",
+    "  # the first record wins when a (subdir, fn) pair occurs more than once\n",
+    "  record_index_by_key.setdefault(key, record_idx)\n",
+    "\n",
+    "# a ROI without a matching record raises KeyError naming the missing pair\n",
+    "df_rois['index'] = [record_index_by_key[key] for key in zip(df_rois['subdir'], df_rois['fn'])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 184,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# rows of both tables whose 'subdir' equals the chosen value\n",
+    "subdir = 'test/n006211'\n",
+    "rows_records = df_records[df_records['subdir'].eq(subdir)]\n",
+    "rows_rois = df_rois[df_rois['subdir'].eq(subdir)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 181,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>ext</th>\n",
+       "      <th>fn</th>\n",
+       "      <th>h</th>\n",
+       "      <th>image_height</th>\n",
+       "      <th>image_width</th>\n",
+       "      <th>subdir</th>\n",
+       "      <th>w</th>\n",
+       "      <th>x</th>\n",
+       "      <th>y</th>\n",
+       "      <th>index</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>0.473333</td>\n",
+       "      <td>304</td>\n",
+       "      <td>214</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.668246</td>\n",
+       "      <td>0.279621</td>\n",
+       "      <td>0.280000</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0089_01</td>\n",
+       "      <td>0.326667</td>\n",
+       "      <td>304</td>\n",
+       "      <td>214</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.464455</td>\n",
+       "      <td>-0.156398</td>\n",
+       "      <td>0.120000</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0168_01</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>471</td>\n",
+       "      <td>419</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.443609</td>\n",
+       "      <td>0.263158</td>\n",
+       "      <td>0.273333</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0213_01</td>\n",
+       "      <td>0.462745</td>\n",
+       "      <td>408</td>\n",
+       "      <td>480</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>0.246667</td>\n",
+       "      <td>0.082353</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0115_01</td>\n",
+       "      <td>0.438662</td>\n",
+       "      <td>360</td>\n",
+       "      <td>401</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>0.286667</td>\n",
+       "      <td>0.245353</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>5</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0511_02</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>417</td>\n",
+       "      <td>415</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.395973</td>\n",
+       "      <td>0.265101</td>\n",
+       "      <td>0.233333</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>6</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0032_01</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>143</td>\n",
+       "      <td>125</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.450382</td>\n",
+       "      <td>0.255725</td>\n",
+       "      <td>0.313333</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>7</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0502_01</td>\n",
+       "      <td>0.326667</td>\n",
+       "      <td>288</td>\n",
+       "      <td>252</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.377863</td>\n",
+       "      <td>0.603053</td>\n",
+       "      <td>0.583333</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>8</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0502_01</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>288</td>\n",
+       "      <td>252</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.450382</td>\n",
+       "      <td>0.301527</td>\n",
+       "      <td>0.313333</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>9</td>\n",
+       "      <td>jpg</td>\n",
+       "      <td>0201_01</td>\n",
+       "      <td>0.393333</td>\n",
+       "      <td>187</td>\n",
+       "      <td>164</td>\n",
+       "      <td>test/n006211</td>\n",
+       "      <td>0.448669</td>\n",
+       "      <td>0.243346</td>\n",
+       "      <td>0.313333</td>\n",
+       "      <td>9</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Unnamed: 0  ext       fn         h  image_height  image_width  \\\n",
+       "0           0  jpg  0089_01  0.473333           304          214   \n",
+       "1           1  jpg  0089_01  0.326667           304          214   \n",
+       "2           2  jpg  0168_01  0.393333           471          419   \n",
+       "3           3  jpg  0213_01  0.462745           408          480   \n",
+       "4           4  jpg  0115_01  0.438662           360          401   \n",
+       "5           5  jpg  0511_02  0.393333           417          415   \n",
+       "6           6  jpg  0032_01  0.393333           143          125   \n",
+       "7           7  jpg  0502_01  0.326667           288          252   \n",
+       "8           8  jpg  0502_01  0.393333           288          252   \n",
+       "9           9  jpg  0201_01  0.393333           187          164   \n",
+       "\n",
+       "         subdir         w         x         y index  \n",
+       "0  test/n006211  0.668246  0.279621  0.280000     0  \n",
+       "1  test/n006211  0.464455 -0.156398  0.120000     0  \n",
+       "2  test/n006211  0.443609  0.263158  0.273333     1  \n",
+       "3  test/n006211  0.393333  0.246667  0.082353     2  \n",
+       "4  test/n006211  0.393333  0.286667  0.245353     4  \n",
+       "5  test/n006211  0.395973  0.265101  0.233333     6  \n",
+       "6  test/n006211  0.450382  0.255725  0.313333     7  \n",
+       "7  test/n006211  0.377863  0.603053  0.583333     8  \n",
+       "8  test/n006211  0.450382  0.301527  0.313333     8  \n",
+       "9  test/n006211  0.448669  0.243346  0.313333     9  "
+      ]
+     },
+     "execution_count": 181,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_rois.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_meta.to_csv(fp_out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build the path from DATA_STORE_NAS like the other metadata paths in this notebook;\n",
+    "# os.path.join discards DATA_STORE_NAS if dir_dataset is already an absolute path\n",
+    "fp_meta = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_kg_clean.csv')\n",
+    "df = pd.read_csv(fp_meta)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# every value of the 'description' column, in row order\n",
+    "descs = df['description'].tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "descs = set(descs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 472,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fp_identity_meta = join(DATA_STORE_NAS, dir_dataset, 'identity_meta_indexed.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 473,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_identity_meta = pd.read_csv(fp_identity_meta)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 474,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>class_id</th>\n",
+       "      <th>description</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>images</th>\n",
+       "      <th>name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>n000001</td>\n",
+       "      <td>Dalai Lama</td>\n",
+       "      <td>m</td>\n",
+       "      <td>424</td>\n",
+       "      <td>14th Dalai Lama</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>n000002</td>\n",
+       "      <td>American singer-songwriter</td>\n",
+       "      <td>f</td>\n",
+       "      <td>315</td>\n",
+       "      <td>A Fine Frenzy</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>n000003</td>\n",
+       "      <td>British writer</td>\n",
+       "      <td>m</td>\n",
+       "      <td>205</td>\n",
+       "      <td>A. A. Gill</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>n000004</td>\n",
+       "      <td>Canadian-Irish actor</td>\n",
+       "      <td>m</td>\n",
+       "      <td>387</td>\n",
+       "      <td>AJ Buckley</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>n000005</td>\n",
+       "      <td>Baseball catcher</td>\n",
+       "      <td>m</td>\n",
+       "      <td>229</td>\n",
+       "      <td>AJ Pierzynski</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  class_id                 description gender  images             name\n",
+       "0  n000001                  Dalai Lama      m     424  14th Dalai Lama\n",
+       "1  n000002  American singer-songwriter      f     315    A Fine Frenzy\n",
+       "2  n000003              British writer      m     205       A. A. Gill\n",
+       "3  n000004        Canadian-Irish actor      m     387       AJ Buckley\n",
+       "4  n000005            Baseball catcher      m     229    AJ Pierzynski"
+      ]
+     },
+     "execution_count": 474,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five rows of the identity metadata\n",
+    "df_identity_meta.head(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 475,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Name the row index so to_csv() writes it under an 'index' header\n",
+    "df_identity_meta.index.rename('index', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 476,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Overwrites the CSV that was loaded above; the index is written as the first column\n",
+    "df_identity_meta.to_csv(fp_identity_meta, index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:megapixels]",
+   "language": "python",
+   "name": "conda-env-megapixels-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}