summaryrefslogtreecommitdiff
path: root/megapixels/notebooks/datasets/vgg_face2/clean_vgg_02.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/notebooks/datasets/vgg_face2/clean_vgg_02.ipynb')
-rw-r--r--megapixels/notebooks/datasets/vgg_face2/clean_vgg_02.ipynb460
1 files changed, 460 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/vgg_face2/clean_vgg_02.ipynb b/megapixels/notebooks/datasets/vgg_face2/clean_vgg_02.ipynb
new file mode 100644
index 00000000..008eee2e
--- /dev/null
+++ b/megapixels/notebooks/datasets/vgg_face2/clean_vgg_02.ipynb
@@ -0,0 +1,460 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Append UUID to SHA256 CSV"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from os.path import join\n",
+ "from pathlib import Path\n",
+ "import difflib\n",
+ "\n",
+ "from tqdm import tqdm_notebook as tqdm\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# names\n",
+ "DATA_STORE_NAS = '/data_store_nas/'\n",
+ "dir_dataset = 'datasets/people/vgg_face2/metadata'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>sha256</th>\n",
+ " <th>identity</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n",
+ " <td>-1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n",
+ " <td>-1</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " sha256 identity\n",
+ "index \n",
+ "0 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... -1\n",
+ "1 e360f93613baa68cede6731d2603873cdabd3993841cfd... -1"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# associate each file with an identity\n",
+ "fp_index = join(DATA_STORE_NAS, dir_dataset, 'index.csv')\n",
+ "df_index = pd.read_csv(fp_index).set_index('index')\n",
+ "df_index['identity'] = [-1] * len(df_index)\n",
+ "df_index.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3311286\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>ext</th>\n",
+ " <th>fn</th>\n",
+ " <th>subdir</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>jpg</td>\n",
+ " <td>0089_01</td>\n",
+ " <td>test/n006211</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>jpg</td>\n",
+ " <td>0168_01</td>\n",
+ " <td>test/n006211</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>jpg</td>\n",
+ " <td>0213_01</td>\n",
+ " <td>test/n006211</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>jpg</td>\n",
+ " <td>0010_01</td>\n",
+ " <td>test/n006211</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>jpg</td>\n",
+ " <td>0115_01</td>\n",
+ " <td>test/n006211</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " ext fn subdir\n",
+ "index \n",
+ "0 jpg 0089_01 test/n006211\n",
+ "1 jpg 0168_01 test/n006211\n",
+ "2 jpg 0213_01 test/n006211\n",
+ "3 jpg 0010_01 test/n006211\n",
+ "4 jpg 0115_01 test/n006211"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get file info\n",
+ "fp_files = join(DATA_STORE_NAS, dir_dataset, 'files.csv')\n",
+ "df_files = pd.read_csv(fp_files).set_index('index')\n",
+ "print(len(df_files))\n",
+ "df_files.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9131\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>class_id</th>\n",
+ " <th>description</th>\n",
+ " <th>gender</th>\n",
+ " <th>images</th>\n",
+ " <th>name</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>n000001</td>\n",
+ " <td>Dalai Lama</td>\n",
+ " <td>m</td>\n",
+ " <td>424</td>\n",
+ " <td>14th Dalai Lama</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>n000002</td>\n",
+ " <td>American singer-songwriter</td>\n",
+ " <td>f</td>\n",
+ " <td>315</td>\n",
+ " <td>A Fine Frenzy</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " class_id description gender images name\n",
+ "index \n",
+ "0 n000001 Dalai Lama m 424 14th Dalai Lama\n",
+ "1 n000002 American singer-songwriter f 315 A Fine Frenzy"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fp_identities = join(DATA_STORE_NAS, dir_dataset, 'identities.csv')\n",
+ "df_identities = pd.read_csv(fp_identities).set_index('index')\n",
+ "print(len(df_identities))\n",
+ "df_identities.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a031effa76034e8c88c686c3c8dff2e6",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=3311286), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-57-cab4feea6e04>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_index\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitertuples\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtotal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mfile_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIndex\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mfile_row\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf_files\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfile_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0msubdir\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfile_row\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'subdir'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mclass_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msubdir\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1477\u001b[0m \u001b[0mmaybe_callable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1478\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmaybe_callable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1479\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1480\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_is_scalar_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 2102\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2103\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2104\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2106\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_convert_to_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_setter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_get_loc\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 145\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ixs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 146\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_slice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_ixs\u001b[0;34m(self, i, axis)\u001b[0m\n\u001b[1;32m 2624\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2625\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2626\u001b[0;31m dtype=new_values.dtype)\n\u001b[0m\u001b[1;32m 2627\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_is_copy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2628\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, index, dtype, name, copy, fastpath)\u001b[0m\n\u001b[1;32m 275\u001b[0m raise_cast_failure=True)\n\u001b[1;32m 276\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 277\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSingleBlockManager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfastpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 278\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0mgeneric\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNDFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfastpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, block, axis, do_integrity_check, fastpath)\u001b[0m\n\u001b[1;32m 4675\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4676\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBlock\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4677\u001b[0;31m \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_block\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mslice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4678\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4679\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblocks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mmake_block\u001b[0;34m(values, placement, klass, ndim, dtype, fastpath)\u001b[0m\n\u001b[1;32m 3203\u001b[0m placement=placement, dtype=dtype)\n\u001b[1;32m 3204\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3205\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mklass\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mndim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplacement\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3206\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3207\u001b[0m \u001b[0;31m# TODO: flexible with index=None and/or items=None\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, values, placement, ndim)\u001b[0m\n\u001b[1;32m 2301\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2302\u001b[0m super(ObjectBlock, self).__init__(values, ndim=ndim,\n\u001b[0;32m-> 2303\u001b[0;31m placement=placement)\n\u001b[0m\u001b[1;32m 2304\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2305\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, values, placement, ndim)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_ndim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 118\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmgr_locs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mplacement\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 119\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/anaconda3/envs/megapixels/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mmgr_locs\u001b[0;34m(self, new_mgr_locs)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmgr_locs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnew_mgr_locs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_mgr_locs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBlockPlacement\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m \u001b[0mnew_mgr_locs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBlockPlacement\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_mgr_locs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 241\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_mgr_locs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_mgr_locs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "for row in tqdm(df_index.itertuples(), total=len(df_index)):\n",
+ " file_index = row.Index\n",
+ " file_row = df_files.iloc[file_index]\n",
+ " subdir = file_row['subdir']\n",
+ " class_id = subdir.split('/')[1]\n",
+ " identity_row = df_identities.loc[(df_identities['class_id'] == class_id)]\n",
+ " identity_index = int(identity_row.index[0])\n",
+ " df_index.at[row.Index, 'identity'] = identity_index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>sha256</th>\n",
+ " <th>identity</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>index</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10...</td>\n",
+ " <td>6123</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>e360f93613baa68cede6731d2603873cdabd3993841cfd...</td>\n",
+ " <td>-1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c...</td>\n",
+ " <td>-1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>577ce218e4a61e612942c55fd172cac4b48becacbfc708...</td>\n",
+ " <td>-1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f...</td>\n",
+ " <td>-1</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " sha256 identity\n",
+ "index \n",
+ "0 a39a1df855cb0c70dc553c5e9afa35b4f7c00f4011ca10... 6123\n",
+ "1 e360f93613baa68cede6731d2603873cdabd3993841cfd... -1\n",
+ "2 3920a8bdf523a5bf7da9258ec414a741462d0cfbec8d2c... -1\n",
+ "3 577ce218e4a61e612942c55fd172cac4b48becacbfc708... -1\n",
+ "4 b27d37425a4e59dc4d37c3df331d0b69e4919338a3d46f... -1"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_index.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:megapixels]",
+ "language": "python",
+ "name": "conda-env-megapixels-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}