diff options
Diffstat (limited to 'megapixels/notebooks/datasets/sha_uuid-Copy1.ipynb')
| -rw-r--r-- | megapixels/notebooks/datasets/sha_uuid-Copy1.ipynb | 728 |
1 files changed, 728 insertions, 0 deletions
diff --git a/megapixels/notebooks/datasets/sha_uuid-Copy1.ipynb b/megapixels/notebooks/datasets/sha_uuid-Copy1.ipynb new file mode 100644 index 00000000..21f59722 --- /dev/null +++ b/megapixels/notebooks/datasets/sha_uuid-Copy1.ipynb @@ -0,0 +1,728 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Append UUID to SHA256 CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "import math\n", + "from glob import glob\n", + "from random import randint\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels/')\n", + "from app.utils import file_utils" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_STORE = '/data_store_ssd/'\n", + "dir_dataset = join(DATA_STORE, 'apps/megapixels/datasets/lfw')\n", + "fp_shas = join(dir_dataset, 'records.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>sha256</th>\n", + " <th>subdir</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>AJ_Cook_0001</td>\n", + " 
<td>550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...</td>\n", + " <td>AJ_Cook</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>AJ_Lamas_0001</td>\n", + " <td>46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...</td>\n", + " <td>AJ_Lamas</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Eckhart_0001</td>\n", + " <td>b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...</td>\n", + " <td>Aaron_Eckhart</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Guiel_0001</td>\n", + " <td>156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...</td>\n", + " <td>Aaron_Guiel</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Patterson_0001</td>\n", + " <td>34dfe798220b53aac910e5e39705770d212cdfbe4be8a4...</td>\n", + " <td>Aaron_Patterson</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn \\\n", + "0 jpg AJ_Cook_0001 \n", + "1 jpg AJ_Lamas_0001 \n", + "2 jpg Aaron_Eckhart_0001 \n", + "3 jpg Aaron_Guiel_0001 \n", + "4 jpg Aaron_Patterson_0001 \n", + "\n", + " sha256 subdir \n", + "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", + "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", + "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", + "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", + "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... 
Aaron_Patterson " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load names\n", + "df_records = pd.read_csv(fp_shas)\n", + "records = df_records.to_dict('index')\n", + "df_records.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import base64\n", + "\n", + "# get a UUID - URL safe, Base64\n", + "def b64uuid():\n", + "    \"\"\"Return a random UUID4 as a URL-safe Base64 string with the '=' padding removed.\"\"\"\n", + "    # urlsafe_b64encode() returns bytes; decode to str first, because\n", + "    # bytes.replace('=', '') raises TypeError on Python 3\n", + "    r_uuid = base64.urlsafe_b64encode(uuid.uuid4().bytes).decode('ascii')\n", + "    # '=' only ever occurs as trailing padding in Base64 output\n", + "    return r_uuid.rstrip('=')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "UUID('05ba06b3-875e-429a-ac39-02b129b77d71')" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uuid.uuid4()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# append a UUID to every entry\n", + "# (each record dict is mutated in place, so iterating over the values is enough)\n", + "for record in records.values():\n", + "    record['uuid'] = uuid.uuid4()\n", + "# save to csv\n", + "fp_sha_uuid = join(dir_dataset, 'records_uuid.csv')\n", + "df_uuid = pd.DataFrame(list(records.values())) # ignore the indices\n", + "df_uuid.to_csv(fp_sha_uuid, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [], + "source": [ + "fp = '/data_store_ssd/apps/megapixels/datasets/lfw/embeddings_arr_test.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(fp)" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['a', 'a', 'a', 'a', 'a']" + ] + }, + "execution_count": 180, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "['a'] * 5" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>h</th>\n", + " <th>image_height</th>\n", + " <th>image_width</th>\n", + " <th>subdir</th>\n", + " <th>vec</th>\n", + " <th>w</th>\n", + " <th>x</th>\n", + " <th>y</th>\n", + " <th>newcol</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>jpg</td>\n", + " <td>AJ_Cook_0001</td>\n", + " <td>0.330000</td>\n", + " <td>250</td>\n", + " <td>250</td>\n", + " <td>AJ_Cook</td>\n", + " <td>-0.07324773073196411, 0.150736004114151, 0.006...</td>\n", + " <td>0.330000</td>\n", + " <td>0.336667</td>\n", + " <td>0.350000</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>AJ_Lamas_0001</td>\n", + " <td>0.393333</td>\n", + " <td>250</td>\n", + " <td>250</td>\n", + " <td>AJ_Lamas</td>\n", + " <td>-0.12234891951084137, 0.06931854784488678, 0.0...</td>\n", + " <td>0.393333</td>\n", + " <td>0.286667</td>\n", + " <td>0.313333</td>\n", + " <td></td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn h image_height image_width subdir \\\n", + "0 jpg AJ_Cook_0001 0.330000 250 250 AJ_Cook \n", + "1 jpg AJ_Lamas_0001 0.393333 250 250 AJ_Lamas \n", + "\n", 
+ " vec w x \\\n", + "0 -0.07324773073196411, 0.150736004114151, 0.006... 0.330000 0.336667 \n", + "1 -0.12234891951084137, 0.06931854784488678, 0.0... 0.393333 0.286667 \n", + "\n", + " y newcol \n", + "0 0.350000 10 \n", + "1 0.313333 " + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'list'>\n", + "128\n" + ] + } + ], + "source": [ + "for idx, row in df.iterrows():\n", + " vec = row['vec'].split(',')\n", + " print(type(vec))\n", + " print(len(vec))\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [], + "source": [ + "fp_out = '/data_store_ssd/apps/megapixels/datasets/lfw/embeddings_arr_test_idx.csv'\n", + "df.to_csv(fp_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [], + "source": [ + "fp_in = '/data_store_ssd/apps/megapixels/datasets/lfw/records.csv'\n", + "fp_out = '/data_store_ssd/apps/megapixels/datasets/lfw/records_idx.csv'\n", + "df = pd.read_csv(fp_in)\n", + "df.to_csv(fp_out, index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[(df['fn'] == 'AJ_Cook_0001') & (df['subdir'] == 'AJ_Cook'), 'ext'] = 'wow'" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " 
<th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>sha256</th>\n", + " <th>subdir</th>\n", + " <th>uuid</th>\n", + " <th>newcol</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>wow</td>\n", + " <td>AJ_Cook_0001</td>\n", + " <td>550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...</td>\n", + " <td>AJ_Cook</td>\n", + " <td>f03fd921-2d56-4e83-8115-f658d6a72287</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>AJ_Lamas_0001</td>\n", + " <td>46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...</td>\n", + " <td>AJ_Lamas</td>\n", + " <td>0c96c5bb-dbd1-4584-bd68-af11664b98bb</td>\n", + " <td>x</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Eckhart_0001</td>\n", + " <td>b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...</td>\n", + " <td>Aaron_Eckhart</td>\n", + " <td>8221e75c-9537-4a4f-9693-483b445244b4</td>\n", + " <td>x</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Guiel_0001</td>\n", + " <td>156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...</td>\n", + " <td>Aaron_Guiel</td>\n", + " <td>a2955610-ed5e-433c-bdd4-e3a72ff44736</td>\n", + " <td>x</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Patterson_0001</td>\n", + " <td>34dfe798220b53aac910e5e39705770d212cdfbe4be8a4...</td>\n", + " <td>Aaron_Patterson</td>\n", + " <td>1d0782e9-ed16-4550-b1e9-d9c03eef6181</td>\n", + " <td>x</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn \\\n", + "0 wow AJ_Cook_0001 \n", + "1 jpg AJ_Lamas_0001 \n", + "2 jpg Aaron_Eckhart_0001 \n", + "3 jpg Aaron_Guiel_0001 \n", + "4 jpg Aaron_Patterson_0001 \n", + "\n", + " sha256 subdir \\\n", + "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", + "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... 
AJ_Lamas \n", + "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", + "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", + "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n", + "\n", + " uuid newcol \n", + "0 f03fd921-2d56-4e83-8115-f658d6a72287 10 \n", + "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb x \n", + "2 8221e75c-9537-4a4f-9693-483b445244b4 x \n", + "3 a2955610-ed5e-433c-bdd4-e3a72ff44736 x \n", + "4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 x " + ] + }, + "execution_count": 208, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['newcol'] = ['x'] * len(df)\n", + "df.at[0, 'newcol'] = '10'\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ext</th>\n", + " <th>fn</th>\n", + " <th>sha256</th>\n", + " <th>subdir</th>\n", + " <th>uuid</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>wow</td>\n", + " <td>AJ_Cook_0001</td>\n", + " <td>550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...</td>\n", + " <td>AJ_Cook</td>\n", + " <td>f03fd921-2d56-4e83-8115-f658d6a72287</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>jpg</td>\n", + " <td>AJ_Lamas_0001</td>\n", + " <td>46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...</td>\n", + " <td>AJ_Lamas</td>\n", + " <td>0c96c5bb-dbd1-4584-bd68-af11664b98bb</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Eckhart_0001</td>\n", + " 
<td>b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...</td>\n", + " <td>Aaron_Eckhart</td>\n", + " <td>8221e75c-9537-4a4f-9693-483b445244b4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Guiel_0001</td>\n", + " <td>156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...</td>\n", + " <td>Aaron_Guiel</td>\n", + " <td>a2955610-ed5e-433c-bdd4-e3a72ff44736</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>jpg</td>\n", + " <td>Aaron_Patterson_0001</td>\n", + " <td>34dfe798220b53aac910e5e39705770d212cdfbe4be8a4...</td>\n", + " <td>Aaron_Patterson</td>\n", + " <td>1d0782e9-ed16-4550-b1e9-d9c03eef6181</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " ext fn \\\n", + "0 wow AJ_Cook_0001 \n", + "1 jpg AJ_Lamas_0001 \n", + "2 jpg Aaron_Eckhart_0001 \n", + "3 jpg Aaron_Guiel_0001 \n", + "4 jpg Aaron_Patterson_0001 \n", + "\n", + " sha256 subdir \\\n", + "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", + "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", + "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", + "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", + "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... 
Aaron_Patterson \n", + "\n", + " uuid \n", + "0 f03fd921-2d56-4e83-8115-f658d6a72287 \n", + "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb \n", + "2 8221e75c-9537-4a4f-9693-483b445244b4 \n", + "3 a2955610-ed5e-433c-bdd4-e3a72ff44736 \n", + "4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 " + ] + }, + "execution_count": 214, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.drop('newcol', axis=1, errors='ignore')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [], + "source": [ + "a = [1,2,3,4]" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['1.00', '2.00', '3.00', '4.00']" + ] + }, + "execution_count": 220, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "['{:.2f}'.format(x) for x in a]" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": {}, + "outputs": [], + "source": [ + "a = Path('/path/to/file.mp3')" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bc7e9ccf-49ba-4672-b1d8-6880d6b7e251\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "id_to_lookup = 13000\n", + "fp_records = '/data_store_ssd/apps/megapixels/datasets/lfw/records.csv'\n", + "df = pd.read_csv(fp_records)\n", + "row = df.iloc[id_to_lookup]\n", + "print(row['uuid'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |
