{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Append UUID to SHA256 CSV"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import os\n",
"from os.path import join\n",
"import math\n",
"from glob import glob\n",
"from random import randint\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import sys\n",
"sys.path.append('/work/megapixels_dev/megapixels/')\n",
"from app.utils import file_utils"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Root of the local data store; the LFW dataset directory sits beneath it.\n",
"DATA_STORE = '/data_store_ssd/'\n",
"dir_dataset = join(DATA_STORE, 'apps/megapixels/datasets/lfw')\n",
"\n",
"# Records CSV for the dataset (loaded into a DataFrame in the next cell).\n",
"fp_shas = join(dir_dataset, 'records.csv')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" AJ_Cook_0001 | \n",
" 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... | \n",
" AJ_Cook | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" AJ_Lamas_0001 | \n",
" 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... | \n",
" AJ_Lamas | \n",
"
\n",
" \n",
" | 2 | \n",
" jpg | \n",
" Aaron_Eckhart_0001 | \n",
" b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... | \n",
" Aaron_Eckhart | \n",
"
\n",
" \n",
" | 3 | \n",
" jpg | \n",
" Aaron_Guiel_0001 | \n",
" 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... | \n",
" Aaron_Guiel | \n",
"
\n",
" \n",
" | 4 | \n",
" jpg | \n",
" Aaron_Patterson_0001 | \n",
" 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... | \n",
" Aaron_Patterson | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn \\\n",
"0 jpg AJ_Cook_0001 \n",
"1 jpg AJ_Lamas_0001 \n",
"2 jpg Aaron_Eckhart_0001 \n",
"3 jpg Aaron_Guiel_0001 \n",
"4 jpg Aaron_Patterson_0001 \n",
"\n",
" sha256 subdir \n",
"0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n",
"1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n",
"2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n",
"3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n",
"4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the records CSV, then build a {row index: {column: value}} mapping\n",
"# so that extra fields can be attached to each record later on.\n",
"df_records = pd.read_csv(fp_shas)\n",
"records = df_records.to_dict(orient='index')\n",
"df_records.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"import uuid\n",
"import base64\n",
"\n",
"# get a UUID - URL safe, Base64\n",
"def b64uuid():\n",
"    \"\"\"Return a random UUID4 as a URL-safe Base64 string without padding.\n",
"\n",
"    The 16 raw UUID bytes encode to 24 Base64 characters, the last two of\n",
"    which are always '=' padding; stripping them leaves a 22-character token.\n",
"\n",
"    Returns:\n",
"        str: 22-character token drawn from the URL-safe Base64 alphabet.\n",
"    \"\"\"\n",
"    encoded = base64.urlsafe_b64encode(uuid.uuid4().bytes)\n",
"    # urlsafe_b64encode returns bytes on Python 3, so decode before using\n",
"    # str operations (bytes.replace('=', '') raises TypeError).\n",
"    return encoded.decode('ascii').rstrip('=')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"UUID('05ba06b3-875e-429a-ac39-02b129b77d71')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uuid.uuid4()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# give every record its own freshly generated UUID4\n",
"for record in records.values():\n",
"    record['uuid'] = uuid.uuid4()\n",
"\n",
"# write the records (row indices dropped) to a new CSV next to the original\n",
"fp_sha_uuid = join(dir_dataset, 'records_uuid.csv')\n",
"df_uuid = pd.DataFrame.from_dict(list(records.values()))\n",
"df_uuid.to_csv(fp_sha_uuid, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [],
"source": [
"fp = '/data_store_ssd/apps/megapixels/datasets/lfw/embeddings_arr_test.csv'"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(fp)"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['a', 'a', 'a', 'a', 'a']"
]
},
"execution_count": 180,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"['a'] * 5"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" h | \n",
" image_height | \n",
" image_width | \n",
" subdir | \n",
" vec | \n",
" w | \n",
" x | \n",
" y | \n",
" newcol | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" AJ_Cook_0001 | \n",
" 0.330000 | \n",
" 250 | \n",
" 250 | \n",
" AJ_Cook | \n",
" -0.07324773073196411, 0.150736004114151, 0.006... | \n",
" 0.330000 | \n",
" 0.336667 | \n",
" 0.350000 | \n",
" 10 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" AJ_Lamas_0001 | \n",
" 0.393333 | \n",
" 250 | \n",
" 250 | \n",
" AJ_Lamas | \n",
" -0.12234891951084137, 0.06931854784488678, 0.0... | \n",
" 0.393333 | \n",
" 0.286667 | \n",
" 0.313333 | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn h image_height image_width subdir \\\n",
"0 jpg AJ_Cook_0001 0.330000 250 250 AJ_Cook \n",
"1 jpg AJ_Lamas_0001 0.393333 250 250 AJ_Lamas \n",
"\n",
" vec w x \\\n",
"0 -0.07324773073196411, 0.150736004114151, 0.006... 0.330000 0.336667 \n",
"1 -0.12234891951084137, 0.06931854784488678, 0.0... 0.393333 0.286667 \n",
"\n",
" y newcol \n",
"0 0.350000 10 \n",
"1 0.313333 "
]
},
"execution_count": 184,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"128\n"
]
}
],
"source": [
"# Peek at the first row only: split its comma-separated 'vec' string and\n",
"# report the resulting container type and number of components.\n",
"for _, first_row in df.head(1).iterrows():\n",
"    components = first_row['vec'].split(',')\n",
"    print(type(components))\n",
"    print(len(components))"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"fp_out = '/data_store_ssd/apps/megapixels/datasets/lfw/embeddings_arr_test_idx.csv'\n",
"df.to_csv(fp_out)"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [],
"source": [
"# Re-export the records CSV with the DataFrame index written as the first column.\n",
"fp_in = '/data_store_ssd/apps/megapixels/datasets/lfw/records.csv'\n",
"fp_out = '/data_store_ssd/apps/megapixels/datasets/lfw/records_idx.csv'\n",
"\n",
"df = pd.read_csv(fp_in)\n",
"df.to_csv(fp_out, index=True)"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [],
"source": [
"df.loc[(df['fn'] == 'AJ_Cook_0001') & (df['subdir'] == 'AJ_Cook'), 'ext'] = 'wow'"
]
},
{
"cell_type": "code",
"execution_count": 208,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
" uuid | \n",
" newcol | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" wow | \n",
" AJ_Cook_0001 | \n",
" 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... | \n",
" AJ_Cook | \n",
" f03fd921-2d56-4e83-8115-f658d6a72287 | \n",
" 10 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" AJ_Lamas_0001 | \n",
" 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... | \n",
" AJ_Lamas | \n",
" 0c96c5bb-dbd1-4584-bd68-af11664b98bb | \n",
" x | \n",
"
\n",
" \n",
" | 2 | \n",
" jpg | \n",
" Aaron_Eckhart_0001 | \n",
" b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... | \n",
" Aaron_Eckhart | \n",
" 8221e75c-9537-4a4f-9693-483b445244b4 | \n",
" x | \n",
"
\n",
" \n",
" | 3 | \n",
" jpg | \n",
" Aaron_Guiel_0001 | \n",
" 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... | \n",
" Aaron_Guiel | \n",
" a2955610-ed5e-433c-bdd4-e3a72ff44736 | \n",
" x | \n",
"
\n",
" \n",
" | 4 | \n",
" jpg | \n",
" Aaron_Patterson_0001 | \n",
" 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... | \n",
" Aaron_Patterson | \n",
" 1d0782e9-ed16-4550-b1e9-d9c03eef6181 | \n",
" x | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn \\\n",
"0 wow AJ_Cook_0001 \n",
"1 jpg AJ_Lamas_0001 \n",
"2 jpg Aaron_Eckhart_0001 \n",
"3 jpg Aaron_Guiel_0001 \n",
"4 jpg Aaron_Patterson_0001 \n",
"\n",
" sha256 subdir \\\n",
"0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n",
"1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n",
"2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n",
"3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n",
"4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n",
"\n",
" uuid newcol \n",
"0 f03fd921-2d56-4e83-8115-f658d6a72287 10 \n",
"1 0c96c5bb-dbd1-4584-bd68-af11664b98bb x \n",
"2 8221e75c-9537-4a4f-9693-483b445244b4 x \n",
"3 a2955610-ed5e-433c-bdd4-e3a72ff44736 x \n",
"4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 x "
]
},
"execution_count": 208,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Add a scratch column filled with 'x', then overwrite the value in the row labelled 0.\n",
"df['newcol'] = ['x' for _ in range(len(df))]\n",
"df.loc[0, 'newcol'] = '10'\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 214,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
" uuid | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" wow | \n",
" AJ_Cook_0001 | \n",
" 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... | \n",
" AJ_Cook | \n",
" f03fd921-2d56-4e83-8115-f658d6a72287 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" AJ_Lamas_0001 | \n",
" 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... | \n",
" AJ_Lamas | \n",
" 0c96c5bb-dbd1-4584-bd68-af11664b98bb | \n",
"
\n",
" \n",
" | 2 | \n",
" jpg | \n",
" Aaron_Eckhart_0001 | \n",
" b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... | \n",
" Aaron_Eckhart | \n",
" 8221e75c-9537-4a4f-9693-483b445244b4 | \n",
"
\n",
" \n",
" | 3 | \n",
" jpg | \n",
" Aaron_Guiel_0001 | \n",
" 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... | \n",
" Aaron_Guiel | \n",
" a2955610-ed5e-433c-bdd4-e3a72ff44736 | \n",
"
\n",
" \n",
" | 4 | \n",
" jpg | \n",
" Aaron_Patterson_0001 | \n",
" 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... | \n",
" Aaron_Patterson | \n",
" 1d0782e9-ed16-4550-b1e9-d9c03eef6181 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn \\\n",
"0 wow AJ_Cook_0001 \n",
"1 jpg AJ_Lamas_0001 \n",
"2 jpg Aaron_Eckhart_0001 \n",
"3 jpg Aaron_Guiel_0001 \n",
"4 jpg Aaron_Patterson_0001 \n",
"\n",
" sha256 subdir \\\n",
"0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n",
"1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n",
"2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n",
"3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n",
"4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n",
"\n",
" uuid \n",
"0 f03fd921-2d56-4e83-8115-f658d6a72287 \n",
"1 0c96c5bb-dbd1-4584-bd68-af11664b98bb \n",
"2 8221e75c-9537-4a4f-9693-483b445244b4 \n",
"3 a2955610-ed5e-433c-bdd4-e3a72ff44736 \n",
"4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 "
]
},
"execution_count": 214,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Remove the scratch column; errors='ignore' keeps this cell safe to re-run\n",
"# once the column is already gone.\n",
"df = df.drop(columns='newcol', errors='ignore')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {},
"outputs": [],
"source": [
"a = [1,2,3,4]"
]
},
{
"cell_type": "code",
"execution_count": 220,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['1.00', '2.00', '3.00', '4.00']"
]
},
"execution_count": 220,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"['{:.2f}'.format(x) for x in a]"
]
},
{
"cell_type": "code",
"execution_count": 221,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 222,
"metadata": {},
"outputs": [],
"source": [
"a = Path('/path/to/file.mp3')"
]
},
{
"cell_type": "code",
"execution_count": 235,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bc7e9ccf-49ba-4672-b1d8-6880d6b7e251\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Positional (0-based) row number to look up in the records CSV.\n",
"id_to_lookup = 13000\n",
"fp_records = '/data_store_ssd/apps/megapixels/datasets/lfw/records.csv'\n",
"\n",
"df = pd.read_csv(fp_records)\n",
"row = df.iloc[id_to_lookup]  # IndexError if the file has fewer rows than this\n",
"print(row['uuid'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:megapixels]",
"language": "python",
"name": "conda-env-megapixels-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}