{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Append UUID to SHA256 CSV" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "from os.path import join\n", "import math\n", "from glob import glob\n", "from random import randint\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels/')\n", "from app.utils import file_utils" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# Root of the data store; the two paths below are built from it.\n", "DATA_STORE = '/data_store_ssd/'\n", "# Dataset directory ('lfw') inside the data store.\n", "dir_dataset = join(DATA_STORE, 'apps/megapixels/datasets/lfw')\n", "# Records CSV inside the dataset directory; it is read with pd.read_csv() in the next cell.\n", "fp_shas = join(dir_dataset, 'records.csv')" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdir
0jpgAJ_Cook_0001550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...AJ_Cook
1jpgAJ_Lamas_000146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...AJ_Lamas
2jpgAaron_Eckhart_0001b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...Aaron_Eckhart
3jpgAaron_Guiel_0001156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...Aaron_Guiel
4jpgAaron_Patterson_000134dfe798220b53aac910e5e39705770d212cdfbe4be8a4...Aaron_Patterson
\n", "
" ], "text/plain": [ " ext fn \\\n", "0 jpg AJ_Cook_0001 \n", "1 jpg AJ_Lamas_0001 \n", "2 jpg Aaron_Eckhart_0001 \n", "3 jpg Aaron_Guiel_0001 \n", "4 jpg Aaron_Patterson_0001 \n", "\n", " sha256 subdir \n", "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the records CSV and keep a {row index: record dict} mapping for editing\n", "df_records = pd.read_csv(fp_shas)\n", "records = df_records.to_dict('index')\n", "df_records.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "import uuid\n", "import base64\n", "\n", "\n", "def b64uuid():\n", "    \"\"\"Return a random UUID4 as a URL-safe Base64 string without '=' padding.\n", "\n", "    base64.urlsafe_b64encode() returns bytes on Python 3, so the value is\n", "    decoded to str before the padding is stripped; calling\n", "    bytes.replace('=', '') with str arguments raises TypeError.\n", "    \"\"\"\n", "    r_uuid = base64.urlsafe_b64encode(uuid.uuid4().bytes)\n", "    # '=' is not part of the URL-safe alphabet, so it can only occur as trailing padding\n", "    return r_uuid.decode('ascii').rstrip('=')" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "UUID('05ba06b3-875e-429a-ac39-02b129b77d71')" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "uuid.uuid4()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# append a UUID to every entry (each record dict is updated in place)\n", "for record in records.values():\n", "    record['uuid'] = uuid.uuid4()\n", "# save to csv\n", "fp_sha_uuid = join(dir_dataset, 'records_uuid.csv')\n", "# only the record dicts are passed on, so the original row indices are ignored\n", "df_uuid = pd.DataFrame(list(records.values()))\n", "df_uuid.to_csv(fp_sha_uuid, index=False)" ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [], "source": [ "import time\n", "from tqdm import tqdm" ] }, { "cell_type": 
"code", "execution_count": 171, "metadata": {}, "outputs": [], "source": [ "fp = '/data_store_ssd/apps/megapixels/datasets/lfw/embeddings_arr_test.csv'" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(fp)" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['a', 'a', 'a', 'a', 'a']" ] }, "execution_count": 180, "metadata": {}, "output_type": "execute_result" } ], "source": [ "['a'] * 5" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnhimage_heightimage_widthsubdirvecwxynewcol
0jpgAJ_Cook_00010.330000250250AJ_Cook-0.07324773073196411, 0.150736004114151, 0.006...0.3300000.3366670.35000010
1jpgAJ_Lamas_00010.393333250250AJ_Lamas-0.12234891951084137, 0.06931854784488678, 0.0...0.3933330.2866670.313333
\n", "
" ], "text/plain": [ " ext fn h image_height image_width subdir \\\n", "0 jpg AJ_Cook_0001 0.330000 250 250 AJ_Cook \n", "1 jpg AJ_Lamas_0001 0.393333 250 250 AJ_Lamas \n", "\n", " vec w x \\\n", "0 -0.07324773073196411, 0.150736004114151, 0.006... 0.330000 0.336667 \n", "1 -0.12234891951084137, 0.06931854784488678, 0.0... 0.393333 0.286667 \n", "\n", " y newcol \n", "0 0.350000 10 \n", "1 0.313333 " ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "128\n" ] } ], "source": [ "# split the first row's 'vec' string on commas and show the resulting type and length\n", "for idx, row in df.head(1).iterrows():\n", "    vec = row['vec'].split(',')\n", "    print(type(vec))\n", "    print(len(vec))" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [], "source": [ "# write the frame back out; to_csv() includes the index column by default\n", "fp_out = '/data_store_ssd/apps/megapixels/datasets/lfw/embeddings_arr_test_idx.csv'\n", "df.to_csv(fp_out)" ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [], "source": [ "# re-save the records CSV with the row index written as an extra column\n", "fp_in = '/data_store_ssd/apps/megapixels/datasets/lfw/records.csv'\n", "df = pd.read_csv(fp_in)\n", "fp_out = '/data_store_ssd/apps/megapixels/datasets/lfw/records_idx.csv'\n", "df.to_csv(fp_out, index=True)" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [], "source": [ "# overwrite 'ext' on the rows matching both the file name and the subdirectory\n", "is_match = (df['fn'] == 'AJ_Cook_0001') & (df['subdir'] == 'AJ_Cook')\n", "df.loc[is_match, 'ext'] = 'wow'" ] }, { "cell_type": "code", "execution_count": 208, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdiruuidnewcol
0wowAJ_Cook_0001550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...AJ_Cookf03fd921-2d56-4e83-8115-f658d6a7228710
1jpgAJ_Lamas_000146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...AJ_Lamas0c96c5bb-dbd1-4584-bd68-af11664b98bbx
2jpgAaron_Eckhart_0001b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...Aaron_Eckhart8221e75c-9537-4a4f-9693-483b445244b4x
3jpgAaron_Guiel_0001156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...Aaron_Guiela2955610-ed5e-433c-bdd4-e3a72ff44736x
4jpgAaron_Patterson_000134dfe798220b53aac910e5e39705770d212cdfbe4be8a4...Aaron_Patterson1d0782e9-ed16-4550-b1e9-d9c03eef6181x
\n", "
" ], "text/plain": [ " ext fn \\\n", "0 wow AJ_Cook_0001 \n", "1 jpg AJ_Lamas_0001 \n", "2 jpg Aaron_Eckhart_0001 \n", "3 jpg Aaron_Guiel_0001 \n", "4 jpg Aaron_Patterson_0001 \n", "\n", " sha256 subdir \\\n", "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n", "\n", " uuid newcol \n", "0 f03fd921-2d56-4e83-8115-f658d6a72287 10 \n", "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb x \n", "2 8221e75c-9537-4a4f-9693-483b445244b4 x \n", "3 a2955610-ed5e-433c-bdd4-e3a72ff44736 x \n", "4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 x " ] }, "execution_count": 208, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# add a column holding 'x' in every row, then overwrite the value at index label 0\n", "placeholder = ['x' for _ in range(len(df))]\n", "df['newcol'] = placeholder\n", "df.at[0, 'newcol'] = '10'\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 214, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdiruuid
0wowAJ_Cook_0001550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...AJ_Cookf03fd921-2d56-4e83-8115-f658d6a72287
1jpgAJ_Lamas_000146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...AJ_Lamas0c96c5bb-dbd1-4584-bd68-af11664b98bb
2jpgAaron_Eckhart_0001b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...Aaron_Eckhart8221e75c-9537-4a4f-9693-483b445244b4
3jpgAaron_Guiel_0001156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...Aaron_Guiela2955610-ed5e-433c-bdd4-e3a72ff44736
4jpgAaron_Patterson_000134dfe798220b53aac910e5e39705770d212cdfbe4be8a4...Aaron_Patterson1d0782e9-ed16-4550-b1e9-d9c03eef6181
\n", "
" ], "text/plain": [ " ext fn \\\n", "0 wow AJ_Cook_0001 \n", "1 jpg AJ_Lamas_0001 \n", "2 jpg Aaron_Eckhart_0001 \n", "3 jpg Aaron_Guiel_0001 \n", "4 jpg Aaron_Patterson_0001 \n", "\n", " sha256 subdir \\\n", "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n", "\n", " uuid \n", "0 f03fd921-2d56-4e83-8115-f658d6a72287 \n", "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb \n", "2 8221e75c-9537-4a4f-9693-483b445244b4 \n", "3 a2955610-ed5e-433c-bdd4-e3a72ff44736 \n", "4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 " ] }, "execution_count": 214, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# remove the 'newcol' column if it exists; errors='ignore' makes a re-run a no-op\n", "df = df.drop(labels='newcol', axis='columns', errors='ignore')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 218, "metadata": {}, "outputs": [], "source": [ "a = [1, 2, 3, 4]" ] }, { "cell_type": "code", "execution_count": 220, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['1.00', '2.00', '3.00', '4.00']" ] }, "execution_count": 220, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# format every number with two decimal places\n", "list(map('{:.2f}'.format, a))" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": 222, "metadata": {}, "outputs": [], "source": [ "a = Path('/path/to/file.mp3')" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bc7e9ccf-49ba-4672-b1d8-6880d6b7e251\n" ] } ], "source": [ "import pandas as pd\n", "\n", "# read the records CSV and print the 'uuid' value stored at one row position\n", "fp_records = '/data_store_ssd/apps/megapixels/datasets/lfw/records.csv'\n", "id_to_lookup = 13000\n", "df = pd.read_csv(fp_records)\n", "row = df.iloc[id_to_lookup]\n", 
"print(row.loc['uuid'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }