From 49a49bebe3f972e93add837180f5672a4ae62ce0 Mon Sep 17 00:00:00 2001 From: adamhrv Date: Thu, 13 Dec 2018 14:33:05 +0100 Subject: new nbs --- megapixels/notebooks/datasets/sha_uuid.ipynb | 728 +++++++++++++++++++++++++++ 1 file changed, 728 insertions(+) create mode 100644 megapixels/notebooks/datasets/sha_uuid.ipynb (limited to 'megapixels/notebooks/datasets/sha_uuid.ipynb') diff --git a/megapixels/notebooks/datasets/sha_uuid.ipynb b/megapixels/notebooks/datasets/sha_uuid.ipynb new file mode 100644 index 00000000..21f59722 --- /dev/null +++ b/megapixels/notebooks/datasets/sha_uuid.ipynb @@ -0,0 +1,728 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Append UUID to SHA256 CSV" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import os\n", + "from os.path import join\n", + "import math\n", + "from glob import glob\n", + "from random import randint\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import sys\n", + "sys.path.append('/work/megapixels_dev/megapixels/')\n", + "from app.utils import file_utils" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_STORE = '/data_store_ssd/'\n", + "dir_dataset = join(DATA_STORE, 'apps/megapixels/datasets/lfw')\n", + "fp_shas = join(dir_dataset, 'records.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsha256subdir
0jpgAJ_Cook_0001550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...AJ_Cook
1jpgAJ_Lamas_000146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...AJ_Lamas
2jpgAaron_Eckhart_0001b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...Aaron_Eckhart
3jpgAaron_Guiel_0001156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...Aaron_Guiel
4jpgAaron_Patterson_000134dfe798220b53aac910e5e39705770d212cdfbe4be8a4...Aaron_Patterson
\n", + "
" + ], + "text/plain": [ + " ext fn \\\n", + "0 jpg AJ_Cook_0001 \n", + "1 jpg AJ_Lamas_0001 \n", + "2 jpg Aaron_Eckhart_0001 \n", + "3 jpg Aaron_Guiel_0001 \n", + "4 jpg Aaron_Patterson_0001 \n", + "\n", + " sha256 subdir \n", + "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", + "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", + "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", + "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", + "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load names\n", + "df_records = pd.read_csv(fp_shas)\n", + "records = df_records.to_dict('index')\n", + "df_records.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import base64\n", + "\n", + "# get a UUID - URL safe, Base64\n", + "def b64uuid():\n", + " r_uuid = base64.urlsafe_b64encode(uuid.uuid4().bytes)\n", + " print(r_uuid)\n", + " return r_uuid.replace('=', '')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "UUID('05ba06b3-875e-429a-ac39-02b129b77d71')" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uuid.uuid4()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# append a UUID to every entry\n", + "for idx, item in records.items():\n", + " records[idx]['uuid'] = uuid.uuid4()\n", + "# save to csv\n", + "fp_sha_uuid = join(dir_dataset, 'records_uuid.csv')\n", + "df_uuid = pd.DataFrame.from_dict(list(records.values())) # ignore the indices\n", + "df_uuid.to_csv(fp_sha_uuid, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [], + "source": [ + "fp = '/data_store_ssd/apps/megapixels/datasets/lfw/embeddings_arr_test.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(fp)" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['a', 'a', 'a', 'a', 'a']" + ] + }, + "execution_count": 180, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "['a'] * 5" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnhimage_heightimage_widthsubdirvecwxynewcol
0jpgAJ_Cook_00010.330000250250AJ_Cook-0.07324773073196411, 0.150736004114151, 0.006...0.3300000.3366670.35000010
1jpgAJ_Lamas_00010.393333250250AJ_Lamas-0.12234891951084137, 0.06931854784488678, 0.0...0.3933330.2866670.313333
\n", + "
" + ], + "text/plain": [ + " ext fn h image_height image_width subdir \\\n", + "0 jpg AJ_Cook_0001 0.330000 250 250 AJ_Cook \n", + "1 jpg AJ_Lamas_0001 0.393333 250 250 AJ_Lamas \n", + "\n", + " vec w x \\\n", + "0 -0.07324773073196411, 0.150736004114151, 0.006... 0.330000 0.336667 \n", + "1 -0.12234891951084137, 0.06931854784488678, 0.0... 0.393333 0.286667 \n", + "\n", + " y newcol \n", + "0 0.350000 10 \n", + "1 0.313333 " + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "128\n" + ] + } + ], + "source": [ + "for idx, row in df.iterrows():\n", + " vec = row['vec'].split(',')\n", + " print(type(vec))\n", + " print(len(vec))\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [], + "source": [ + "fp_out = '/data_store_ssd/apps/megapixels/datasets/lfw/embeddings_arr_test_idx.csv'\n", + "df.to_csv(fp_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [], + "source": [ + "fp_in = '/data_store_ssd/apps/megapixels/datasets/lfw/records.csv'\n", + "fp_out = '/data_store_ssd/apps/megapixels/datasets/lfw/records_idx.csv'\n", + "df = pd.read_csv(fp_in)\n", + "df.to_csv(fp_out, index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[(df['fn'] == 'AJ_Cook_0001') & (df['subdir'] == 'AJ_Cook'), 'ext'] = 'wow'" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsha256subdiruuidnewcol
0wowAJ_Cook_0001550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...AJ_Cookf03fd921-2d56-4e83-8115-f658d6a7228710
1jpgAJ_Lamas_000146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...AJ_Lamas0c96c5bb-dbd1-4584-bd68-af11664b98bbx
2jpgAaron_Eckhart_0001b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...Aaron_Eckhart8221e75c-9537-4a4f-9693-483b445244b4x
3jpgAaron_Guiel_0001156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...Aaron_Guiela2955610-ed5e-433c-bdd4-e3a72ff44736x
4jpgAaron_Patterson_000134dfe798220b53aac910e5e39705770d212cdfbe4be8a4...Aaron_Patterson1d0782e9-ed16-4550-b1e9-d9c03eef6181x
\n", + "
" + ], + "text/plain": [ + " ext fn \\\n", + "0 wow AJ_Cook_0001 \n", + "1 jpg AJ_Lamas_0001 \n", + "2 jpg Aaron_Eckhart_0001 \n", + "3 jpg Aaron_Guiel_0001 \n", + "4 jpg Aaron_Patterson_0001 \n", + "\n", + " sha256 subdir \\\n", + "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", + "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", + "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", + "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", + "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n", + "\n", + " uuid newcol \n", + "0 f03fd921-2d56-4e83-8115-f658d6a72287 10 \n", + "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb x \n", + "2 8221e75c-9537-4a4f-9693-483b445244b4 x \n", + "3 a2955610-ed5e-433c-bdd4-e3a72ff44736 x \n", + "4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 x " + ] + }, + "execution_count": 208, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['newcol'] = ['x'] * len(df)\n", + "df.at[0, 'newcol'] = '10'\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
extfnsha256subdiruuid
0wowAJ_Cook_0001550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...AJ_Cookf03fd921-2d56-4e83-8115-f658d6a72287
1jpgAJ_Lamas_000146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...AJ_Lamas0c96c5bb-dbd1-4584-bd68-af11664b98bb
2jpgAaron_Eckhart_0001b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...Aaron_Eckhart8221e75c-9537-4a4f-9693-483b445244b4
3jpgAaron_Guiel_0001156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...Aaron_Guiela2955610-ed5e-433c-bdd4-e3a72ff44736
4jpgAaron_Patterson_000134dfe798220b53aac910e5e39705770d212cdfbe4be8a4...Aaron_Patterson1d0782e9-ed16-4550-b1e9-d9c03eef6181
\n", + "
" + ], + "text/plain": [ + " ext fn \\\n", + "0 wow AJ_Cook_0001 \n", + "1 jpg AJ_Lamas_0001 \n", + "2 jpg Aaron_Eckhart_0001 \n", + "3 jpg Aaron_Guiel_0001 \n", + "4 jpg Aaron_Patterson_0001 \n", + "\n", + " sha256 subdir \\\n", + "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n", + "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n", + "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n", + "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n", + "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n", + "\n", + " uuid \n", + "0 f03fd921-2d56-4e83-8115-f658d6a72287 \n", + "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb \n", + "2 8221e75c-9537-4a4f-9693-483b445244b4 \n", + "3 a2955610-ed5e-433c-bdd4-e3a72ff44736 \n", + "4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 " + ] + }, + "execution_count": 214, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.drop('newcol', axis=1, errors='ignore')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [], + "source": [ + "a = [1,2,3,4]" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['1.00', '2.00', '3.00', '4.00']" + ] + }, + "execution_count": 220, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "['{:.2f}'.format(x) for x in a]" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": {}, + "outputs": [], + "source": [ + "a = Path('/path/to/file.mp3')" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bc7e9ccf-49ba-4672-b1d8-6880d6b7e251\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "id_to_lookup = 13000\n", + "fp_records = '/data_store_ssd/apps/megapixels/datasets/lfw/records.csv'\n", + "df = pd.read_csv(fp_records)\n", + "row = df.iloc[id_to_lookup]\n", + "print(row['uuid'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:megapixels]", + "language": "python", + "name": "conda-env-megapixels-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- cgit v1.2.3-70-g09d2