{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Append UUID to SHA256 CSV\n", "\n", "Adds an `idx` column to `rois.csv` that holds the row index of the matching image in `records.csv` (matched on `subdir` + `fn`), then writes a slimmed-down copy to `rois_min.csv`." ] },
 { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import os\n", "from os.path import join\n", "import math\n", "from glob import glob\n", "from random import randint\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "from tqdm import tqdm\n", "\n", "import sys\n", "sys.path.append('/work/megapixels_dev/megapixels/')\n", "from app.utils import file_utils" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "DATA_STORE = '/data_store_ssd/'\n", "dir_dataset = join(DATA_STORE, 'apps/megapixels/datasets/lfw')\n", "fp_shas = join(dir_dataset, 'records.csv')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# One name per file: reusing a single fp_in for both inputs meant the\n", "# records path was overwritten before it was ever read.\n", "fp_records = join(dir_dataset, 'records.csv')\n", "fp_rois = join(dir_dataset, 'rois.csv')\n", "fp_out = join(dir_dataset, 'rois_min.csv')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_records = pd.read_csv(fp_records)\n", "df_rois = pd.read_csv(fp_rois)\n", "nrecords = len(df_records)\n", "nrois = len(df_rois)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# lookup table: image key (subdir, fn) -> row index in records\n", "# NOTE(review): assumes records.csv has an 'fn' column like rois.csv; the assert below fails loudly if not\n", "key_cols = ['subdir', 'fn']\n", "missing_cols = [c for c in key_cols if c not in df_records.columns]\n", "assert not missing_cols, f'records.csv lacks key columns: {missing_cols}'\n", "record_idx = (\n", "    df_records[key_cols]\n", "    .drop_duplicates()  # first record per key wins, like index[0] did\n", "    .reset_index()\n", "    .rename(columns={'index': 'idx'})\n", ")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Add the record index to every roi. An image is identified by (subdir, fn);\n", "# matching on subdir alone points every image of a person at that person's\n", "# first record (visible in the stored head(20) output below, which predates\n", "# this fix: all Aaron_Peirsol images show idx 5). Re-run to refresh it.\n", "df_rois = (\n", "    df_rois\n", "    .drop(columns=['idx'], errors='ignore')  # keeps the cell re-runnable\n", "    .merge(record_idx, on=key_cols, how='left', validate='many_to_one')\n", ")\n", "unmatched = df_rois['idx'].isna()\n", "assert not unmatched.any(), f'{unmatched.sum()} rois have no matching record'\n", "df_rois['idx'] = df_rois['idx'].astype(int)\n", "assert len(df_rois) == nrois" ] },
 { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnhimage_heightimage_widthsubdirwxyidx
0jpgAJ_Cook_00010.330000250250AJ_Cook0.3300000.3366670.3500000
1jpgAJ_Lamas_00010.393333250250AJ_Lamas0.3933330.2866670.3133331
2jpgAaron_Eckhart_00010.393333250250Aaron_Eckhart0.3933330.2866670.2733332
3jpgAaron_Guiel_00010.393333250250Aaron_Guiel0.3933330.2866670.3133333
4jpgAaron_Patterson_00010.393333250250Aaron_Patterson0.3933330.2866670.2733334
5jpgAaron_Peirsol_00010.393333250250Aaron_Peirsol0.3933330.2866670.3133335
6jpgAaron_Peirsol_00020.393333250250Aaron_Peirsol0.3933330.2866670.3133335
7jpgAaron_Peirsol_00030.326667250250Aaron_Peirsol0.3300000.3366670.3200005
8jpgAaron_Peirsol_00040.330000250250Aaron_Peirsol0.3300000.3366670.3500005
9jpgAaron_Pena_00010.393333250250Aaron_Pena0.3933330.3266670.2733339
10jpgAaron_Sorkin_00010.393333250250Aaron_Sorkin0.3933330.2466670.31333310
11jpgAaron_Sorkin_00020.393333250250Aaron_Sorkin0.3933330.2866670.31333310
12jpgAaron_Tippin_00010.330000250250Aaron_Tippin0.3300000.2700000.35000012
13jpgAbba_Eban_00010.393333250250Abba_Eban0.3933330.2866670.31333313
14jpgAbbas_Kiarostami_00010.330000250250Abbas_Kiarostami0.3300000.3033330.35000014
15jpgAbdel_Aziz_Al-Hakim_00010.330000250250Abdel_Aziz_Al-Hakim0.3300000.3033330.35000015
16jpgAbdel_Aziz_Al-Hakim_00010.270000250250Abdel_Aziz_Al-Hakim0.2733330.6733330.37666715
17jpgAbdel_Madi_Shabneh_00010.393333250250Abdel_Madi_Shabneh0.3933330.2466670.31333316
18jpgAbdel_Nasser_Assidi_00010.393333250250Abdel_Nasser_Assidi0.3933330.2866670.27333317
19jpgAbdel_Nasser_Assidi_00010.190000250250Abdel_Nasser_Assidi0.1900000.7533330.44666717
\n", "
" ], "text/plain": [ " ext fn h image_height image_width \\\n", "0 jpg AJ_Cook_0001 0.330000 250 250 \n", "1 jpg AJ_Lamas_0001 0.393333 250 250 \n", "2 jpg Aaron_Eckhart_0001 0.393333 250 250 \n", "3 jpg Aaron_Guiel_0001 0.393333 250 250 \n", "4 jpg Aaron_Patterson_0001 0.393333 250 250 \n", "5 jpg Aaron_Peirsol_0001 0.393333 250 250 \n", "6 jpg Aaron_Peirsol_0002 0.393333 250 250 \n", "7 jpg Aaron_Peirsol_0003 0.326667 250 250 \n", "8 jpg Aaron_Peirsol_0004 0.330000 250 250 \n", "9 jpg Aaron_Pena_0001 0.393333 250 250 \n", "10 jpg Aaron_Sorkin_0001 0.393333 250 250 \n", "11 jpg Aaron_Sorkin_0002 0.393333 250 250 \n", "12 jpg Aaron_Tippin_0001 0.330000 250 250 \n", "13 jpg Abba_Eban_0001 0.393333 250 250 \n", "14 jpg Abbas_Kiarostami_0001 0.330000 250 250 \n", "15 jpg Abdel_Aziz_Al-Hakim_0001 0.330000 250 250 \n", "16 jpg Abdel_Aziz_Al-Hakim_0001 0.270000 250 250 \n", "17 jpg Abdel_Madi_Shabneh_0001 0.393333 250 250 \n", "18 jpg Abdel_Nasser_Assidi_0001 0.393333 250 250 \n", "19 jpg Abdel_Nasser_Assidi_0001 0.190000 250 250 \n", "\n", " subdir w x y idx \n", "0 AJ_Cook 0.330000 0.336667 0.350000 0 \n", "1 AJ_Lamas 0.393333 0.286667 0.313333 1 \n", "2 Aaron_Eckhart 0.393333 0.286667 0.273333 2 \n", "3 Aaron_Guiel 0.393333 0.286667 0.313333 3 \n", "4 Aaron_Patterson 0.393333 0.286667 0.273333 4 \n", "5 Aaron_Peirsol 0.393333 0.286667 0.313333 5 \n", "6 Aaron_Peirsol 0.393333 0.286667 0.313333 5 \n", "7 Aaron_Peirsol 0.330000 0.336667 0.320000 5 \n", "8 Aaron_Peirsol 0.330000 0.336667 0.350000 5 \n", "9 Aaron_Pena 0.393333 0.326667 0.273333 9 \n", "10 Aaron_Sorkin 0.393333 0.246667 0.313333 10 \n", "11 Aaron_Sorkin 0.393333 0.286667 0.313333 10 \n", "12 Aaron_Tippin 0.330000 0.270000 0.350000 12 \n", "13 Abba_Eban 0.393333 0.286667 0.313333 13 \n", "14 Abbas_Kiarostami 0.330000 0.303333 0.350000 14 \n", "15 Abdel_Aziz_Al-Hakim 0.330000 0.303333 0.350000 15 \n", "16 Abdel_Aziz_Al-Hakim 0.273333 0.673333 0.376667 15 \n", "17 Abdel_Madi_Shabneh 0.393333 0.246667 0.313333 
16 \n", "18 Abdel_Nasser_Assidi 0.393333 0.286667 0.273333 17 \n", "19 Abdel_Nasser_Assidi 0.190000 0.753333 0.446667 17 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rois.head(20)" ] },
 { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'AJ_Cook'" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_records.iloc[0]['subdir']" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# drop the columns that records already carries; build a new frame so\n", "# df_rois (displayed above) is not mutated and this cell can be re-run\n", "df_rois_min = df_rois.drop(columns=['fn', 'subdir', 'ext'], errors='ignore')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# save\n", "df_rois_min.to_csv(fp_out, index=False)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# same dataset directory as the csv files (dir_dataset is set in the config cell)\n", "fp_uuid_vecs = join(dir_dataset, 'lfw_uuid_vecs.json')" ] },
 { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "import json" ] },
 { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "with open(fp_uuid_vecs, 'r') as fp:\n", " data = json.load(fp)" ] },
 { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "00147121-7445-45bc-ae11-c518f018a700 128\n" ] } ], "source": [ "for k,v in data.items():\n", " print(k, ' ', len(v))\n", " break" ] },
 { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "import difflib" ] },
 { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6451612903225806" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# similarity of two lower-cased names; ratio() is a float in [0, 1], 1.0 = identical\n", "a = 'Aleksandar Petrović'\n", "b = 'Aco Petrović'\n", "seq = difflib.SequenceMatcher(a=a.lower(), b=b.lower())\n", "seq.ratio()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:megapixels]", "language": "python", "name": "conda-env-megapixels-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }