{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split CSVs into multiple files\n",
    "\n",
    "Splits the LFW `records.csv` into one CSV per field (`index.csv`, `uuids.csv`) and builds `identity_meta.csv` from the knowledge-graph name table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "%matplotlib inline\n",
    "\n",
    "import difflib\n",
    "import math\n",
    "import os\n",
    "import sys\n",
    "from glob import glob\n",
    "from os.path import join\n",
    "from pathlib import Path\n",
    "from random import randint\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# the project package is not installed; make it importable before the local import\n",
    "sys.path.append('/work/megapixels_dev/megapixels/')\n",
    "from app.utils import file_utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# storage roots and the dataset directory relative to each root\n",
    "DATA_STORE_NAS = '/data_store_nas/'\n",
    "DATA_STORE_SSD = '/data_store_ssd/'\n",
    "dir_dataset_ssd = 'apps/megapixels/datasets/lfw/'\n",
    "dir_dataset_nas = 'datasets/people/lfw'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# every CSV read or written by this notebook lives in the NAS dataset directory\n",
    "dir_lfw_nas = join(DATA_STORE_NAS, dir_dataset_nas)\n",
    "\n",
    "fp_records = join(dir_lfw_nas, 'records.csv')\n",
    "fp_index = join(dir_lfw_nas, 'index.csv')\n",
    "fp_sha256s = join(dir_lfw_nas, 'sha256s.csv')\n",
    "fp_uuids = join(dir_lfw_nas, 'uuids.csv')\n",
    "fp_rois = join(dir_lfw_nas, 'rois.csv')\n",
    "fp_names_gender_kg = join(dir_lfw_nas, 'lfw_names_gender_kg.csv')\n",
    "fp_identity_meta = join(dir_lfw_nas, 'identity_meta.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "def similarity(a, b):\n",
    "  \"\"\"Return the case-insensitive similarity of two strings.\n",
    "\n",
    "  Both arguments are lower-cased and compared with `difflib.SequenceMatcher`;\n",
    "  the result is its `ratio()`, a float between 0.0 and 1.0.\n",
    "  \"\"\"\n",
    "  return difflib.SequenceMatcher(a=a.lower(), b=b.lower()).ratio()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# records.csv carries its own `index` column; use it as the frame index\n",
    "df_records = pd.read_csv(fp_records, index_col='index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create index.csv, `index, sha256`\n",
    "# select the column explicitly so the file holds exactly the documented schema\n",
    "df_records[['sha256']].to_csv(fp_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create uuids.csv, `index, uuid`\n",
    "# select the column explicitly so the file holds exactly the documented schema\n",
    "df_records[['uuid']].to_csv(fp_uuids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Note:** `identity_meta.csv` is not derived from `records.csv`. It is written further down from `df_kg_new`, after the knowledge-graph names have been matched to their `records.csv` index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
extfnsha256subdiruuid
index
0jpgAJ_Cook_0001550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...AJ_Cookf03fd921-2d56-4e83-8115-f658d6a72287
1jpgAJ_Lamas_000146d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...AJ_Lamas0c96c5bb-dbd1-4584-bd68-af11664b98bb
2jpgAaron_Eckhart_0001b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...Aaron_Eckhart8221e75c-9537-4a4f-9693-483b445244b4
3jpgAaron_Guiel_0001156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...Aaron_Guiela2955610-ed5e-433c-bdd4-e3a72ff44736
4jpgAaron_Patterson_000134dfe798220b53aac910e5e39705770d212cdfbe4be8a4...Aaron_Patterson1d0782e9-ed16-4550-b1e9-d9c03eef6181
\n", "
" ],
      "text/plain": [
       " ext fn \\\n",
       "index \n",
       "0 jpg AJ_Cook_0001 \n",
       "1 jpg AJ_Lamas_0001 \n",
       "2 jpg Aaron_Eckhart_0001 \n",
       "3 jpg Aaron_Guiel_0001 \n",
       "4 jpg Aaron_Patterson_0001 \n",
       "\n",
       " sha256 subdir \\\n",
       "index \n",
       "0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n",
       "1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n",
       "2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n",
       "3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n",
       "4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n",
       "\n",
       " uuid \n",
       "index \n",
       "0 f03fd921-2d56-4e83-8115-f658d6a72287 \n",
       "1 0c96c5bb-dbd1-4584-bd68-af11664b98bb \n",
       "2 8221e75c-9537-4a4f-9693-483b445244b4 \n",
       "3 a2955610-ed5e-433c-bdd4-e3a72ff44736 \n",
       "4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 "
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# preview the first five rows of the records table\n",
    "df_records.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
descriptiongenderimagesnamename_kgscoreurlname_new
0Canadian actressf1AJ CookA. J. Cook274.554810http://www.ajcookofficial.com
1American actorm1AJ LamasAJ Lamas389.547211NaN
\n", "
" ],
      "text/plain": [
       " description gender images name name_kg score \\\n",
       "0 Canadian actress f 1 AJ Cook A. J. Cook 274.554810 \n",
       "1 American actor m 1 AJ Lamas AJ Lamas 389.547211 \n",
       "\n",
       " url name_new \n",
       "0 http://www.ajcookofficial.com \n",
       "1 NaN "
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load the knowledge-graph name/gender table\n",
    "df_kg = pd.read_csv(fp_names_gender_kg)\n",
    "# work on a copy that gains an empty `name_new` column; df_kg itself stays untouched\n",
    "df_kg_new = df_kg.assign(name_new='')\n",
    "df_kg_new.head(n=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
descriptiongenderimagesnamename_kgscoreurlname_new
0Canadian actressf1AJ CookA. J. Cook274.554810http://www.ajcookofficial.comA. J. Cook
1American actorm1AJ LamasAJ Lamas389.547211NaNAJ Lamas
\n", "
" ],
      "text/plain": [
       " description gender images name name_kg score \\\n",
       "0 Canadian actress f 1 AJ Cook A. J. Cook 274.554810 \n",
       "1 American actor m 1 AJ Lamas AJ Lamas 389.547211 \n",
       "\n",
       " url name_new \n",
       "0 http://www.ajcookofficial.com A. J. Cook \n",
       "1 NaN AJ Lamas "
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Correct the original LFW names using the knowledge-graph name (`name_kg`)\n",
    "# whenever the two spellings are close enough to be the same person.\n",
    "SIMILARITY_THRESHOLD = .75  # similarity() ratio above which `name_kg` replaces `name`\n",
    "\n",
    "def pick_name(row):\n",
    "  \"\"\"Return `name_kg` when it is present and similar to `name`, else `name`.\"\"\"\n",
    "  name_kg = row.get('name_kg')\n",
    "  # a missing name_kg is NaN; str(NaN) == 'nan' must never be compared as if it were a name\n",
    "  if pd.isna(name_kg) or not str(name_kg):\n",
    "    return row['name']\n",
    "  if similarity(str(row['name']), str(name_kg)) > SIMILARITY_THRESHOLD:\n",
    "    return name_kg\n",
    "  return row['name']\n",
    "\n",
    "# list comprehension (not DataFrame.apply) so an empty frame still yields a plain column\n",
    "df_kg_new['name_new'] = [pick_name(row) for _, row in df_kg_new.iterrows()]\n",
    "\n",
    "df_kg_new.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "# placeholder column; a later cell fills it with the matching records index\n",
    "df_kg_new['index'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
descriptiongenderimagesnamename_kgname_new
0Canadian actressf1AJ CookA. J. CookA. J. Cook
1American actorm1AJ LamasAJ LamasAJ Lamas
\n", "
" ],
      "text/plain": [
       " description gender images name name_kg name_new\n",
       "0 Canadian actress f 1 AJ Cook A. J. Cook A. J. Cook\n",
       "1 American actor m 1 AJ Lamas AJ Lamas AJ Lamas"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop extra info: the knowledge-graph `score` and `url` columns\n",
    "df_kg_new = df_kg_new.drop(columns=['score', 'url'])\n",
    "df_kg_new.head(n=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5749/5749 [00:05<00:00, 1006.44it/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
descriptiongenderimagesnamename_kgname_newindex
0Canadian actressf1AJ CookA. J. CookA. J. Cook0
1American actorm1AJ LamasAJ LamasAJ Lamas1
\n", "
" ],
      "text/plain": [
       " description gender images name name_kg name_new index\n",
       "0 Canadian actress f 1 AJ Cook A. J. Cook A. J. Cook 0\n",
       "1 American actor m 1 AJ Lamas AJ Lamas AJ Lamas 1"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add index column: for each identity, the index of its first row in records.csv\n",
    "# build one subdir -> first index lookup instead of scanning df_records once per identity\n",
    "first_index_by_subdir = df_records.reset_index().groupby('subdir')['index'].first()\n",
    "\n",
    "# the original LFW name with spaces replaced by underscores is the records `subdir`\n",
    "subdirs = df_kg_new['name'].str.replace(' ', '_')\n",
    "matched = subdirs.map(first_index_by_subdir)\n",
    "\n",
    "# fail with the offending names rather than a bare IndexError\n",
    "missing = df_kg_new.loc[matched.isna(), 'name']\n",
    "assert missing.empty, 'no records.csv row for: {}'.format(missing.tolist()[:10])\n",
    "\n",
    "df_kg_new['index'] = matched.astype(int)\n",
    "df_kg_new.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the corrected `name_new` supersedes both source name columns\n",
    "df_kg_new = df_kg_new.drop(columns=['name', 'name_kg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_kg_new = df_kg_new.rename(columns={'name_new': 'name'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the records index is already stored in the `index` column, so skip the frame index\n",
    "df_kg_new.to_csv(fp_identity_meta, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert ROIs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
himage_heightimage_widthwxy
index
00.3300002502500.3300000.3366670.350000
10.3933332502500.3933330.2866670.313333
\n", "
" ],
      "text/plain": [
       " h image_height image_width w x y\n",
       "index \n",
       "0 0.330000 250 250 0.330000 0.336667 0.350000\n",
       "1 0.393333 250 250 0.393333 0.286667 0.313333"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rois.csv carries its own `index` column; use it as the frame index\n",
    "df_rois = pd.read_csv(fp_rois, index_col='index')\n",
    "df_rois.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read back the `index, sha256` file written earlier in this notebook\n",
    "df_index = pd.read_csv(fp_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "# spot check: find one identity by name, then fetch its row from index.csv\n",
    "row = df_kg_new.loc[(df_kg_new['name'] == 'B.B. King')]\n",
    "# positional lookup using the identity's stored records index\n",
    "row = df_index.iloc[row['index']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7cf753f9e1256e433901a262030f4d184afb4002b49e6b1c7a2d59d07306c2ff\n"
     ]
    }
   ],
   "source": [
    "sha = row['sha256']\n",
    "print(sha.values[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 14399/14399 [00:15<00:00, 914.63it/s]\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): leftover cell. The saved progress bar (14399 rows) and the `subdir`\n",
    "# lookup suggest it was written against a different frame than df_kg - confirm before use.\n",
    "limit = 10\n",
    "for idx, row in tqdm(df_kg[:limit].iterrows(), total=len(df_kg[:limit])):\n",
    "  subdir = row['subdir']\n",
    "  # first records row whose subdir matches\n",
    "  row_match = df_records.loc[(df_records['subdir'] == subdir)]\n",
    "  df_records.at[idx, 'idx'] = int(row_match.index[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `df_identity_meta` is not assigned in any cell of this notebook, and\n",
    "# identity_meta.csv is already written from df_kg_new above - this cell looks obsolete; confirm.\n",
    "df_identity_meta = df_identity_meta.drop(['ext', 'url', 'score'], axis=1)\n",
    "df_identity_meta.to_csv(fp_identity_meta)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:megapixels]",
   "language": "python",
   "name": "conda-env-megapixels-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": 
".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }