{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Split CSVs into multiple files"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import os\n",
"from os.path import join\n",
"import math\n",
"from glob import glob\n",
"from random import randint\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"from tqdm import tqdm\n",
"\n",
"import sys\n",
"sys.path.append('/work/megapixels_dev/megapixels/')\n",
"from app.utils import file_utils"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"DATA_STORE_NAS = '/data_store_nas/'\n",
"DATA_STORE_SSD = '/data_store_ssd/'\n",
"dir_dataset_ssd = 'apps/megapixels/datasets/lfw/'\n",
"dir_dataset_nas = 'datasets/people/lfw'"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"fp_records = join(DATA_STORE_NAS, dir_dataset_nas, 'records.csv')\n",
"fp_index = join(DATA_STORE_NAS, dir_dataset_nas, 'index.csv')\n",
"fp_sha256s = join(DATA_STORE_NAS, dir_dataset_nas, 'sha256s.csv')\n",
"fp_uuids = join(DATA_STORE_NAS, dir_dataset_nas, 'uuids.csv')\n",
"fp_rois = join(DATA_STORE_NAS, dir_dataset_nas, 'rois.csv')\n",
"fp_names_gender_kg = join(DATA_STORE_NAS, dir_dataset_nas, 'lfw_names_gender_kg.csv')\n",
"fp_identity_meta = join(DATA_STORE_NAS, dir_dataset_nas, 'identity_meta.csv')"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"import difflib\n",
"def similarity(a, b):\n",
" seq = difflib.SequenceMatcher(a=a.lower(), b=b.lower())\n",
" return seq.ratio()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"df_records = pd.read_csv(fp_records).set_index('index')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Create index.csv, `index, sha256`\n",
"# drop old columns\n",
"df_records.drop(['ext', 'fn', 'subdir', 'uuid'], axis=1).to_csv(fp_index)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# Create uuids.csv, `index, uuid`\n",
"# drop old columns\n",
"df_records.drop(['ext', 'fn', 'subdir', 'sha256'], axis=1).to_csv(fp_uuids)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create identity_meta.csv, `index, name, description, gender`\n",
"# drop old columns\n",
"df_records.drop(['ext', 'fn', 'subdir', 'sha256'], axis=1).to_csv(fp_uuids)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext | \n",
" fn | \n",
" sha256 | \n",
" subdir | \n",
" uuid | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" jpg | \n",
" AJ_Cook_0001 | \n",
" 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... | \n",
" AJ_Cook | \n",
" f03fd921-2d56-4e83-8115-f658d6a72287 | \n",
"
\n",
" \n",
" | 1 | \n",
" jpg | \n",
" AJ_Lamas_0001 | \n",
" 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... | \n",
" AJ_Lamas | \n",
" 0c96c5bb-dbd1-4584-bd68-af11664b98bb | \n",
"
\n",
" \n",
" | 2 | \n",
" jpg | \n",
" Aaron_Eckhart_0001 | \n",
" b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... | \n",
" Aaron_Eckhart | \n",
" 8221e75c-9537-4a4f-9693-483b445244b4 | \n",
"
\n",
" \n",
" | 3 | \n",
" jpg | \n",
" Aaron_Guiel_0001 | \n",
" 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... | \n",
" Aaron_Guiel | \n",
" a2955610-ed5e-433c-bdd4-e3a72ff44736 | \n",
"
\n",
" \n",
" | 4 | \n",
" jpg | \n",
" Aaron_Patterson_0001 | \n",
" 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... | \n",
" Aaron_Patterson | \n",
" 1d0782e9-ed16-4550-b1e9-d9c03eef6181 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ext fn \\\n",
"index \n",
"0 jpg AJ_Cook_0001 \n",
"1 jpg AJ_Lamas_0001 \n",
"2 jpg Aaron_Eckhart_0001 \n",
"3 jpg Aaron_Guiel_0001 \n",
"4 jpg Aaron_Patterson_0001 \n",
"\n",
" sha256 subdir \\\n",
"index \n",
"0 550937b71b9af36b6083fa1ce7c76e97e3254c439614a6... AJ_Cook \n",
"1 46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193... AJ_Lamas \n",
"2 b68ed8d50ba85209d826b962987077bc8e1826f7f2f325... Aaron_Eckhart \n",
"3 156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b... Aaron_Guiel \n",
"4 34dfe798220b53aac910e5e39705770d212cdfbe4be8a4... Aaron_Patterson \n",
"\n",
" uuid \n",
"index \n",
"0 f03fd921-2d56-4e83-8115-f658d6a72287 \n",
"1 0c96c5bb-dbd1-4584-bd68-af11664b98bb \n",
"2 8221e75c-9537-4a4f-9693-483b445244b4 \n",
"3 a2955610-ed5e-433c-bdd4-e3a72ff44736 \n",
"4 1d0782e9-ed16-4550-b1e9-d9c03eef6181 "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_records.head()"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" description | \n",
" gender | \n",
" images | \n",
" name | \n",
" name_kg | \n",
" score | \n",
" url | \n",
" name_new | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Canadian actress | \n",
" f | \n",
" 1 | \n",
" AJ Cook | \n",
" A. J. Cook | \n",
" 274.554810 | \n",
" http://www.ajcookofficial.com | \n",
" | \n",
"
\n",
" \n",
" | 1 | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
" 389.547211 | \n",
" NaN | \n",
" | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" description gender images name name_kg score \\\n",
"0 Canadian actress f 1 AJ Cook A. J. Cook 274.554810 \n",
"1 American actor m 1 AJ Lamas AJ Lamas 389.547211 \n",
"\n",
" url name_new \n",
"0 http://www.ajcookofficial.com \n",
"1 NaN "
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_kg = pd.read_csv(fp_names_gender_kg)\n",
"df_kg_new = df_kg.copy()\n",
"df_kg_new['name_new'] = [''] * len(df_kg_new)\n",
"df_kg_new.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" description | \n",
" gender | \n",
" images | \n",
" name | \n",
" name_kg | \n",
" score | \n",
" url | \n",
" name_new | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Canadian actress | \n",
" f | \n",
" 1 | \n",
" AJ Cook | \n",
" A. J. Cook | \n",
" 274.554810 | \n",
" http://www.ajcookofficial.com | \n",
" A. J. Cook | \n",
"
\n",
" \n",
" | 1 | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
" 389.547211 | \n",
" NaN | \n",
" AJ Lamas | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" description gender images name name_kg score \\\n",
"0 Canadian actress f 1 AJ Cook A. J. Cook 274.554810 \n",
"1 American actor m 1 AJ Lamas AJ Lamas 389.547211 \n",
"\n",
" url name_new \n",
"0 http://www.ajcookofficial.com A. J. Cook \n",
"1 NaN AJ Lamas "
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# correct original LFW names using\n",
"for idx, row in df_kg_new.iterrows():\n",
" name_kg = str(row.get('name_kg', ''))\n",
" name = str(row.get('name', ''))\n",
" name_new = row['name']\n",
" if name_kg:\n",
" sim = similarity(name, name_kg)\n",
" if sim > .75:\n",
" name_new = row['name_kg']\n",
" df_kg_new.at[idx, 'name_new'] = name_new\n",
"\n",
"df_kg_new.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"df_kg_new['index'] = [''] * len(df_kg_new)"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" description | \n",
" gender | \n",
" images | \n",
" name | \n",
" name_kg | \n",
" name_new | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Canadian actress | \n",
" f | \n",
" 1 | \n",
" AJ Cook | \n",
" A. J. Cook | \n",
" A. J. Cook | \n",
"
\n",
" \n",
" | 1 | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" description gender images name name_kg name_new\n",
"0 Canadian actress f 1 AJ Cook A. J. Cook A. J. Cook\n",
"1 American actor m 1 AJ Lamas AJ Lamas AJ Lamas"
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop extra info\n",
"df_kg_new = df_kg_new.drop(['score', 'url'], axis=1)\n",
"df_kg_new.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 5749/5749 [00:05<00:00, 1006.44it/s]\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" description | \n",
" gender | \n",
" images | \n",
" name | \n",
" name_kg | \n",
" name_new | \n",
" index | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Canadian actress | \n",
" f | \n",
" 1 | \n",
" AJ Cook | \n",
" A. J. Cook | \n",
" A. J. Cook | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" American actor | \n",
" m | \n",
" 1 | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
" AJ Lamas | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" description gender images name name_kg name_new index\n",
"0 Canadian actress f 1 AJ Cook A. J. Cook A. J. Cook 0\n",
"1 American actor m 1 AJ Lamas AJ Lamas AJ Lamas 1"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# add index column\n",
"limit = 100000\n",
"for idx, row in tqdm(df_kg_new[:limit].iterrows(), total=len(df_kg_new[:limit])):\n",
" name = row['name'] # original LFW\n",
" # get sha256 from records where match\n",
" subdir = name.replace(' ', '_')\n",
" row_match = df_records.loc[(df_records['subdir'] == subdir)]\n",
" df_kg_new.at[idx, 'index'] = int(row_match.index[0])"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"df_kg_new = df_kg_new.drop(['name', 'name_kg'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"df_kg_new = df_kg_new.rename(columns={'name_new': 'name'})"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [],
"source": [
"df_kg_new.to_csv(fp_identity_meta, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert ROIs"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" h | \n",
" image_height | \n",
" image_width | \n",
" w | \n",
" x | \n",
" y | \n",
"
\n",
" \n",
" | index | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.330000 | \n",
" 250 | \n",
" 250 | \n",
" 0.330000 | \n",
" 0.336667 | \n",
" 0.350000 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.393333 | \n",
" 250 | \n",
" 250 | \n",
" 0.393333 | \n",
" 0.286667 | \n",
" 0.313333 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" h image_height image_width w x y\n",
"index \n",
"0 0.330000 250 250 0.330000 0.336667 0.350000\n",
"1 0.393333 250 250 0.393333 0.286667 0.313333"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_rois = pd.read_csv(fp_rois).set_index('index')\n",
"df_rois.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"df_index = pd.read_csv(fp_index)"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"#row = df_records.iloc[ (df_records)]\n",
"row = df_kg_new.loc[(df_kg_new['name'] == 'B.B. King')]\n",
"#print('index', row['index'].index[0])\n",
"row = df_index.iloc[row['index']]"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7cf753f9e1256e433901a262030f4d184afb4002b49e6b1c7a2d59d07306c2ff\n"
]
}
],
"source": [
"sha = row['sha256']\n",
"print(sha.values[0])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 14399/14399 [00:15<00:00, 914.63it/s]\n"
]
}
],
"source": [
"limit = 10\n",
"for idx, row in tqdm(df_kg[:limt].iterrows(), total=len(df_kg[:limt])):\n",
" fn = row['fn']\n",
" subdir = row['subdir']\n",
" # get sha256 from records where match\n",
" row_match = df_records.loc[(df_records['subdir'] == subdir)]\n",
" df_records.at[idx, 'idx'] = int(row_match.index[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_identity_meta = df_identity_meta.drop(['ext', 'url', 'score'], axis=1)\n",
"df_identity_meta.to_csv(fp_identity_meta)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:megapixels]",
"language": "python",
"name": "conda-env-megapixels-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}