# Split CSVs into multiple files

In [1]:
%reload_ext autoreload
%autoreload 2

import os
from os.path import join
import math
from glob import glob
from random import randint
from pathlib import Path

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm

import sys
sys.path.append('/work/megapixels_dev/megapixels/')
from app.utils import file_utils

In [12]:
DATA_STORE_NAS = '/data_store_nas/'
DATA_STORE_SSD = '/data_store_ssd/'
dir_dataset_ssd = 'apps/megapixels/datasets/lfw/'
dir_dataset_nas = 'datasets/people/lfw'

In [63]:
fp_records = join(DATA_STORE_NAS, dir_dataset_nas, 'records.csv')
fp_index = join(DATA_STORE_NAS, dir_dataset_nas, 'index.csv')
fp_sha256s = join(DATA_STORE_NAS, dir_dataset_nas, 'sha256s.csv')
fp_uuids = join(DATA_STORE_NAS, dir_dataset_nas, 'uuids.csv')
fp_rois = join(DATA_STORE_NAS, dir_dataset_nas, 'rois.csv')
fp_names_gender_kg = join(DATA_STORE_NAS, dir_dataset_nas, 'lfw_names_gender_kg.csv')
fp_identity_meta = join(DATA_STORE_NAS, dir_dataset_nas, 'identity_meta.csv')

In [77]:
import difflib
def similarity(a, b):
  seq = difflib.SequenceMatcher(a=a.lower(), b=b.lower())
  return seq.ratio()

In [48]:
df_records = pd.read_csv(fp_records).set_index('index')

In [50]:
# Create index.csv, `index, sha256`
# drop old columns
df_records.drop(['ext', 'fn', 'subdir', 'uuid'], axis=1).to_csv(fp_index)

In [51]:
# Create uuids.csv, `index, uuid`
# drop old columns
df_records.drop(['ext', 'fn', 'subdir', 'sha256'], axis=1).to_csv(fp_uuids)

In [None]:
# Create identity_meta.csv, `index, name, description, gender`
# drop old columns
df_records.drop(['ext', 'fn', 'subdir', 'sha256'], axis=1).to_csv(fp_uuids)

In [52]:
df_records.head()

Unnamed: 0_level_0,ext,fn,sha256,subdir,uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,jpg,AJ_Cook_0001,550937b71b9af36b6083fa1ce7c76e97e3254c439614a6...,AJ_Cook,f03fd921-2d56-4e83-8115-f658d6a72287
1,jpg,AJ_Lamas_0001,46d7ddeec9b00add61ade2f89277d74e8264a2b6cec193...,AJ_Lamas,0c96c5bb-dbd1-4584-bd68-af11664b98bb
2,jpg,Aaron_Eckhart_0001,b68ed8d50ba85209d826b962987077bc8e1826f7f2f325...,Aaron_Eckhart,8221e75c-9537-4a4f-9693-483b445244b4
3,jpg,Aaron_Guiel_0001,156f5428fad30c420ef01d9b0a3ab73e98aa6a1e5a2f0b...,Aaron_Guiel,a2955610-ed5e-433c-bdd4-e3a72ff44736
4,jpg,Aaron_Patterson_0001,34dfe798220b53aac910e5e39705770d212cdfbe4be8a4...,Aaron_Patterson,1d0782e9-ed16-4550-b1e9-d9c03eef6181


In [132]:
df_kg = pd.read_csv(fp_names_gender_kg)
df_kg_new = df_kg.copy()
df_kg_new['name_new'] = [''] * len(df_kg_new)
df_kg_new.head(2)

Unnamed: 0,description,gender,images,name,name_kg,score,url,name_new
0,Canadian actress,f,1,AJ Cook,A. J. Cook,274.55481,http://www.ajcookofficial.com,
1,American actor,m,1,AJ Lamas,AJ Lamas,389.547211,,


In [133]:
# correct original LFW names using
for idx, row in df_kg_new.iterrows():
  name_kg = str(row.get('name_kg', ''))
  name = str(row.get('name', ''))
  name_new = row['name']
  if name_kg:
    sim = similarity(name, name_kg)
    if sim > .75:
      name_new = row['name_kg']
  df_kg_new.at[idx, 'name_new'] = name_new

df_kg_new.head(2)

Unnamed: 0,description,gender,images,name,name_kg,score,url,name_new
0,Canadian actress,f,1,AJ Cook,A. J. Cook,274.55481,http://www.ajcookofficial.com,A. J. Cook
1,American actor,m,1,AJ Lamas,AJ Lamas,389.547211,,AJ Lamas


In [136]:
df_kg_new['index'] = [''] * len(df_kg_new)

In [134]:
# drop extra info
df_kg_new = df_kg_new.drop(['score', 'url'], axis=1)
df_kg_new.head(2)

Unnamed: 0,description,gender,images,name,name_kg,name_new
0,Canadian actress,f,1,AJ Cook,A. J. Cook,A. J. Cook
1,American actor,m,1,AJ Lamas,AJ Lamas,AJ Lamas


In [138]:
# add index column
limit = 100000
for idx, row in tqdm(df_kg_new[:limit].iterrows(), total=len(df_kg_new[:limit])):
  name = row['name']  # original LFW
  # get sha256 from records where match
  subdir = name.replace(' ', '_')
  row_match = df_records.loc[(df_records['subdir'] == subdir)]
  df_kg_new.at[idx, 'index'] = int(row_match.index[0])

100%|██████████| 5749/5749 [00:05<00:00, 1006.44it/s]


Unnamed: 0,description,gender,images,name,name_kg,name_new,index
0,Canadian actress,f,1,AJ Cook,A. J. Cook,A. J. Cook,0
1,American actor,m,1,AJ Lamas,AJ Lamas,AJ Lamas,1


In [140]:
df_kg_new = df_kg_new.drop(['name', 'name_kg'], axis=1)

In [151]:
df_kg_new = df_kg_new.rename(columns={'name_new': 'name'})

In [152]:
df_kg_new.to_csv(fp_identity_meta, index=False)

## Convert ROIs

In [145]:
df_rois = pd.read_csv(fp_rois).set_index('index')
df_rois.head(2)

Unnamed: 0_level_0,h,image_height,image_width,w,x,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.33,250,250,0.33,0.336667,0.35
1,0.393333,250,250,0.393333,0.286667,0.313333


In [146]:
df_index = pd.read_csv(fp_index)

In [166]:
#row = df_records.iloc[ (df_records)]
row = df_kg_new.loc[(df_kg_new['name'] == 'B.B. King')]
#print('index', row['index'].index[0])
row = df_index.iloc[row['index']]

In [171]:
sha = row['sha256']
print(sha.values[0])

7cf753f9e1256e433901a262030f4d184afb4002b49e6b1c7a2d59d07306c2ff


In [31]:
limit = 10
for idx, row in tqdm(df_kg[:limt].iterrows(), total=len(df_kg[:limt])):
  fn = row['fn']
  subdir = row['subdir']
  # get sha256 from records where match
  row_match = df_records.loc[(df_records['subdir'] == subdir)]
  df_records.at[idx, 'idx'] = int(row_match.index[0])

100%|██████████| 14399/14399 [00:15<00:00, 914.63it/s]


In [None]:
df_identity_meta = df_identity_meta.drop(['ext', 'url', 'score'], axis=1)
df_identity_meta.to_csv(fp_identity_meta)