path: root/megapixels/commands/datasets
Diffstat (limited to 'megapixels/commands/datasets')
-rw-r--r--  megapixels/commands/datasets/file_record.py (renamed from megapixels/commands/datasets/records.py)  107
-rw-r--r--  megapixels/commands/datasets/lookup.py  4
-rw-r--r--  megapixels/commands/datasets/msceleb.py  66
-rw-r--r--  megapixels/commands/datasets/msceleb_names.py  57
-rw-r--r--  megapixels/commands/datasets/s3_sync.py  6
5 files changed, 206 insertions(+), 34 deletions(-)
diff --git a/megapixels/commands/datasets/records.py b/megapixels/commands/datasets/file_record.py
index 80de5040..355b22f2 100644
--- a/megapixels/commands/datasets/records.py
+++ b/megapixels/commands/datasets/file_record.py
@@ -10,7 +10,12 @@ from app.utils.logger_utils import Logger
log = Logger.getLogger()
-identity_sources = ['subdir', 'subdir_head', 'subdir_tail']
+# Choose the part of the filepath to use as the person identity
+# eg "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir] --> "barack_obama" (last path component)
+# eg "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [numeric] --> 0, 1, 2, ... (incrementing counter)
+
+identity_sources = ['subdir', 'numeric']
@click.command()
@click.option('-i', '--input', 'opt_fp_in', default=None,
@@ -21,7 +26,7 @@ identity_sources = ['subdir', 'subdir_head', 'subdir_tail']
help='Override enum media directory')
@click.option('--data_store', 'opt_data_store',
type=cfg.DataStoreVar,
- default=click_utils.get_default(types.DataStore.SSD),
+ default=click_utils.get_default(types.DataStore.HDD),
show_default=True,
help=click_utils.show_help(types.Dataset))
@click.option('--dataset', 'opt_dataset',
@@ -35,7 +40,8 @@ identity_sources = ['subdir', 'subdir_head', 'subdir_tail']
help='Number of threads')
@click.option('-f', '--force', 'opt_force', is_flag=True,
help='Force overwrite file')
-@click.option('--identity', 'opt_identity', default=None, type=click.Choice(identity_sources),
+@click.option('--identity', 'opt_identity', type=click.Choice(identity_sources),
+ default='numeric',
help='Identity source, blank for no identity')
@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
help='Use glob recursion (slower)')
@@ -44,7 +50,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
opt_identity, opt_force, opt_recursive):
"""Generates sha256, uuid, and identity index CSV file"""
- import sys
+ import sys, os
from glob import glob
from os.path import join
from pathlib import Path
@@ -53,9 +59,11 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
import random
import uuid
+ import cv2 as cv
import pandas as pd
from tqdm import tqdm
from glob import glob
+ from operator import itemgetter
from app.models.data_store import DataStore
from app.utils import file_utils, im_utils
@@ -91,15 +99,26 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
pbar = tqdm(total=len(fp_ims))
- def as_sha256(fp_im):
+ def pool_mapper(fp_im):
pbar.update(1)
- return file_utils.sha256(fp_im)
+ sha256 = file_utils.sha256(fp_im)
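+ # read the image to record its dimensions (cv.imread returns None for unreadable files)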
+ im = cv.imread(fp_im)
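+ # im.shape is (height, width[, channels]); the reversed slice yields (w, h)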
+ w, h = im.shape[:2][::-1]
+ file_size_kb = os.stat(fp_im).st_size // 1000
+ num_channels = im_utils.num_channels(im)
+ return {
+ 'width': w,
+ 'height': h,
+ 'sha256': sha256,
+ 'file_size_kb': file_size_kb,
+ 'num_channels': num_channels
+ }
# convert to thread pool
- sha256s = [] # ?
+ pool_maps = [] # populated by pool.map below
pool = ThreadPool(opt_threads)
with tqdm(total=len(fp_ims)) as pbar:
- sha256s = pool.map(as_sha256, fp_ims)
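+ # ThreadPool.map preserves input order, so results stay aligned with fp_ims below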
+ pool_maps = pool.map(pool_mapper, fp_ims)
pbar.close()
@@ -107,9 +126,12 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
# convert data to dict
data = []
- for sha256, fp_im in zip(sha256s, fp_ims):
+ identity_count = 0
+ for pool_map, fp_im in zip(pool_maps, fp_ims):
fpp_im = Path(fp_im)
subdir = str(fpp_im.parent.relative_to(fp_in))
+ #subdir = '' if subdir == '.' else subdir
+ log.debug(subdir)
if opt_identity:
subdirs = subdir.split('/')
@@ -118,42 +140,65 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
log.error('exiting')
return
if opt_identity == 'subdir':
- identity = subdirs[0] # use first/only part
- elif opt_identity == 'subdir_head':
- identity = subdirs[0] # use first part of subdir path
- elif opt_identity == 'subdir_tail':
identity = subdirs[-1] # use last part of subdir path
+ elif opt_identity == 'numeric':
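+ # 'numeric' assigns each file its own incrementing id (one identity per image)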
+ identity = identity_count # use incrementing number
+ identity_count += 1
else:
identity = ''
data.append({
'subdir': subdir,
+ 'num_channels': pool_map['num_channels'],
'fn': fpp_im.stem,
'ext': fpp_im.suffix.replace('.',''),
- 'sha256': sha256,
+ 'sha256': pool_map['sha256'],
'uuid': uuid.uuid4(),
- 'identity_key': identity
+ 'identity_key': identity,
+ 'width': pool_map['width'],
+ 'height': pool_map['height']
})
- log.info(f'adding identity index using: "{opt_identity}". This may take a while...')
- # convert dict to DataFrame
+ # create dataframe
df_records = pd.DataFrame.from_dict(data)
- # sort based on identity_key
- df_records = df_records.sort_values(by=['identity_key'], ascending=True)
- # add new column for identity
- df_records['identity_index'] = [-1] * len(df_records)
- # populate the identity_index
- df_records_identity_groups = df_records.groupby('identity_key')
- # enumerate groups to create identity indices
- for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups):
- identity_key, df_records_identity_group = df_records_identity_group_tuple
- for ds_record in df_records_identity_group.itertuples():
- df_records.at[ds_record.Index, 'identity_index'] = identity_index
- # reset index after being sorted
- df_records = df_records.reset_index(drop=True)
+
df_records.index.name = 'index' # reassign 'index' as primary key column
# write to CSV
file_utils.mkdirs(fp_out)
df_records.to_csv(fp_out)
# done
- log.info(f'wrote rows: {len(df_records)} to {fp_out}')
\ No newline at end of file
+ log.info(f'wrote {len(df_records)} rows to "{fp_out}"')
+ # save the invoking command line alongside the CSV for reproducibility
+ cmd_line = ' '.join(sys.argv)
+ file_utils.write_text(cmd_line, '{}.sh'.format(fp_out))
+
+
+'''
+# create dataframe
+ df_records = pd.DataFrame.from_dict(data)
+
+ # add identity key (used for associating identity)
+ if opt_identity:
+ log.info(f'adding identity index using: "{opt_identity}" subdirectory')
+ # convert dict to DataFrame
+ # sort based on identity_key
+ df_records = df_records.sort_values(by=['identity_key'], ascending=True)
+ # add new column for identity
+ df_records['identity_index'] = [-1] * len(df_records)
+ # populate the identity_index
+ df_records_identity_groups = df_records.groupby('identity_key')
+ # enumerate groups to create identity indices
+ log.info(f'updating records with identity_key. This may take a while...')
+ st = time.time()
+ for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups):
+ identity_key, df_records_identity_group = df_records_identity_group_tuple
+ for ds_record in df_records_identity_group.itertuples():
+ df_records.at[ds_record.Index, 'identity_index'] = identity_index
+ # reset index after being sorted
+ df_records = df_records.reset_index(drop=True)
+ log.debug('update time: {:.2f}s'.format(time.time() - st))
+ else:
+ # number every person 1, 2, 3...
+ df_records = df_records.sort_values(by=['subdir'], ascending=True)
+ pass
+'''
\ No newline at end of file
diff --git a/megapixels/commands/datasets/lookup.py b/megapixels/commands/datasets/lookup.py
index c1c66c19..5ae4c3f5 100644
--- a/megapixels/commands/datasets/lookup.py
+++ b/megapixels/commands/datasets/lookup.py
@@ -10,7 +10,7 @@ log = Logger.getLogger()
@click.command()
@click.option('--index', 'opt_index', type=int, required=True,
- help='Vector index to lookup')
+ help='File index to lookup')
@click.option('--data_store', 'opt_data_store',
type=cfg.DataStoreVar,
default=click_utils.get_default(types.DataStore.SSD),
@@ -45,7 +45,7 @@ def cli(ctx, opt_index, opt_data_store, opt_dataset):
dataset.load_records()
dataset.load_identities()
# set data store and load files
- # find image records
+ # get image record from file index
image_record = dataset.index_to_record(opt_index)
image_record.summarize()
# load image
diff --git a/megapixels/commands/datasets/msceleb.py b/megapixels/commands/datasets/msceleb.py
new file mode 100644
index 00000000..969a1df2
--- /dev/null
+++ b/megapixels/commands/datasets/msceleb.py
@@ -0,0 +1,66 @@
+'''
+Converts MsCelebV1-ImageThumbnails.part.00.tsv to names and images
+'''
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Path to input TSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output path for images')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice):
+ """Converts MSCeleb TSV to images"""
+
+ import sys
+ import os
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+ import base64
+ from io import BytesIO
+
+ import pandas as pd
+ import cv2 as cv
+ from PIL import Image
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+ from app.models.data_store import DataStore
+
+
+ log = Logger.getLogger()
+ log.debug(f'opening "{opt_fp_in}" ...')
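+ # pre-count lines so tqdm can show a total; fall back to 1 if counting fails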
+ try:
+ n_lines = sum(1 for line in open(opt_fp_in))
+ except Exception:
+ n_lines = 1
+
+ log.debug('{:,}'.format(n_lines))
+
+ with open(opt_fp_in, 'rb') as fp:
+ for data_line in tqdm(fp, total=n_lines):
+ try:
+ freebase_mid, query_name, search_rank, url_image, url_page, b64_bytes = data_line.decode().split('\t')
+ # decode image
+ im64 = base64.b64decode(b64_bytes)
+ im = Image.open(BytesIO(im64))
+ # save image
+ dir_out = join(opt_fp_out, freebase_mid)
+ Path(dir_out).mkdir(parents=True, exist_ok=True)
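+ # number images sequentially by counting files already in this identity's folder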
+ idx = len(os.listdir(dir_out))
+ fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
+ im.save(fp_out, quality=100)
+ except Exception as e:
+ log.error('Could not process: {}, {}. Error: {}'.format(query_name, url_image, e))
diff --git a/megapixels/commands/datasets/msceleb_names.py b/megapixels/commands/datasets/msceleb_names.py
new file mode 100644
index 00000000..6ee2ad9a
--- /dev/null
+++ b/megapixels/commands/datasets/msceleb_names.py
@@ -0,0 +1,57 @@
+'''
+Converts MsCelebV1-ImageThumbnails.part.00.tsv to a names file with image counts
+'''
+import click
+
+from app.settings import types
+from app.models.dataset import Dataset
+from app.utils import click_utils
+from app.settings import app_cfg as cfg
+from app.utils.logger_utils import Logger
+
+log = Logger.getLogger()
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+ help='Path to input TSV file')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
+ help='Output path for images')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+ """Converts MSCeleb TSV to names file with image count"""
+
+ import sys
+ import os
+ from glob import glob
+ from os.path import join
+ from pathlib import Path
+ import time
+ import base64
+ from io import BytesIO
+
+ import pandas as pd
+ import cv2 as cv
+ from PIL import Image
+ from tqdm import tqdm
+
+ from app.utils import file_utils, im_utils
+ from app.models.data_store import DataStore
+
+
+ log = Logger.getLogger()
+ log.debug(f'opening "{opt_fp_in}" ...')
+ n_lines = sum(1 for line in open(opt_fp_in))
+ log.debug('{:,}'.format(n_lines))
+
+ with open(opt_fp_in, 'rb') as fp:
+ for data_line in tqdm(fp, total=n_lines):
+ freebase_mid, query_name, search_rank, url_image, url_page, b64_bytes = data_line.decode().split('\t')
+ # decode image
+ im64 = base64.b64decode(b64_bytes)
+ im = Image.open(BytesIO(im64))
+ # save image
+ dir_out = join(opt_fp_out, freebase_mid)
+ Path(dir_out).mkdir(parents=True, exist_ok=True)
+ idx = len(os.listdir(dir_out))
+ fp_out = join(dir_out, '{}.jpg'.format(file_utils.zpad(idx)))
+ im.save(fp_out, quality=100)
diff --git a/megapixels/commands/datasets/s3_sync.py b/megapixels/commands/datasets/s3_sync.py
index 3098d9be..17940c6d 100644
--- a/megapixels/commands/datasets/s3_sync.py
+++ b/megapixels/commands/datasets/s3_sync.py
@@ -54,4 +54,8 @@ def cli(ctx, opt_data_store, opt_dataset, opt_type, opt_dryrun):
if not opt_dryrun:
subprocess.call(cmd)
-
\ No newline at end of file
+
+'''
+upload: '/data_store_ssd/datasets/people/vgg_face2/media/uuid/00418e0e-48e9-44f9-b6a0-b2ffd773802e.jpg' -> 's3://megapixels/v1/media/vgg_face2/00418e0e-48e9-44f9-b6a0-b2ffd773802e.jpg' [3202 of 3187313]
+[2953 of 3187313]
+'''
\ No newline at end of file