From 5891e2f13ae9dfead0e1794c399e5ff813e694d3 Mon Sep 17 00:00:00 2001 From: adamhrv Date: Fri, 14 Dec 2018 02:06:39 +0100 Subject: added FR demo notebook --- megapixels/commands/cv/_old_files_to_face_rois.py | 168 ++++++++++++++++++++++ megapixels/commands/cv/embeddings.py | 100 ------------- megapixels/commands/cv/face_vec_to_csv.py | 110 ++++++++++++++ megapixels/commands/cv/faces_to_csv.py | 168 ---------------------- megapixels/commands/cv/faces_to_csv_indexed.py | 156 -------------------- megapixels/commands/cv/files_to_rois.py | 156 ++++++++++++++++++++ megapixels/commands/datasets/file_meta.py | 84 +++++++++++ megapixels/commands/datasets/sha256.py | 55 ++++--- 8 files changed, 545 insertions(+), 452 deletions(-) create mode 100644 megapixels/commands/cv/_old_files_to_face_rois.py delete mode 100644 megapixels/commands/cv/embeddings.py create mode 100644 megapixels/commands/cv/face_vec_to_csv.py delete mode 100644 megapixels/commands/cv/faces_to_csv.py delete mode 100644 megapixels/commands/cv/faces_to_csv_indexed.py create mode 100644 megapixels/commands/cv/files_to_rois.py create mode 100644 megapixels/commands/datasets/file_meta.py (limited to 'megapixels/commands') diff --git a/megapixels/commands/cv/_old_files_to_face_rois.py b/megapixels/commands/cv/_old_files_to_face_rois.py new file mode 100644 index 00000000..d92cbd74 --- /dev/null +++ b/megapixels/commands/cv/_old_files_to_face_rois.py @@ -0,0 +1,168 @@ +""" +Crop images to prepare for training +""" + +import click +# from PIL import Image, ImageOps, ImageFilter, ImageDraw + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg + +color_filters = {'color': 1, 'gray': 2, 'all': 3} + +@click.command() +@click.option('-i', '--input', 'opt_fp_files', required=True, + help='Input file meta CSV') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output CSV') +@click.option('-e', '--ext', 'opt_ext', + default='jpg', type=click.Choice(['jpg', 'png']), + help='File glob ext') +@click.option('--size', 'opt_size', + type=(int, int), default=(300, 300), + help='Output image size') +@click.option('-t', '--detector-type', 'opt_detector_type', + type=cfg.FaceDetectNetVar, + default=click_utils.get_default(types.FaceDetectNet.DLIB_CNN), + help=click_utils.show_help(types.FaceDetectNet)) +@click.option('-g', '--gpu', 'opt_gpu', default=0, + help='GPU index') +@click.option('--conf', 'opt_conf_thresh', default=0.85, type=click.FloatRange(0,1), + help='Confidence minimum threshold') +@click.option('-p', '--pyramids', 'opt_pyramids', default=0, type=click.IntRange(0,4), + help='Number pyramids to upscale for DLIB detectors') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('--display/--no-display', 'opt_display', is_flag=True, default=False, + help='Display detections to debug') +@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, + help='Use glob recursion (slower)') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.option('--color', 'opt_color_filter', + type=click.Choice(color_filters.keys()), default='color', + help='Filter to keep color or grayscale images (color = keep color') +@click.pass_context +def cli(ctx, opt_dirs_in, opt_fp_out, opt_ext, opt_size, opt_detector_type, + opt_gpu, opt_conf_thresh, opt_pyramids, opt_slice, opt_display, opt_recursive, opt_force, opt_color_filter): + """Converts frames 
with faces to CSV of ROIs""" + + import sys + import os + from os.path import join + from pathlib import Path + from glob import glob + + from tqdm import tqdm + import numpy as np + import dlib # must keep a local reference for dlib + import cv2 as cv + import pandas as pd + + from app.utils import logger_utils, file_utils, im_utils + from app.processors import face_detector + + # ------------------------------------------------- + # init here + + log = logger_utils.Logger.getLogger() + + if not opt_force and Path(opt_fp_out).exists(): + log.error('File exists. Use "-f / --force" to overwite') + return + + if opt_detector_type == types.FaceDetectNet.CVDNN: + detector = face_detector.DetectorCVDNN() + elif opt_detector_type == types.FaceDetectNet.DLIB_CNN: + detector = face_detector.DetectorDLIBCNN(opt_gpu) + elif opt_detector_type == types.FaceDetectNet.DLIB_HOG: + detector = face_detector.DetectorDLIBHOG() + elif opt_detector_type == types.FaceDetectNet.MTCNN: + detector = face_detector.DetectorMTCNN() + elif opt_detector_type == types.FaceDetectNet.HAAR: + log.error('{} not yet implemented'.format(opt_detector_type.name)) + return + + + # ------------------------------------------------- + # process here + color_filter = color_filters[opt_color_filter] + + # get list of files to process + fp_ims = [] + for opt_dir_in in opt_dirs_in: + if opt_recursive: + fp_glob = join(opt_dir_in, '**/*.{}'.format(opt_ext)) + fp_ims += glob(fp_glob, recursive=True) + else: + fp_glob = join(opt_dir_in, '*.{}'.format(opt_ext)) + fp_ims += glob(fp_glob) + log.debug(fp_glob) + + + if opt_slice: + fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] + log.debug('processing {:,} files'.format(len(fp_ims))) + + + data = [] + + for fp_im in tqdm(fp_ims): + im = cv.imread(fp_im) + + # filter out color or grayscale iamges + if color_filter != color_filters['all']: + try: + is_gray = im_utils.is_grayscale(im) + if is_gray and color_filter != color_filters['gray']: + log.debug('Skipping grayscale image: {}'.format(fp_im)) + continue + except Exception as e: + log.error('Could not check grayscale: {}'.format(fp_im)) + continue + + try: + bboxes = detector.detect(im, opt_size=opt_size, opt_pyramids=opt_pyramids) + except Exception as e: + log.error('could not detect: {}'.format(fp_im)) + log.error('{}'.format(e)) + fpp_im = Path(fp_im) + subdir = str(fpp_im.parent.relative_to(opt_dir_in)) + + for bbox in bboxes: + # log.debug('is square: {}'.format(bbox.w == bbox.h)) + nw,nh = int(bbox.w * im.shape[1]), int(bbox.h * im.shape[0]) + roi = { + 'fn': fpp_im.stem, + 'ext': fpp_im.suffix.replace('.',''), + 'x': bbox.x, + 'y': bbox.y, + 'w': bbox.w, + 'h': bbox.h, + 'image_height': im.shape[0], + 'image_width': im.shape[1], + 'subdir': subdir} + bbox_dim = bbox.to_dim(im.shape[:2][::-1]) # w,h + data.append(roi) + + # debug display + if opt_display and len(bboxes): + im_md = im_utils.resize(im, width=min(1200, opt_size[0])) + for bbox in bboxes: + bbox_dim = bbox.to_dim(im_md.shape[:2][::-1]) + cv.rectangle(im_md, bbox_dim.pt_tl, bbox_dim.pt_br, (0,255,0), 3) + cv.imshow('', im_md) + while True: + k = cv.waitKey(1) & 0xFF + if k == 27 or k == ord('q'): # ESC + cv.destroyAllWindows() + sys.exit() + elif k != 255: + # any key to continue + break + + # save date + file_utils.mkdirs(opt_fp_out) + df = pd.DataFrame.from_dict(data) + df.to_csv(opt_fp_out, index=False) \ No newline at end of file diff --git a/megapixels/commands/cv/embeddings.py b/megapixels/commands/cv/embeddings.py deleted file mode 100644 index 9cb26ae7..00000000 --- 
a/megapixels/commands/cv/embeddings.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Crop images to prepare for training -""" - -import click - -from app.settings import types -from app.utils import click_utils -from app.settings import app_cfg as cfg - -@click.command() -@click.option('-i', '--input', 'opt_fp_in', required=True, - help='Input directory') -@click.option('-r', '--records', 'opt_fp_records', required=True, - help='Input directory') -@click.option('-m', '--media', 'opt_fp_media', required=True, - help='Image directory') -@click.option('-o', '--output', 'opt_fp_out', required=True, - help='Output CSV') -@click.option('--size', 'opt_size', - type=(int, int), default=(300, 300), - help='Output image size') -@click.option('-g', '--gpu', 'opt_gpu', default=0, - help='GPU index') -@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), - help='Slice list of files') -@click.option('-f', '--force', 'opt_force', is_flag=True, - help='Force overwrite file') -@click.option('-j', '--jitters', 'opt_jitters', default=cfg.DLIB_FACEREC_JITTERS, - help='Number of jitters') -@click.option('-p', '--padding', 'opt_padding', default=cfg.DLIB_FACEREC_PADDING, - help='Percentage padding') -@click.pass_context -def cli(ctx, opt_fp_in, opt_fp_records, opt_fp_out, opt_fp_media, opt_size, opt_gpu, - opt_slice, opt_jitters, opt_padding, opt_force): - """Converts frames with faces to CSV of rows""" - - import sys - import os - from os.path import join - from pathlib import Path - - from tqdm import tqdm - import numpy as np - import dlib # must keep a local reference for dlib - import cv2 as cv - import dlib - import pandas as pd - - from app.utils import logger_utils, file_utils, im_utils - from app.models.bbox import BBox - from app.processors import face_recognition - - # ------------------------------------------------- - # init here - - log = logger_utils.Logger.getLogger() - - if not opt_force and Path(opt_fp_out).exists(): - log.error('File exists. 
Use "-f / --force" to overwite') - return - - # init dlib FR - facerec = face_recognition.RecognitionDLIB() - - # load data - df_rois = pd.read_csv(opt_fp_in) - df_records = pd.read_csv(opt_fp_records) - - if opt_slice: - df_rois = df_rois[opt_slice[0]:opt_slice[1]] - log.info('Processing {:,} rows'.format(len(df_rois))) - nrows = len(df_rois) - - # face vecs - vecs = [] - - for roi_idx, row in tqdm(df_rois.iterrows(), total=nrows): - # make image path - record_id = int(row['id']) - df = df_records.iloc[record_id] - fp_im = join(opt_fp_media, df['subdir'], '{}.{}'.format(df['fn'], df['ext'])) - # load image - im = cv.imread(fp_im) - # make bbox - xywh = [row['x'], row['y'], row['w'] , row['h']] - bbox = BBox.from_xywh(*xywh) - # scale to actual image size - dim = (row['image_width'], row['image_height']) - bbox_dim = bbox.to_dim(dim) - # compute vec - vec = facerec.vec(im, bbox_dim, jitters=opt_jitters, padding=opt_padding) - vec_str = ','.join([repr(x) for x in vec]) - vecs.append( {'id': row['id'], 'vec': vec_str}) - - # save data - file_utils.mkdirs(opt_fp_out) - df_vecs = pd.DataFrame.from_dict(vecs) - df_vecs.to_csv(opt_fp_out, index=False) - log.info('saved {:,} lines to {}'.format(len(df_vecs), opt_fp_out)) \ No newline at end of file diff --git a/megapixels/commands/cv/face_vec_to_csv.py b/megapixels/commands/cv/face_vec_to_csv.py new file mode 100644 index 00000000..6c9fad09 --- /dev/null +++ b/megapixels/commands/cv/face_vec_to_csv.py @@ -0,0 +1,110 @@ +""" +Converts ROIs to face vector +""" + +import click + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg + +@click.command() +@click.option('-i', '--input', 'opt_fp_files', required=True, + help='Input ROI CSV') +@click.option('-r', '--rois', 'opt_fp_rois', required=True, + help='Input ROI CSV') +@click.option('-m', '--media', 'opt_dir_media', required=True, + help='Input media directory') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output CSV') +@click.option('--size', 'opt_size', + type=(int, int), default=(300, 300), + help='Output image size') +@click.option('-j', '--jitters', 'opt_jitters', default=cfg.DLIB_FACEREC_JITTERS, + help='Number of jitters') +@click.option('-p', '--padding', 'opt_padding', default=cfg.DLIB_FACEREC_PADDING, + help='Percentage padding') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.option('-g', '--gpu', 'opt_gpu', default=0, + help='GPU index') +@click.pass_context +def cli(ctx, opt_fp_files, opt_fp_rois, opt_dir_media, opt_fp_out, opt_size, + opt_slice, opt_force, opt_gpu, opt_jitters, opt_padding): + """Converts face ROIs to vectors""" + + import sys + import os + from os.path import join + from pathlib import Path + from glob import glob + + from tqdm import tqdm + import numpy as np + import dlib # must keep a local reference for dlib + import cv2 as cv + import pandas as pd + + from app.models.bbox import BBox + from app.utils import logger_utils, file_utils, im_utils + from app.processors import face_recognition + + + # ------------------------------------------------- + # init here + + log = logger_utils.Logger.getLogger() + + # init face processors + facerec = face_recognition.RecognitionDLIB() + + # load data + df_file_meta = pd.read_csv(opt_fp_files) + df_rois = pd.read_csv(opt_fp_rois) + + if not opt_force and Path(opt_fp_out).exists(): + 
log.error('File exists. Use "-f / --force" to overwite') + return + + if opt_slice: + df_rois = df_rois[opt_slice[0]:opt_slice[1]] + + # ------------------------------------------------- + # process here + + df_img_groups = df_rois.groupby('image_index') + log.debug('processing {:,} groups'.format(len(df_img_groups))) + + vecs = [] + + for image_index, df_img_group in tqdm(df_img_groups): + # make fp + roi_index = df_img_group.index.values[0] + file_meta = df_file_meta.iloc[image_index] # locate image meta + fp_im = join(opt_dir_media, file_meta.subdir, '{}.{}'.format(file_meta.fn, file_meta.ext)) + im = cv.imread(fp_im) + # get bbox + x = df_img_group.x.values[0] + y = df_img_group.y.values[0] + w = df_img_group.w.values[0] + h = df_img_group.h.values[0] + imw = df_img_group.image_width.values[0] + imh = df_img_group.image_height.values[0] + dim = im.shape[:2][::-1] + # get face vector + dim = (imw, imh) + bbox_dim = BBox.from_xywh(x, y, w, h).to_dim(dim) # convert to int real dimensions + # compute vec + # padding=opt_padding not yet implemented in 19.16 but merged in master + vec = facerec.vec(im, bbox_dim, jitters=opt_jitters) + vec_str = ','.join([repr(x) for x in vec]) # convert to string for CSV + vecs.append( {'roi_index': roi_index, 'image_index': image_index, 'vec': vec_str}) + + + # save date + file_utils.mkdirs(opt_fp_out) + df = pd.DataFrame.from_dict(vecs) + df.index.name = 'index' + df.to_csv(opt_fp_out) \ No newline at end of file diff --git a/megapixels/commands/cv/faces_to_csv.py b/megapixels/commands/cv/faces_to_csv.py deleted file mode 100644 index 1fd47571..00000000 --- a/megapixels/commands/cv/faces_to_csv.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Crop images to prepare for training -""" - -import click -# from PIL import Image, ImageOps, ImageFilter, ImageDraw - -from app.settings import types -from app.utils import click_utils -from app.settings import app_cfg as cfg - -color_filters = {'color': 1, 'gray': 2, 'all': 3} - -@click.command() -@click.option('-i', '--input', 'opt_dirs_in', required=True, multiple=True, - help='Input directory') -@click.option('-o', '--output', 'opt_fp_out', required=True, - help='Output CSV') -@click.option('-e', '--ext', 'opt_ext', - default='jpg', type=click.Choice(['jpg', 'png']), - help='File glob ext') -@click.option('--size', 'opt_size', - type=(int, int), default=(300, 300), - help='Output image size') -@click.option('-t', '--detector-type', 'opt_detector_type', - type=cfg.FaceDetectNetVar, - default=click_utils.get_default(types.FaceDetectNet.DLIB_CNN), - help=click_utils.show_help(types.FaceDetectNet)) -@click.option('-g', '--gpu', 'opt_gpu', default=0, - help='GPU index') -@click.option('--conf', 'opt_conf_thresh', default=0.85, type=click.FloatRange(0,1), - help='Confidence minimum threshold') -@click.option('-p', '--pyramids', 'opt_pyramids', default=0, type=click.IntRange(0,4), - help='Number pyramids to upscale for DLIB detectors') -@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), - help='Slice list of files') -@click.option('--display/--no-display', 'opt_display', is_flag=True, default=False, - help='Display detections to debug') -@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, - help='Use glob recursion (slower)') -@click.option('-f', '--force', 'opt_force', is_flag=True, - help='Force overwrite file') -@click.option('--color', 'opt_color_filter', - type=click.Choice(color_filters.keys()), default='color', - help='Filter to keep color or grayscale images 
(color = keep color') -@click.pass_context -def cli(ctx, opt_dirs_in, opt_fp_out, opt_ext, opt_size, opt_detector_type, - opt_gpu, opt_conf_thresh, opt_pyramids, opt_slice, opt_display, opt_recursive, opt_force, opt_color_filter): - """Converts frames with faces to CSV of ROIs""" - - import sys - import os - from os.path import join - from pathlib import Path - from glob import glob - - from tqdm import tqdm - import numpy as np - import dlib # must keep a local reference for dlib - import cv2 as cv - import pandas as pd - - from app.utils import logger_utils, file_utils, im_utils - from app.processors import face_detector - - # ------------------------------------------------- - # init here - - log = logger_utils.Logger.getLogger() - - if not opt_force and Path(opt_fp_out).exists(): - log.error('File exists. Use "-f / --force" to overwite') - return - - if opt_detector_type == types.FaceDetectNet.CVDNN: - detector = face_detector.DetectorCVDNN() - elif opt_detector_type == types.FaceDetectNet.DLIB_CNN: - detector = face_detector.DetectorDLIBCNN(opt_gpu) - elif opt_detector_type == types.FaceDetectNet.DLIB_HOG: - detector = face_detector.DetectorDLIBHOG() - elif opt_detector_type == types.FaceDetectNet.MTCNN: - detector = face_detector.DetectorMTCNN() - elif opt_detector_type == types.FaceDetectNet.HAAR: - log.error('{} not yet implemented'.format(opt_detector_type.name)) - return - - - # ------------------------------------------------- - # process here - color_filter = color_filters[opt_color_filter] - - # get list of files to process - fp_ims = [] - for opt_dir_in in opt_dirs_in: - if opt_recursive: - fp_glob = join(opt_dir_in, '**/*.{}'.format(opt_ext)) - fp_ims += glob(fp_glob, recursive=True) - else: - fp_glob = join(opt_dir_in, '*.{}'.format(opt_ext)) - fp_ims += glob(fp_glob) - log.debug(fp_glob) - - - if opt_slice: - fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] - log.debug('processing {:,} files'.format(len(fp_ims))) - - - data = [] - - for fp_im in tqdm(fp_ims): - im = cv.imread(fp_im) - - # filter out color or grayscale iamges - if color_filter != color_filters['all']: - try: - is_gray = im_utils.is_grayscale(im) - if is_gray and color_filter != color_filters['gray']: - log.debug('Skipping grayscale image: {}'.format(fp_im)) - continue - except Exception as e: - log.error('Could not check grayscale: {}'.format(fp_im)) - continue - - try: - bboxes = detector.detect(im, opt_size=opt_size, opt_pyramids=opt_pyramids) - except Exception as e: - log.error('could not detect: {}'.format(fp_im)) - log.error('{}'.format(e)) - fpp_im = Path(fp_im) - subdir = str(fpp_im.parent.relative_to(opt_dir_in)) - - for bbox in bboxes: - # log.debug('is square: {}'.format(bbox.w == bbox.h)) - nw,nh = int(bbox.w * im.shape[1]), int(bbox.h * im.shape[0]) - roi = { - 'fn': fpp_im.stem, - 'ext': fpp_im.suffix.replace('.',''), - 'x': bbox.x, - 'y': bbox.y, - 'w': bbox.w, - 'h': bbox.h, - 'image_height': im.shape[0], - 'image_width': im.shape[1], - 'subdir': subdir} - bbox_dim = bbox.to_dim(im.shape[:2][::-1]) # w,h - data.append(roi) - - # debug display - if opt_display and len(bboxes): - im_md = im_utils.resize(im, width=min(1200, opt_size[0])) - for bbox in bboxes: - bbox_dim = bbox.to_dim(im_md.shape[:2][::-1]) - cv.rectangle(im_md, bbox_dim.pt_tl, bbox_dim.pt_br, (0,255,0), 3) - cv.imshow('', im_md) - while True: - k = cv.waitKey(1) & 0xFF - if k == 27 or k == ord('q'): # ESC - cv.destroyAllWindows() - sys.exit() - elif k != 255: - # any key to continue - break - - # save date - 
file_utils.mkdirs(opt_fp_out) - df = pd.DataFrame.from_dict(data) - df.to_csv(opt_fp_out, index=False) \ No newline at end of file diff --git a/megapixels/commands/cv/faces_to_csv_indexed.py b/megapixels/commands/cv/faces_to_csv_indexed.py deleted file mode 100644 index ef958f89..00000000 --- a/megapixels/commands/cv/faces_to_csv_indexed.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -Crop images to prepare for training -""" - -import click -# from PIL import Image, ImageOps, ImageFilter, ImageDraw - -from app.settings import types -from app.utils import click_utils -from app.settings import app_cfg as cfg - -color_filters = {'color': 1, 'gray': 2, 'all': 3} - -@click.command() -@click.option('-i', '--input', 'opt_fp_in', required=True, - help='Input CSV (eg image_files.csv)') -@click.option('-m', '--media', 'opt_dir_media', required=True, - help='Input media directory') -@click.option('-o', '--output', 'opt_fp_out', required=True, - help='Output CSV') -@click.option('--size', 'opt_size', - type=(int, int), default=(300, 300), - help='Output image size') -@click.option('-t', '--detector-type', 'opt_detector_type', - type=cfg.FaceDetectNetVar, - default=click_utils.get_default(types.FaceDetectNet.DLIB_CNN), - help=click_utils.show_help(types.FaceDetectNet)) -@click.option('-g', '--gpu', 'opt_gpu', default=0, - help='GPU index') -@click.option('--conf', 'opt_conf_thresh', default=0.85, type=click.FloatRange(0,1), - help='Confidence minimum threshold') -@click.option('-p', '--pyramids', 'opt_pyramids', default=0, type=click.IntRange(0,4), - help='Number pyramids to upscale for DLIB detectors') -@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), - help='Slice list of files') -@click.option('--display/--no-display', 'opt_display', is_flag=True, default=False, - help='Display detections to debug') -@click.option('-f', '--force', 'opt_force', is_flag=True, - help='Force overwrite file') -@click.option('--color', 'opt_color_filter', - type=click.Choice(color_filters.keys()), default='all', - help='Filter to keep color or grayscale images (color = keep color') -@click.option('--largest', 'opt_largest', is_flag=True, - help='Only keep largest face') -@click.pass_context -def cli(ctx, opt_fp_in, opt_dir_media, opt_fp_out, opt_size, opt_detector_type, - opt_gpu, opt_conf_thresh, opt_pyramids, opt_slice, opt_display, opt_force, opt_color_filter, - opt_largest): - """Converts frames with faces to CSV of ROIs""" - - import sys - import os - from os.path import join - from pathlib import Path - from glob import glob - - from tqdm import tqdm - import numpy as np - import dlib # must keep a local reference for dlib - import cv2 as cv - import pandas as pd - - from app.utils import logger_utils, file_utils, im_utils - from app.processors import face_detector - - # ------------------------------------------------- - # init here - - log = logger_utils.Logger.getLogger() - - if not opt_force and Path(opt_fp_out).exists(): - log.error('File exists. 
Use "-f / --force" to overwite') - return - - if opt_detector_type == types.FaceDetectNet.CVDNN: - detector = face_detector.DetectorCVDNN() - elif opt_detector_type == types.FaceDetectNet.DLIB_CNN: - detector = face_detector.DetectorDLIBCNN(opt_gpu) - elif opt_detector_type == types.FaceDetectNet.DLIB_HOG: - detector = face_detector.DetectorDLIBHOG() - elif opt_detector_type == types.FaceDetectNet.MTCNN: - detector = face_detector.DetectorMTCNN() - elif opt_detector_type == types.FaceDetectNet.HAAR: - log.error('{} not yet implemented'.format(opt_detector_type.name)) - return - - - # ------------------------------------------------- - # process here - color_filter = color_filters[opt_color_filter] - - # get list of files to process - df_files = pd.read_csv(opt_fp_in).set_index('index') - - if opt_slice: - df_files = df_files[opt_slice[0]:opt_slice[1]] - log.debug('processing {:,} files'.format(len(df_files))) - - - data = [] - - for df_file in tqdm(df_files.itertuples(), total=len(df_files)): - fp_im = join(opt_dir_media, df_file.subdir, '{}.{}'.format(df_file.fn, df_file.ext)) - im = cv.imread(fp_im) - - # filter out color or grayscale iamges - if color_filter != color_filters['all']: - try: - is_gray = im_utils.is_grayscale(im) - if is_gray and color_filter != color_filters['gray']: - log.debug('Skipping grayscale image: {}'.format(fp_im)) - continue - except Exception as e: - log.error('Could not check grayscale: {}'.format(fp_im)) - continue - - try: - bboxes = detector.detect(im, opt_size=opt_size, opt_pyramids=opt_pyramids, opt_largest=opt_largest) - except Exception as e: - log.error('could not detect: {}'.format(fp_im)) - log.error('{}'.format(e)) - continue - - for bbox in bboxes: - roi = { - 'image_index': int(df_file.Index), - 'x': bbox.x, - 'y': bbox.y, - 'w': bbox.w, - 'h': bbox.h, - 'image_width': im.shape[1], - 'image_height': im.shape[0]} - data.append(roi) - - # debug display - if opt_display and len(bboxes): - bbox_dim = bbox.to_dim(im.shape[:2][::-1]) # w,h - im_md = im_utils.resize(im, width=min(1200, opt_size[0])) - for bbox in bboxes: - bbox_dim = bbox.to_dim(im_md.shape[:2][::-1]) - cv.rectangle(im_md, bbox_dim.pt_tl, bbox_dim.pt_br, (0,255,0), 3) - cv.imshow('', im_md) - while True: - k = cv.waitKey(1) & 0xFF - if k == 27 or k == ord('q'): # ESC - cv.destroyAllWindows() - sys.exit() - elif k != 255: - # any key to continue - break - - # save date - file_utils.mkdirs(opt_fp_out) - df = pd.DataFrame.from_dict(data) - df.index.name = 'index' - df.to_csv(opt_fp_out) \ No newline at end of file diff --git a/megapixels/commands/cv/files_to_rois.py b/megapixels/commands/cv/files_to_rois.py new file mode 100644 index 00000000..1aaf991c --- /dev/null +++ b/megapixels/commands/cv/files_to_rois.py @@ -0,0 +1,156 @@ +""" +Crop images to prepare for training +""" + +import click +# from PIL import Image, ImageOps, ImageFilter, ImageDraw + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg + +color_filters = {'color': 1, 'gray': 2, 'all': 3} + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input CSV (eg image_files.csv)') +@click.option('-m', '--media', 'opt_dir_media', required=True, + help='Input media directory') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output CSV') +@click.option('--size', 'opt_size', + type=(int, int), default=(300, 300), + help='Output image size') +@click.option('-t', '--detector-type', 'opt_detector_type', + 
type=cfg.FaceDetectNetVar, + default=click_utils.get_default(types.FaceDetectNet.DLIB_CNN), + help=click_utils.show_help(types.FaceDetectNet)) +@click.option('-g', '--gpu', 'opt_gpu', default=0, + help='GPU index') +@click.option('--conf', 'opt_conf_thresh', default=0.85, type=click.FloatRange(0,1), + help='Confidence minimum threshold') +@click.option('-p', '--pyramids', 'opt_pyramids', default=0, type=click.IntRange(0,4), + help='Number pyramids to upscale for DLIB detectors') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('--display/--no-display', 'opt_display', is_flag=True, default=False, + help='Display detections to debug') +@click.option('-f', '--force', 'opt_force', is_flag=True, + help='Force overwrite file') +@click.option('--color', 'opt_color_filter', + type=click.Choice(color_filters.keys()), default='all', + help='Filter to keep color or grayscale images (color = keep color') +@click.option('--largest', 'opt_largest', is_flag=True, + help='Only keep largest face') +@click.pass_context +def cli(ctx, opt_fp_in, opt_dir_media, opt_fp_out, opt_size, opt_detector_type, + opt_gpu, opt_conf_thresh, opt_pyramids, opt_slice, opt_display, opt_force, opt_color_filter, + opt_largest): + """Converts frames with faces to CSV of ROIs""" + + import sys + import os + from os.path import join + from pathlib import Path + from glob import glob + + from tqdm import tqdm + import numpy as np + import dlib # must keep a local reference for dlib + import cv2 as cv + import pandas as pd + + from app.utils import logger_utils, file_utils, im_utils + from app.processors import face_detector + + # ------------------------------------------------- + # init here + + log = logger_utils.Logger.getLogger() + + if not opt_force and Path(opt_fp_out).exists(): + log.error('File exists. 
Use "-f / --force" to overwite') + return + + if opt_detector_type == types.FaceDetectNet.CVDNN: + detector = face_detector.DetectorCVDNN() + elif opt_detector_type == types.FaceDetectNet.DLIB_CNN: + detector = face_detector.DetectorDLIBCNN(opt_gpu) + elif opt_detector_type == types.FaceDetectNet.DLIB_HOG: + detector = face_detector.DetectorDLIBHOG() + elif opt_detector_type == types.FaceDetectNet.MTCNN: + detector = face_detector.DetectorMTCNN() + elif opt_detector_type == types.FaceDetectNet.HAAR: + log.error('{} not yet implemented'.format(opt_detector_type.name)) + return + + + # ------------------------------------------------- + # process here + color_filter = color_filters[opt_color_filter] + + # get list of files to process + df_files = pd.read_csv(opt_fp_in).set_index('index') + + if opt_slice: + df_files = df_files[opt_slice[0]:opt_slice[1]] + log.debug('processing {:,} files'.format(len(df_files))) + + + data = [] + + for df_file in tqdm(df_files.itertuples(), total=len(df_files)): + fp_im = join(opt_dir_media, str(df_file.subdir), f'{df_file.fn}.{df_file.ext}') + im = cv.imread(fp_im) + + # filter out color or grayscale iamges + if color_filter != color_filters['all']: + try: + is_gray = im_utils.is_grayscale(im) + if is_gray and color_filter != color_filters['gray']: + log.debug('Skipping grayscale image: {}'.format(fp_im)) + continue + except Exception as e: + log.error('Could not check grayscale: {}'.format(fp_im)) + continue + + try: + bboxes = detector.detect(im, size=opt_size, pyramids=opt_pyramids, largest=opt_largest) + except Exception as e: + log.error('could not detect: {}'.format(fp_im)) + log.error('{}'.format(e)) + continue + + for bbox in bboxes: + roi = { + 'image_index': int(df_file.Index), + 'x': bbox.x, + 'y': bbox.y, + 'w': bbox.w, + 'h': bbox.h, + 'image_width': im.shape[1], + 'image_height': im.shape[0]} + data.append(roi) + + # debug display + if opt_display and len(bboxes): + bbox_dim = bbox.to_dim(im.shape[:2][::-1]) # w,h + im_md = im_utils.resize(im, width=min(1200, opt_size[0])) + for bbox in bboxes: + bbox_dim = bbox.to_dim(im_md.shape[:2][::-1]) + cv.rectangle(im_md, bbox_dim.pt_tl, bbox_dim.pt_br, (0,255,0), 3) + cv.imshow('', im_md) + while True: + k = cv.waitKey(1) & 0xFF + if k == 27 or k == ord('q'): # ESC + cv.destroyAllWindows() + sys.exit() + elif k != 255: + # any key to continue + break + + # save date + file_utils.mkdirs(opt_fp_out) + df = pd.DataFrame.from_dict(data) + df.index.name = 'index' + df.to_csv(opt_fp_out) \ No newline at end of file diff --git a/megapixels/commands/datasets/file_meta.py b/megapixels/commands/datasets/file_meta.py new file mode 100644 index 00000000..e1456f44 --- /dev/null +++ b/megapixels/commands/datasets/file_meta.py @@ -0,0 +1,84 @@ +""" +Begin with this file to process folder of images +- Converts folders and subdirectories into CSV with file attributes split +""" +import click + +from app.settings import types +from app.utils import click_utils +from app.settings import app_cfg as cfg +from app.utils.logger_utils import Logger + +log = Logger.getLogger() + +@click.command() +@click.option('-i', '--input', 'opt_fp_in', required=True, + help='Input directory') +@click.option('-o', '--output', 'opt_fp_out', required=True, + help='Output file for file meta CSV') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') +@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, + help='Use glob recursion (slower)') 
+@click.option('-t', '--threads', 'opt_threads', default=4,
+  help='Number of threads')
+@click.option('-f', '--force', 'opt_force', is_flag=True,
+  help='Force overwrite file')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_recursive, opt_threads, opt_force):
+  """Converts a folder of images into an indexed CSV of file attributes (subdir, fn, ext)"""
+
+  from glob import glob
+  from os.path import join
+  from pathlib import Path
+  import time
+  from multiprocessing.dummy import Pool as ThreadPool
+  import random
+
+  import pandas as pd
+  from tqdm import tqdm
+  from glob import glob
+
+  from app.utils import file_utils, im_utils
+
+
+  if not opt_force and Path(opt_fp_out).exists():
+    log.error('File exists. Use "-f / --force" to overwrite')
+    return
+
+  fp_ims = []
+  log.info(f'Globbing {opt_fp_in}')
+  for ext in ['jpg', 'png']:
+    if opt_recursive:
+      fp_glob = join(opt_fp_in, '**/*.{}'.format(ext))
+      fp_ims += glob(fp_glob, recursive=True)
+    else:
+      fp_glob = join(opt_fp_in, '*.{}'.format(ext))
+      fp_ims += glob(fp_glob)
+
+  if not fp_ims:
+    log.warn('No images. Try with "--recursive"')
+    return
+
+  if opt_slice:
+    fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]
+
+  log.info('Processing {:,} images'.format(len(fp_ims)))
+
+
+  # convert data to dict
+  data = []
+  for i, fp_im in enumerate(tqdm(fp_ims)):
+    fpp_im = Path(fp_im)
+    subdir = str(fpp_im.parent.relative_to(opt_fp_in))
+    data.append( {
+      'subdir': subdir,
+      'fn': fpp_im.stem,
+      'ext': fpp_im.suffix.replace('.','')
+      })
+
+  # save to CSV
+  file_utils.mkdirs(opt_fp_out)
+  df = pd.DataFrame.from_dict(data)
+  df.index.name = 'index'
+  df.to_csv(opt_fp_out)
\ No newline at end of file
diff --git a/megapixels/commands/datasets/sha256.py b/megapixels/commands/datasets/sha256.py
index c04fb504..4c734073 100644
--- a/megapixels/commands/datasets/sha256.py
+++ b/megapixels/commands/datasets/sha256.py
@@ -10,18 +10,18 @@ log = Logger.getLogger()
 @click.command()
 @click.option('-i', '--input', 'opt_fp_in', required=True,
   help='Input directory')
-@click.option('-o', '--output', 'opt_fp_out',
+@click.option('-m', '--media', 'opt_dir_media', required=True,
+  help='Input media directory')
+@click.option('-o', '--output', 'opt_fp_out', required=True,
   help='Output directory')
 @click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
   help='Slice list of files')
-@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
-  help='Use glob recursion (slower)')
 @click.option('-t', '--threads', 'opt_threads', default=4,
   help='Number of threads')
 @click.option('-f', '--force', 'opt_force', is_flag=True,
   help='Force overwrite file')
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_recursive, opt_threads, opt_force):
+def cli(ctx, opt_fp_in, opt_dir_media, opt_fp_out, opt_slice, opt_threads, opt_force):
   """Multithreading test"""
 
   from glob import glob
@@ -42,47 +42,46 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_recursive, opt_threads, opt_f
   log.error('File exists. 
Use "-f / --force" to overwite') return - fp_ims = [] - for ext in ['jpg', 'png']: - if opt_recursive: - fp_glob = join(opt_fp_in, '**/*.{}'.format(ext)) - fp_ims += glob(fp_glob, recursive=True) - else: - fp_glob = join(opt_fp_in, '*.{}'.format(ext)) - fp_ims += glob(fp_glob) + df_files = pd.read_csv(opt_fp_in).set_index('index') if opt_slice: - fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] + df_files = df_files[opt_slice[0]:opt_slice[1]] - log.info('Processing {:,} images'.format(len(fp_ims))) + log.info('Processing {:,} images'.format(len(df_files))) - pbar = tqdm(total=100) + + # prepare list of images to multithread into sha256s + file_objs = [] + for ds_file in df_files.itertuples(): + fp_im = join(opt_dir_media, str(ds_file.subdir), f"{ds_file.fn}.{ds_file.ext}") + file_objs.append({'fp': fp_im, 'index': ds_file.Index}) + + # convert to thread pool + pbar = tqdm(total=len(file_objs)) - def as_sha256(fp_im): + def as_sha256(file_obj): pbar.update(1) - return file_utils.sha256(fp_im) + file_obj['sha256'] = file_utils.sha256(file_obj['fp']) + return file_obj # multithread pool + pool_file_objs = [] st = time.time() pool = ThreadPool(opt_threads) - with tqdm(total=len(fp_ims)) as pbar: - sha256s = pool.map(as_sha256, fp_ims) + with tqdm(total=len(file_objs)) as pbar: + pool_file_objs = pool.map(as_sha256, file_objs) pbar.close() - + # convert data to dict data = [] - for i, fp_im in enumerate(fp_ims): - fpp_im = Path(fp_im) - subdir = str(fpp_im.parent.relative_to(opt_fp_in)) - sha256 = sha256s[i] + for pool_file_obj in pool_file_objs: data.append( { - 'sha256': sha256, - 'subdir': subdir, - 'fn': fpp_im.stem, - 'ext': fpp_im.suffix.replace('.','') + 'sha256': pool_file_obj['sha256'], + 'index': pool_file_obj['index'] }) # save to CSV + file_utils.mkdirs(opt_fp_out) df = pd.DataFrame.from_dict(data) df.to_csv(opt_fp_out, index=False) -- cgit v1.2.3-70-g09d2