Diffstat (limited to 'megapixels/commands/datasets')
-rw-r--r--  megapixels/commands/datasets/file_record.py | 40
1 files changed, 35 insertions, 5 deletions
diff --git a/megapixels/commands/datasets/file_record.py b/megapixels/commands/datasets/file_record.py
index d3f790d4..b5daef4e 100644
--- a/megapixels/commands/datasets/file_record.py
+++ b/megapixels/commands/datasets/file_record.py
@@ -45,9 +45,11 @@ identity_sources = ['subdir', 'numeric']
   help='Identity source key')
 @click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
   help='Use glob recursion (slower)')
+@click.option('--max-depth', 'opt_max_depth', default=None, type=int,
+  help='Max number of images per subdirectory')
 @click.pass_context
 def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads,
-  opt_identity, opt_force, opt_recursive):
+  opt_identity, opt_force, opt_recursive, opt_max_depth):
   """Generates sha256, uuid, and identity index CSV file"""
   import sys, os
@@ -59,6 +61,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
   import random
   import uuid
+  from PIL import Image
   import cv2 as cv
   import pandas as pd
   from tqdm import tqdm
@@ -84,6 +87,26 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
   fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original()
   log.info(f'Globbing {fp_in}')
   fp_ims = file_utils.glob_multi(fp_in, ['jpg', 'png'], recursive=opt_recursive)
+
+  log.info('Found {:,} images'.format(len(fp_ims)))
+  subdir_groups = {}
+  if opt_max_depth:
+    log.debug(f'using max depth: {opt_max_depth}')
+    for fp_im in fp_ims:
+      fpp_im = Path(fp_im)
+      # group files by their immediate parent directory
+      subdir = fpp_im.parent.name
+      if subdir not in subdir_groups:
+        subdir_groups[subdir] = []
+      subdir_groups[subdir].append(fp_im)
+
+    # for each subdir group, limit the number of files
+    fp_ims = []
+    for subdir_name, items in subdir_groups.items():
+      ims = items[:opt_max_depth]
+      fp_ims += ims
+
+    log.debug(f'num subdirs: {len(subdir_groups.keys())}')
   # fail if none
   if not fp_ims:
     log.error('No images. Try with "--recursive"')
@@ -93,7 +116,6 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
     fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]
   log.info('Found {:,} images'.format(len(fp_ims)))
-
   # ----------------------------------------------------------------
   # multithread process into SHA256
@@ -101,7 +123,14 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
   def pool_mapper(fp_im):
     pbar.update(1)
-    sha256 = file_utils.sha256(fp_im)
+    try:
+      sha256 = file_utils.sha256(fp_im)
+      im = Image.open(fp_im)
+      im.verify()  # raises an exception if the file is corrupt
+      assert im.size[0] > 100 and im.size[1] > 100  # skip tiny images
+    except Exception as e:
+      log.warn(f'skipping file: {fp_im} ({e})')
+      return None
     im = cv.imread(fp_im)
     w, h = im.shape[:2][::-1]
     file_size_kb = os.stat(fp_im).st_size // 1000
@@ -128,10 +157,11 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
   data = []
   indentity_count = 0
   for pool_map, fp_im in zip(pool_maps, fp_ims):
+    if pool_map is None:
+      log.warn(f'skipping file: {fp_im}')
+      continue  # skip files that failed validation
     fpp_im = Path(fp_im)
     subdir = str(fpp_im.parent.relative_to(fp_in))
-    #subdir = '' if subdir is '.' else subdir
-    log.debug(subdir)
     if opt_identity:
       subdirs = subdir.split('/')
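
For context, the per-subdirectory cap added above amounts to the following standalone sketch (the helper name cap_per_subdir is illustrative, not part of the module):

from pathlib import Path

def cap_per_subdir(fp_ims, max_per_subdir):
  """Keep at most max_per_subdir files from each parent directory."""
  groups = {}
  for fp_im in fp_ims:
    # NB: keying on parent.name collides if two different parents share a name
    groups.setdefault(Path(fp_im).parent.name, []).append(fp_im)
  capped = []
  for items in groups.values():
    capped += items[:max_per_subdir]
  return capped

# cap_per_subdir(['a/1.jpg', 'a/2.jpg', 'b/3.jpg'], 1) -> ['a/1.jpg', 'b/3.jpg']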
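
The check added to pool_mapper follows the usual Pillow pattern: Image.verify() tests file integrity without decoding pixel data, but the image object is unusable afterwards and must be reopened for any real decode (here cv.imread does that). A minimal sketch of the same check in isolation, where min_side mirrors the hard-coded 100 px threshold:

from PIL import Image

def is_usable_image(fp_im, min_side=100):
  try:
    with Image.open(fp_im) as im:
      w, h = im.size   # header-only read, cheap
      im.verify()      # raises on truncated/corrupt data
    return w > min_side and h > min_side
  except Exception:
    return False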