diff options
Diffstat (limited to 'megapixels/commands/datasets/file_record.py')
| -rw-r--r-- | megapixels/commands/datasets/file_record.py | 40 |
1 files changed, 35 insertions, 5 deletions
diff --git a/megapixels/commands/datasets/file_record.py b/megapixels/commands/datasets/file_record.py index d3f790d4..41a5df28 100644 --- a/megapixels/commands/datasets/file_record.py +++ b/megapixels/commands/datasets/file_record.py @@ -45,9 +45,11 @@ identity_sources = ['subdir', 'numeric'] help='Identity source key') @click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, help='Use glob recursion (slower)') +@click.option('--max-depth', 'opt_max_depth', default=None, type=int, + help='Max number of images per subdirectory') @click.pass_context def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads, - opt_identity, opt_force, opt_recursive): + opt_identity, opt_force, opt_recursive, opt_max_depth): """Generates sha256, uuid, and identity index CSV file""" import sys, os @@ -59,6 +61,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, import random import uuid + from PIL import Image import cv2 as cv import pandas as pd from tqdm import tqdm @@ -84,6 +87,26 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original() log.info(f'Globbing {fp_in}') fp_ims = file_utils.glob_multi(fp_in, ['jpg', 'png'], recursive=opt_recursive) + + log.info('Found {:,} images'.format(len(fp_ims))) + subdir_groups = {} + if opt_max_depth: + log.debug(f'using max depth: {opt_max_depth}') + for fp_im in fp_ims: + fpp_im = Path(fp_im) + + subdir = fp_im.split('/')[-2] + if not subdir in subdir_groups.keys(): + subdir_groups[subdir] = [] + else: + subdir_groups[subdir].append(fp_im) + # for each subgroup, limit number of files + fp_ims = [] + for subdir_name, items in subdir_groups.items(): + ims = items[0:opt_max_depth] + fp_ims += ims + + log.debug(f'num subdirs: {len(subdir_groups.keys())}') # fail if none if not fp_ims: log.error('No images. Try with "--recursive"') @@ -93,7 +116,6 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] log.info('Found {:,} images'.format(len(fp_ims))) - # ---------------------------------------------------------------- # multithread process into SHA256 @@ -101,7 +123,14 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, def pool_mapper(fp_im): pbar.update(1) - sha256 = file_utils.sha256(fp_im) + try: + sha256 = file_utils.sha256(fp_im) + im = Image.open(fp_im) + im.verify() # throws error if bad file + assert(im.size[0] > 60 and im.size[1] > 60) + except Exception as e: + log.warn(f'skipping file: {fp_im}') + return None im = cv.imread(fp_im) w, h = im.shape[:2][::-1] file_size_kb = os.stat(fp_im).st_size // 1000 @@ -128,10 +157,11 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, data = [] indentity_count = 0 for pool_map, fp_im in zip(pool_maps, fp_ims): + if pool_map is None: + log.warn(f'skipping file: {fp_im}') + continue # skip error files fpp_im = Path(fp_im) subdir = str(fpp_im.parent.relative_to(fp_in)) - #subdir = '' if subdir is '.' else subdir - log.debug(subdir) if opt_identity: subdirs = subdir.split('/') |
