"""Generates a SHA256/identity index CSV file for a dataset."""

import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

identity_sources = ['subdir', 'subdir_head', 'subdir_tail']


@click.command()
@click.option('-i', '--input', 'opt_fp_in', default=None,
  help='Override enum input filename CSV')
@click.option('-o', '--output', 'opt_fp_out', default=None,
  help='Override enum output filename CSV')
@click.option('-m', '--media', 'opt_dir_media', default=None,
  help='Override enum media directory')
@click.option('--data_store', 'opt_data_store',
  type=cfg.DataStoreVar,
  default=click_utils.get_default(types.DataStore.NAS),
  show_default=True,
  help=click_utils.show_help(types.DataStore))
@click.option('--dataset', 'opt_dataset',
  type=cfg.DatasetVar,
  required=True,
  show_default=True,
  help=click_utils.show_help(types.Dataset))
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('-t', '--threads', 'opt_threads', default=12,
  help='Number of threads')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite file')
@click.option('--identity', 'opt_identity', default='subdir_tail',
  type=click.Choice(identity_sources),
  help='Identity source')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media,
        opt_slice, opt_threads, opt_identity, opt_force):
  """Generates sha256/identity index CSV file"""

  import time
  from os.path import join
  from pathlib import Path
  from multiprocessing.dummy import Pool as ThreadPool

  import pandas as pd
  from tqdm import tqdm

  from app.models import DataStore
  from app.utils import file_utils

  # set data store
  data_store = DataStore(opt_data_store, opt_dataset)

  # get output filepath
  fp_out = data_store.metadata(types.Metadata.SHA256) if opt_fp_out is None else opt_fp_out

  # exit if the output already exists
  if not opt_force and Path(fp_out).exists():
    log.error('File exists. Use "-f / --force" to overwrite')
    return

  # get input filepath
  fp_in = data_store.metadata(types.Metadata.FILEPATH) if opt_fp_in is None else opt_fp_in
  df_files = pd.read_csv(fp_in).set_index('index')

  # slice the file list if requested
  if any(x is not None for x in opt_slice):
    df_files = df_files[opt_slice[0]:opt_slice[1]]
  log.info(f'Processing {len(df_files):,} images')

  # build the list of file objects to multithread into SHA256 hashes
  dir_media = data_store.media_images_original() if opt_dir_media is None else opt_dir_media
  file_objs = []
  for ds_file in df_files.itertuples():
    fp_im = join(dir_media, str(ds_file.subdir), f'{ds_file.fn}.{ds_file.ext}')
    file_obj = {'fp': fp_im, 'index': ds_file.Index}
    if opt_identity:
      # derive an identity key from the subdir path; rows are later sorted
      # and grouped by this key to assign an incrementing identity_index
      subdirs = ds_file.subdir.split('/')
      if not any(subdirs):
        log.error(f'Could not split subdir: "{ds_file.subdir}". Try a different option for "--identity"')
        log.error('exiting')
        return
      if opt_identity == 'subdir':
        # use the full subdir path
        subdir = ds_file.subdir
      elif opt_identity == 'subdir_head':
        # use the first part of the subdir path
        subdir = subdirs[0]
      elif opt_identity == 'subdir_tail':
        # use the last part of the subdir path
        subdir = subdirs[-1]
      file_obj['identity_subdir'] = subdir
    file_objs.append(file_obj)

  # hash files in a thread pool, updating a shared progress bar
  st = time.time()
  with ThreadPool(opt_threads) as pool, tqdm(total=len(file_objs)) as pbar:

    def as_sha256(file_obj):
      file_obj['sha256'] = file_utils.sha256(file_obj['fp'])
      pbar.update(1)
      return file_obj

    pool_file_objs = pool.map(as_sha256, file_objs)

  # convert pooled results to records
  data = []
  for pool_file_obj in pool_file_objs:
    data.append({
      'sha256': pool_file_obj['sha256'],
      'index': pool_file_obj['index'],
      'identity_subdir': pool_file_obj.get('identity_subdir', ''),
    })

  # sort by identity_subdir, then assign an incrementing identity_index
  # to each unique identity_subdir group
  df_sha256 = pd.DataFrame.from_dict(data)
  df_sha256['identity_index'] = [0] * len(df_sha256)  # placeholder, overwritten below
  df_sha256 = df_sha256.sort_values(by=['identity_subdir'], ascending=True)
  df_sha256_identity_groups = df_sha256.groupby('identity_subdir')
  for identity_index, (identity_subdir, df_identity_group) in enumerate(df_sha256_identity_groups):
    for ds_sha256 in df_identity_group.itertuples():
      df_sha256.at[ds_sha256.Index, 'identity_index'] = identity_index

  # drop the temporary identity_subdir column
  df_sha256 = df_sha256.drop('identity_subdir', axis=1)

  # write to CSV
  log.info(f'rows: {len(df_sha256)}')
  file_utils.mkdirs(fp_out)
  df_sha256 = df_sha256.sort_values(['index'], ascending=[True])
  df_sha256.to_csv(fp_out, index=False)

  # timing
  log.info(f'wrote file: {fp_out}')
  log.info(f'time: {time.time() - st:.2f}, threads: {opt_threads}')
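
# Hypothetical direct-run entry point, assuming this module is executed as a
# standalone script; in repos where commands are auto-discovered and registered
# by a CLI dispatcher, this guard is unnecessary and can be removed.
if __name__ == '__main__':
  cli()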