diff options
Diffstat (limited to 'megapixels/commands/datasets/gen_sha256.py')
| -rw-r--r-- | megapixels/commands/datasets/gen_sha256.py | 152 |
1 file changed, 0 insertions, 152 deletions
'''
Generates a sha256/identity index CSV for a dataset.

Reads the dataset's filepath metadata CSV, computes a SHA-256 hash for each
media file using a thread pool, optionally derives an identity label from each
file's subdirectory path, and writes the result to a CSV keyed by file index.
'''
import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

# Valid sources for deriving an identity label from a file's subdirectory path.
identity_sources = ['subdir', 'subdir_head', 'subdir_tail']

@click.command()
@click.option('-i', '--input', 'opt_fp_in', default=None,
  help='Override enum input filename CSV')
@click.option('-o', '--output', 'opt_fp_out', default=None,
  help='Override enum output filename CSV')
@click.option('-m', '--media', 'opt_dir_media', default=None,
  help='Override enum media directory')
@click.option('--data_store', 'opt_data_store',
  type=cfg.DataStoreVar,
  default=click_utils.get_default(types.DataStore.NAS),
  show_default=True,
  help=click_utils.show_help(types.Dataset))
@click.option('--dataset', 'opt_dataset',
  type=cfg.DatasetVar,
  required=True,
  show_default=True,
  help=click_utils.show_help(types.Dataset))
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('-t', '--threads', 'opt_threads', default=12,
  help='Number of threads')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite file')
@click.option('--identity', 'opt_identity', default='subdir_tail', type=click.Choice(identity_sources),
  help='Identity source, blank for no identity')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads,
  opt_identity, opt_force):
  """Generates sha256/identity index CSV file.

  For every row of the dataset's FILEPATH metadata CSV, hashes the media file
  with SHA-256 (threaded), assigns an integer identity index derived from the
  file's subdirectory (per --identity), and writes a CSV with columns
  sha256 / index / identity_index to the SHA256 metadata path (or --output).
  """
  import time
  from os.path import join
  from pathlib import Path
  from multiprocessing.dummy import Pool as ThreadPool

  import pandas as pd
  from tqdm import tqdm

  from app.models import DataStore
  from app.utils import file_utils

  # Resolve the data store and the output filepath.
  data_store = DataStore(opt_data_store, opt_dataset)
  fp_out = data_store.metadata(types.Metadata.SHA256) if opt_fp_out is None else opt_fp_out
  # Refuse to clobber an existing file unless forced.
  if not opt_force and Path(fp_out).exists():
    log.error('File exists. Use "-f / --force" to overwrite')
    return

  # FIX: honor the -i/--input override; it was previously accepted but ignored.
  fp_in = data_store.metadata(types.Metadata.FILEPATH) if opt_fp_in is None else opt_fp_in
  df_files = pd.read_csv(fp_in).set_index('index')
  # Optional slice of the file list; the (None, None) default keeps everything.
  if opt_slice:
    df_files = df_files[opt_slice[0]:opt_slice[1]]

  log.info('Processing {:,} images'.format(len(df_files)))

  # Build the worklist: one {fp, index, [identity_subdir]} record per file.
  dir_media = data_store.media_images_original() if opt_dir_media is None else opt_dir_media
  file_objs = []
  for ds_file in df_files.itertuples():
    fp_im = join(dir_media, str(ds_file.subdir), f"{ds_file.fn}.{ds_file.ext}")
    file_obj = {'fp': fp_im, 'index': ds_file.Index}
    if opt_identity:
      subdirs = ds_file.subdir.split('/')
      if not subdirs:
        # FIX: error message previously had an unbalanced quote
        log.error(f'Could not split subdir: "{ds_file.subdir}". Try different option for "--identity"')
        log.error('exiting')
        return
      # 'subdir' and 'subdir_head' both mean the first path component;
      # 'subdir_tail' (the only other Choice value) means the last.
      if opt_identity in ('subdir', 'subdir_head'):
        subdir = subdirs[0]
      else:
        subdir = subdirs[-1]
      file_obj['identity_subdir'] = subdir
    file_objs.append(file_obj)

  # Hash files concurrently; threads are fine here since sha256 is I/O-bound
  # on file reads and hashlib releases the GIL for large buffers.
  st = time.time()
  with tqdm(total=len(file_objs)) as pbar:
    def as_sha256(file_obj):
      file_obj['sha256'] = file_utils.sha256(file_obj['fp'])
      pbar.update(1)
      return file_obj

    # FIX: close/join the pool (previously leaked), and drop the duplicate
    # tqdm bar that was created outside the `with` block and never closed.
    pool = ThreadPool(opt_threads)
    try:
      pool_file_objs = pool.map(as_sha256, file_objs)
    finally:
      pool.close()
      pool.join()

  # Normalize worker results into flat rows.
  data = [{
    'sha256': obj['sha256'],
    'index': obj['index'],
    'identity_subdir': obj.get('identity_subdir', ''),
  } for obj in pool_file_objs]

  df_sha256 = pd.DataFrame.from_dict(data)
  # Number identities 0..k-1 in sorted-subdir order. ngroup() with the default
  # sort=True reproduces the old enumerate-over-sorted-groups loop in O(n).
  df_sha256['identity_index'] = df_sha256.groupby('identity_subdir').ngroup()
  # The subdir column was only needed to derive identity_index.
  df_sha256 = df_sha256.drop('identity_subdir', axis=1)

  # Write to CSV, ordered by the original file index.
  log.info(f'rows: {len(df_sha256)}')
  file_utils.mkdirs(fp_out)
  df_sha256 = df_sha256.sort_values(['index'], ascending=[True])
  df_sha256.to_csv(fp_out, index=False)

  # timing
  log.info(f'wrote file: {fp_out}')
  log.info('time: {:.2f}, threads: {}'.format(time.time() - st, opt_threads))
\ No newline at end of file |
