diff options
Diffstat (limited to 'megapixels/commands/datasets/gen_sha256.py')
| -rw-r--r-- | megapixels/commands/datasets/gen_sha256.py | 152 |
1 files changed, 152 insertions, 0 deletions
'''
Generates a sha256 / identity index CSV for a dataset's media files.

Reads the dataset's filepath metadata CSV, computes a sha256 hash for each
image with a thread pool, optionally derives an identity group from each
file's subdirectory, and writes the result to the dataset's SHA256 metadata
CSV.
'''
import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

# Valid sources for deriving an identity label from a file's subdirectory path
identity_sources = ['subdir', 'subdir_head', 'subdir_tail']

@click.command()
@click.option('-i', '--input', 'opt_fp_in', default=None,
  help='Override enum input filename CSV')
@click.option('-o', '--output', 'opt_fp_out', default=None,
  help='Override enum output filename CSV')
@click.option('-m', '--media', 'opt_dir_media', default=None,
  help='Override enum media directory')
@click.option('--data_store', 'opt_data_store',
  type=cfg.DataStoreVar,
  default=click_utils.get_default(types.DataStore.NAS),
  show_default=True,
  help=click_utils.show_help(types.Dataset))
@click.option('--dataset', 'opt_dataset',
  type=cfg.DatasetVar,
  required=True,
  show_default=True,
  help=click_utils.show_help(types.Dataset))
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('-t', '--threads', 'opt_threads', default=12,
  help='Number of threads')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite file')
@click.option('--identity', 'opt_identity', default='subdir_tail', type=click.Choice(identity_sources),
  help='Identity source, blank for no identity')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads,
  opt_identity, opt_force):
  """Generates sha256/identity index CSV file"""

  from os.path import join
  from pathlib import Path
  import time
  from multiprocessing.dummy import Pool as ThreadPool

  import pandas as pd
  from tqdm import tqdm

  from app.models import DataStore
  from app.utils import file_utils

  # set data_store
  data_store = DataStore(opt_data_store, opt_dataset)
  # resolve output filepath (CLI override wins over the dataset default)
  fp_out = data_store.metadata(types.Metadata.SHA256) if opt_fp_out is None else opt_fp_out
  # exit early if the output already exists and --force was not given
  if not opt_force and Path(fp_out).exists():
    log.error('File exists. Use "-f / --force" to overwrite')
    return
  # resolve input filepath
  # FIX: opt_fp_in was previously declared but ignored, breaking -i/--input
  fp_in = data_store.metadata(types.Metadata.FILEPATH) if opt_fp_in is None else opt_fp_in
  df_files = pd.read_csv(fp_in).set_index('index')
  # slice the file list if requested
  # FIX: the old `if opt_slice:` guard was always true; the default tuple
  # (None, None) is truthy. Check explicitly for the "no slice" default.
  if opt_slice != (None, None):
    df_files = df_files[opt_slice[0]:opt_slice[1]]

  log.info('Processing {:,} images'.format(len(df_files)))

  # build the list of work items to hash
  dir_media = data_store.media_images_original() if opt_dir_media is None else opt_dir_media
  file_objs = []
  for ds_file in df_files.itertuples():
    fp_im = join(dir_media, str(ds_file.subdir), f"{ds_file.fn}.{ds_file.ext}")
    file_obj = {'fp': fp_im, 'index': ds_file.Index}
    if opt_identity:
      subdirs = ds_file.subdir.split('/')
      # defensive: str.split always yields at least one element, but bail
      # out with a clear message if the subdir is unusable
      if not subdirs:
        # FIX: the original f-string here had an unbalanced quote
        log.error(f'Could not split subdir: "{ds_file.subdir}". Try different option for "--identity"')
        log.error('exiting')
        return
      if opt_identity == 'subdir':
        subdir = subdirs[0]
      elif opt_identity == 'subdir_head':
        # use first part of subdir path
        subdir = subdirs[0]
      elif opt_identity == 'subdir_tail':
        # use last part of subdir path
        subdir = subdirs[-1]
      file_obj['identity_subdir'] = subdir
    file_objs.append(file_obj)

  # hash files with a thread pool, updating a single progress bar
  # FIX: removed a stray duplicate tqdm instance that was immediately
  # shadowed by the `with` block; close/join the pool when done
  st = time.time()
  pool = ThreadPool(opt_threads)
  with tqdm(total=len(file_objs)) as pbar:

    def as_sha256(file_obj):
      file_obj['sha256'] = file_utils.sha256(file_obj['fp'])
      pbar.update(1)
      return file_obj

    pool_file_objs = pool.map(as_sha256, file_objs)
  pool.close()
  pool.join()

  # normalize results into flat rows
  data = []
  for pool_file_obj in pool_file_objs:
    data.append({
      'sha256': pool_file_obj['sha256'],
      'index': pool_file_obj['index'],
      'identity_subdir': pool_file_obj.get('identity_subdir', ''),
    })

  # assign an integer identity_index per identity_subdir group
  df_sha256 = pd.DataFrame.from_dict(data)
  df_sha256['identity_index'] = [1] * len(df_sha256)
  df_sha256 = df_sha256.sort_values(by=['identity_subdir'], ascending=True)
  df_sha256_identity_groups = df_sha256.groupby('identity_subdir')
  for identity_index, df_sha256_identity_group_tuple in enumerate(df_sha256_identity_groups):
    identity_subdir, df_sha256_identity_group = df_sha256_identity_group_tuple
    for ds_sha256 in df_sha256_identity_group.itertuples():
      df_sha256.at[ds_sha256.Index, 'identity_index'] = identity_index
  # drop temp identity subdir column
  df_sha256 = df_sha256.drop('identity_subdir', axis=1)

  # write to CSV, ordered by file index
  # FIX: removed a discarded `df_sha256.set_index('index')` no-op
  log.info(f'rows: {len(df_sha256)}')
  file_utils.mkdirs(fp_out)
  df_sha256 = df_sha256.sort_values(['index'], ascending=[True])
  df_sha256.to_csv(fp_out, index=False)

  # timing
  log.info(f'wrote file: {fp_out}')
  log.info('time: {:.2f}, threads: {}'.format(time.time() - st, opt_threads))
\ No newline at end of file |
