Diffstat (limited to 'megapixels/commands/datasets/sha256.py')
| -rw-r--r-- | megapixels/commands/datasets/sha256.py | 89 |
1 files changed, 0 insertions, 89 deletions
diff --git a/megapixels/commands/datasets/sha256.py b/megapixels/commands/datasets/sha256.py
deleted file mode 100644
index 4c734073..00000000
--- a/megapixels/commands/datasets/sha256.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import click
-
-from app.settings import types
-from app.utils import click_utils
-from app.settings import app_cfg as cfg
-from app.utils.logger_utils import Logger
-
-log = Logger.getLogger()
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', required=True,
-  help='Input directory')
-@click.option('-m', '--media', 'opt_dir_media', required=True,
-  help='Input media directory')
-@click.option('-o', '--output', 'opt_fp_out', required=True,
-  help='Output directory')
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
-  help='Slice list of files')
-@click.option('-t', '--threads', 'opt_threads', default=4,
-  help='Number of threads')
-@click.option('-f', '--force', 'opt_force', is_flag=True,
-  help='Force overwrite file')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_dir_media, opt_fp_out, opt_slice, opt_threads, opt_force):
-  """Multithreading test"""
-
-  from glob import glob
-  from os.path import join
-  from pathlib import Path
-  import time
-  from multiprocessing.dummy import Pool as ThreadPool
-  import random
-
-  import pandas as pd
-  from tqdm import tqdm
-  from glob import glob
-
-  from app.utils import file_utils, im_utils
-
-
-  if not opt_force and Path(opt_fp_out).exists():
-    log.error('File exists. Use "-f / --force" to overwite')
-    return
-
-  df_files = pd.read_csv(opt_fp_in).set_index('index')
-
-  if opt_slice:
-    df_files = df_files[opt_slice[0]:opt_slice[1]]
-
-  log.info('Processing {:,} images'.format(len(df_files)))
-
-
-  # prepare list of images to multithread into sha256s
-  file_objs = []
-  for ds_file in df_files.itertuples():
-    fp_im = join(opt_dir_media, str(ds_file.subdir), f"{ds_file.fn}.{ds_file.ext}")
-    file_objs.append({'fp': fp_im, 'index': ds_file.Index})
-
-  # convert to thread pool
-  pbar = tqdm(total=len(file_objs))
-
-  def as_sha256(file_obj):
-    pbar.update(1)
-    file_obj['sha256'] = file_utils.sha256(file_obj['fp'])
-    return file_obj
-
-  # multithread pool
-  pool_file_objs = []
-  st = time.time()
-  pool = ThreadPool(opt_threads)
-  with tqdm(total=len(file_objs)) as pbar:
-    pool_file_objs = pool.map(as_sha256, file_objs)
-  pbar.close()
-
-  # convert data to dict
-  data = []
-  for pool_file_obj in pool_file_objs:
-    data.append( {
-      'sha256': pool_file_obj['sha256'],
-      'index': pool_file_obj['index']
-    })
-
-  # save to CSV
-  file_utils.mkdirs(opt_fp_out)
-  df = pd.DataFrame.from_dict(data)
-  df.to_csv(opt_fp_out, index=False)
-
-  # timing
-  log.info('time: {:.2f}, theads: {}'.format(time.time() - st, opt_threads))
\ No newline at end of file
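
For reference, the file removed above hashed every image listed in a CSV on a small thread pool and wrote an index-to-sha256 mapping back out as CSV. Below is a minimal, self-contained sketch of that approach using only the standard library plus pandas and tqdm. The project helpers (file_utils.sha256, file_utils.mkdirs, the Logger) are replaced here with hashlib / pathlib equivalents, and the function names and CSV column layout (index, subdir, fn, ext) are assumptions read off the deleted code, not part of the project API.

# Sketch only: reimplements the deleted command's logic with standard-library
# equivalents; sha256_file / hash_records are hypothetical names, not project code.
import hashlib
import time
from multiprocessing.dummy import Pool as ThreadPool
from os.path import join
from pathlib import Path

import pandas as pd
from tqdm import tqdm


def sha256_file(fp, chunk_size=65536):
    """Hash a file in chunks to avoid loading it fully into memory."""
    h = hashlib.sha256()
    with open(fp, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()


def hash_records(fp_csv, dir_media, fp_out, threads=4):
    # assumes the same CSV layout the deleted command read: index, subdir, fn, ext
    df_files = pd.read_csv(fp_csv).set_index('index')
    file_objs = [
        {'index': row.Index,
         'fp': join(dir_media, str(row.subdir), f'{row.fn}.{row.ext}')}
        for row in df_files.itertuples()
    ]

    def as_sha256(file_obj):
        file_obj['sha256'] = sha256_file(file_obj['fp'])
        return file_obj

    st = time.time()
    with ThreadPool(threads) as pool:
        # imap (rather than map) lets tqdm advance as each file finishes
        results = list(tqdm(pool.imap(as_sha256, file_objs), total=len(file_objs)))

    # write index -> sha256 pairs to CSV, creating the output directory if needed
    Path(fp_out).parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame([{'sha256': r['sha256'], 'index': r['index']} for r in results]) \
        .to_csv(fp_out, index=False)
    print(f'time: {time.time() - st:.2f}s, threads: {threads}')

Threads (rather than processes) are a reasonable choice here because the work is dominated by disk reads, so the GIL is released for most of each task.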
