diff options
Diffstat (limited to 'megapixels/commands/datasets')
| -rw-r--r-- | megapixels/commands/datasets/file_meta.py | 84 | ||||
| -rw-r--r-- | megapixels/commands/datasets/filter_poses.py | 76 | ||||
| -rw-r--r-- | megapixels/commands/datasets/lookup.py | 44 | ||||
| -rw-r--r-- | megapixels/commands/datasets/sha256.py | 55 |
4 files changed, 231 insertions, 28 deletions
"""
Begin with this file to process folder of images
- Converts folders and subdirectories into CSV with file attributes split
"""
import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output file for file meta CSV')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
  help='Use glob recursion (slower)')
@click.option('-t', '--threads', 'opt_threads', default=4,
  help='Number of threads')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_recursive, opt_threads, opt_force):
  """Index a directory of images into a file-meta CSV (one row per file:
  subdir, filename stem, extension)."""

  from glob import glob
  from os.path import join
  from pathlib import Path

  import pandas as pd
  from tqdm import tqdm

  from app.utils import file_utils

  # Refuse to clobber an existing output unless --force is given
  if not opt_force and Path(opt_fp_out).exists():
    log.error('File exists. Use "-f / --force" to overwrite')
    return

  # Glob jpg/png images, optionally recursing into subdirectories
  fp_ims = []
  log.info(f'Globbing {opt_fp_in}')
  for ext in ['jpg', 'png']:
    if opt_recursive:
      fp_ims += glob(join(opt_fp_in, '**/*.{}'.format(ext)), recursive=True)
    else:
      fp_ims += glob(join(opt_fp_in, '*.{}'.format(ext)))

  if not fp_ims:
    log.warning('No images. Try with "--recursive"')
    return

  # Apply optional (start, end) slice; the (None, None) default keeps the
  # full list, so no truthiness guard is needed (a tuple is always truthy)
  fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]

  log.info('Processing {:,} images'.format(len(fp_ims)))

  # Build one record per file: subdir relative to the input root, stem, ext
  data = []
  for fp_im in tqdm(fp_ims):
    fpp_im = Path(fp_im)
    data.append({
      'subdir': str(fpp_im.parent.relative_to(opt_fp_in)),
      'fn': fpp_im.stem,
      'ext': fpp_im.suffix.replace('.', '')
    })

  # Save to CSV with a named index column for downstream joins
  file_utils.mkdirs(opt_fp_out)
  df = pd.DataFrame.from_dict(data)
  df.index.name = 'index'
  df.to_csv(opt_fp_out)
import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input directory')
@click.option('-o', '--output', 'opt_fp_out', required=True,
  help='Output directory')
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite file')
@click.option('--yaw', 'opt_yaw', type=(float, float), default=(-25,25),
  help='Yaw (min, max)')
@click.option('--roll', 'opt_roll', type=(float, float), default=(-15,15),
  help='Roll (min, max)')
@click.option('--pitch', 'opt_pitch', type=(float, float), default=(-10,10),
  help='Pitch (min, max)')
@click.option('--drop', 'opt_drop', type=click.Choice(['valid', 'invalid']), default='invalid',
  help='Drop valid or invalid poses')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_yaw, opt_roll, opt_pitch,
  opt_drop, opt_force):
  """Filter out exaggerated poses: keep (or drop) rows whose yaw/roll/pitch
  fall strictly inside the given (min, max) ranges."""

  from pathlib import Path

  import pandas as pd
  from tqdm import tqdm

  # Refuse to clobber an existing output unless --force is given
  if not opt_force and Path(opt_fp_out).exists():
    log.error('File exists. Use "-f / --force" to overwrite')
    return

  df_poses = pd.read_csv(opt_fp_in).set_index('index')

  # Apply optional (start, end) slice; (None, None) default keeps all rows
  df_poses = df_poses[opt_slice[0]:opt_slice[1]]

  log.info('Processing {:,} rows'.format(len(df_poses)))

  # Temporary column: 1 = pose within all ranges, 0 = exaggerated pose
  df_poses['valid'] = 0

  # Mark rows whose angles are strictly inside every (min, max) window
  for ds_pose in tqdm(df_poses.itertuples(), total=len(df_poses)):
    if opt_yaw[0] < ds_pose.yaw < opt_yaw[1] \
      and opt_roll[0] < ds_pose.roll < opt_roll[1] \
      and opt_pitch[0] < ds_pose.pitch < opt_pitch[1]:
      df_poses.at[ds_pose.Index, 'valid'] = 1

  # BUG FIX: original computed `drop_val = 0 if ... else 0` (both branches 0)
  # and then filtered on `int()` (always 0), so `--drop valid` could never
  # drop valid poses. Dropping 'invalid' removes valid==0 rows (the default,
  # matching the old accidental behavior); 'valid' removes valid==1 rows.
  drop_val = 1 if opt_drop == 'valid' else 0
  df_poses_filtered = df_poses.drop(df_poses[df_poses.valid == drop_val].index, axis=0)

  # Drop the temporary marker column before saving
  df_poses_filtered = df_poses_filtered.drop('valid', axis=1)

  # Save filtered poses
  df_poses_filtered.to_csv(opt_fp_out)
  log.info('Saved {:,} rows'.format(len(df_poses_filtered)))
import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()

lookup_types = ['image', 'identity']

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
  help='Input CSV file')
@click.option('-t', '--type', 'opt_type', default='image',
  type=click.Choice(lookup_types),
  help='Type of lookup')
@click.option('-d', '--dataset', 'opt_dataset', required=True,
  type=cfg.DatasetVar,
  default=click_utils.get_default(types.Dataset.LFW),
  show_default=True,
  help=click_utils.show_help(types.Dataset))
@click.option('--index', 'opt_index', required=True,
  help='Index to lookup')
@click.pass_context
def cli(ctx, opt_fp_in, opt_type, opt_dataset, opt_index):
  """Display image info for one row of a dataset CSV, looked up by index."""

  # BUG FIX: the signature previously declared `opt_fp_out` (no such option)
  # and omitted the declared `opt_type`/`opt_dataset`, so click raised a
  # TypeError on every invocation. Parameters now mirror the options above.

  import pandas as pd

  # Look up the index and display all information for that row
  df = pd.read_csv(opt_fp_in).set_index('index')
  try:
    row = df.loc[int(opt_index)]
  except (KeyError, ValueError):
    log.error(f'Index "{opt_index}" not found in {opt_fp_in}')
    return

  log.info(f'Lookup type: {opt_type}, dataset: {opt_dataset}')
  log.info(row.to_string())
\ No newline at end of file diff --git a/megapixels/commands/datasets/sha256.py b/megapixels/commands/datasets/sha256.py index c04fb504..4c734073 100644 --- a/megapixels/commands/datasets/sha256.py +++ b/megapixels/commands/datasets/sha256.py @@ -10,18 +10,18 @@ log = Logger.getLogger() @click.command() @click.option('-i', '--input', 'opt_fp_in', required=True, help='Input directory') -@click.option('-o', '--output', 'opt_fp_out', +@click.option('-m', '--media', 'opt_dir_media', required=True, + help='Input media directory') +@click.option('-o', '--output', 'opt_fp_out', required=True, help='Output directory') @click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), help='Slice list of files') -@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False, - help='Use glob recursion (slower)') @click.option('-t', '--threads', 'opt_threads', default=4, help='Number of threads') @click.option('-f', '--force', 'opt_force', is_flag=True, help='Force overwrite file') @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_recursive, opt_threads, opt_force): +def cli(ctx, opt_fp_in, opt_dir_media, opt_fp_out, opt_slice, opt_threads, opt_force): """Multithreading test""" from glob import glob @@ -42,47 +42,46 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_recursive, opt_threads, opt_f log.error('File exists. 
Use "-f / --force" to overwite') return - fp_ims = [] - for ext in ['jpg', 'png']: - if opt_recursive: - fp_glob = join(opt_fp_in, '**/*.{}'.format(ext)) - fp_ims += glob(fp_glob, recursive=True) - else: - fp_glob = join(opt_fp_in, '*.{}'.format(ext)) - fp_ims += glob(fp_glob) + df_files = pd.read_csv(opt_fp_in).set_index('index') if opt_slice: - fp_ims = fp_ims[opt_slice[0]:opt_slice[1]] + df_files = df_files[opt_slice[0]:opt_slice[1]] - log.info('Processing {:,} images'.format(len(fp_ims))) + log.info('Processing {:,} images'.format(len(df_files))) - pbar = tqdm(total=100) + + # prepare list of images to multithread into sha256s + file_objs = [] + for ds_file in df_files.itertuples(): + fp_im = join(opt_dir_media, str(ds_file.subdir), f"{ds_file.fn}.{ds_file.ext}") + file_objs.append({'fp': fp_im, 'index': ds_file.Index}) + + # convert to thread pool + pbar = tqdm(total=len(file_objs)) - def as_sha256(fp_im): + def as_sha256(file_obj): pbar.update(1) - return file_utils.sha256(fp_im) + file_obj['sha256'] = file_utils.sha256(file_obj['fp']) + return file_obj # multithread pool + pool_file_objs = [] st = time.time() pool = ThreadPool(opt_threads) - with tqdm(total=len(fp_ims)) as pbar: - sha256s = pool.map(as_sha256, fp_ims) + with tqdm(total=len(file_objs)) as pbar: + pool_file_objs = pool.map(as_sha256, file_objs) pbar.close() - + # convert data to dict data = [] - for i, fp_im in enumerate(fp_ims): - fpp_im = Path(fp_im) - subdir = str(fpp_im.parent.relative_to(opt_fp_in)) - sha256 = sha256s[i] + for pool_file_obj in pool_file_objs: data.append( { - 'sha256': sha256, - 'subdir': subdir, - 'fn': fpp_im.stem, - 'ext': fpp_im.suffix.replace('.','') + 'sha256': pool_file_obj['sha256'], + 'index': pool_file_obj['index'] }) # save to CSV + file_utils.mkdirs(opt_fp_out) df = pd.DataFrame.from_dict(data) df.to_csv(opt_fp_out, index=False) |
