| | | |
|---|---|---|
| author | adamhrv <adam@ahprojects.com> | 2019-01-03 12:51:31 +0100 |
| committer | adamhrv <adam@ahprojects.com> | 2019-01-03 12:51:31 +0100 |
| commit | f215db6e84071077082d14f8366ae1cf1aea500f | |
| tree | 33e4573eb618f21685809cf567fdf196ff673a91 /megapixels/commands/datasets/records.py | |
| parent | 5340bee951c18910fd764241945f1f136b5a22b4 | |
fix roi index, clean up pose, roi, records, vector
Diffstat (limited to 'megapixels/commands/datasets/records.py')
| -rw-r--r-- | megapixels/commands/datasets/records.py | 167 |
1 file changed, 0 insertions, 167 deletions
```diff
diff --git a/megapixels/commands/datasets/records.py b/megapixels/commands/datasets/records.py
deleted file mode 100644
index b6ef618b..00000000
--- a/megapixels/commands/datasets/records.py
+++ /dev/null
@@ -1,167 +0,0 @@
-'''
-
-'''
-import click
-
-from app.settings import types
-from app.utils import click_utils
-from app.settings import app_cfg as cfg
-from app.utils.logger_utils import Logger
-
-log = Logger.getLogger()
-
-identity_sources = ['subdir', 'subdir_head', 'subdir_tail']
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', default=None,
-  help='Override enum input filename CSV')
-@click.option('-o', '--output', 'opt_fp_out', default=None,
-  help='Override enum output filename CSV')
-@click.option('-m', '--media', 'opt_dir_media', default=None,
-  help='Override enum media directory')
-@click.option('--data_store', 'opt_data_store',
-  type=cfg.DataStoreVar,
-  default=click_utils.get_default(types.DataStore.SSD),
-  show_default=True,
-  help=click_utils.show_help(types.DataStore))
-@click.option('--dataset', 'opt_dataset',
-  type=cfg.DatasetVar,
-  required=True,
-  show_default=True,
-  help=click_utils.show_help(types.Dataset))
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
-  help='Slice list of files')
-@click.option('-t', '--threads', 'opt_threads', default=12,
-  help='Number of threads')
-@click.option('-f', '--force', 'opt_force', is_flag=True,
-  help='Force overwrite file')
-@click.option('--identity', 'opt_identity', default=None, type=click.Choice(identity_sources),
-  help='Identity source, blank for no identity')
-@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
-  help='Use glob recursion (slower)')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads,
-  opt_identity, opt_force, opt_recursive):
-  """Generates sha256, uuid, and identity index CSV file"""
-
-  import sys
-  from os.path import join
-  from pathlib import Path
-  import time
-  from multiprocessing.dummy import Pool as ThreadPool
-  import random
-  import uuid
-
-  import pandas as pd
-  from tqdm import tqdm
-
-  from app.models.data_store import DataStore
-  from app.utils import file_utils, im_utils
-
-
-  # set data_store
-  data_store = DataStore(opt_data_store, opt_dataset)
-  # get filepath out
-  fp_out = data_store.metadata(types.Metadata.FILE_RECORD) if opt_fp_out is None else opt_fp_out
-  # exit if exists
-  if not opt_force and Path(fp_out).exists():
-    log.error('File exists. Use "-f / --force" to overwrite')
-    return
-
-  # ----------------------------------------------------------------
-  # glob files
-
-  fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original()
-  log.info(f'Globbing {fp_in}')
-  fp_ims = file_utils.glob_multi(fp_in, ['jpg', 'png'], recursive=opt_recursive)
-  # fail if none
-  if not fp_ims:
-    log.error('No images. Try with "--recursive"')
-    return
-  # slice to reduce
-  if opt_slice:
-    fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]
-  log.info('Found {:,} images'.format(len(fp_ims)))
-
-
-  # ----------------------------------------------------------------
-  # multithread process into SHA256
-
-  def as_sha256(fp_im):
-    pbar.update(1)
-    return file_utils.sha256(fp_im)
-
-  # hash files in a thread pool; workers update the shared progress bar
-  pool = ThreadPool(opt_threads)
-  with tqdm(total=len(fp_ims)) as pbar:
-    sha256s = pool.map(as_sha256, fp_ims)
-
-
-  # ----------------------------------------------------------------
-  # convert data to dict
-
-  data = []
-  identity_count = 0
-  for sha256, fp_im in zip(sha256s, fp_ims):
-    fpp_im = Path(fp_im)
-    subdir = str(fpp_im.parent.relative_to(fp_in))
-
-    if opt_identity:
-      subdirs = subdir.split('/')
-      if not len(subdirs) > 0:
-        log.error(f'Could not split subdir: "{subdir}". Try a different option for "--identity"')
-        log.error('exiting')
-        return
-      if opt_identity == 'subdir':
-        identity = subdirs[0]  # use first/only part
-      elif opt_identity == 'subdir_head':
-        identity = subdirs[0]  # use first part of subdir path
-      elif opt_identity == 'subdir_tail':
-        identity = subdirs[-1]  # use last part of subdir path
-    else:
-      identity = identity_count  # use incrementing number
-      identity_count += 1
-
-    data.append({
-      'subdir': subdir,
-      'fn': fpp_im.stem,
-      'ext': fpp_im.suffix.replace('.', ''),
-      'sha256': sha256,
-      'uuid': uuid.uuid4(),
-      'identity_key': identity
-    })
-
-  df_records = pd.DataFrame.from_dict(data)
-  if opt_identity:
-    log.info(f'adding identity index using: "{opt_identity}". This may take a while...')
-    # sort based on identity_key
-    df_records = df_records.sort_values(by=['identity_key'], ascending=True)
-    # add new column for identity
-    df_records['identity_index'] = [-1] * len(df_records)
-    # populate identity_index by enumerating the identity_key groups
-    df_records_identity_groups = df_records.groupby('identity_key')
-    for identity_index, (identity_key, df_records_identity_group) in enumerate(df_records_identity_groups):
-      for ds_record in df_records_identity_group.itertuples():
-        df_records.at[ds_record.Index, 'identity_index'] = identity_index
-    # reset index after being sorted
-    df_records = df_records.reset_index(drop=True)
-
-  df_records.index.name = 'index'  # reassign 'index' as primary key column
-  # write to CSV
-  file_utils.mkdirs(fp_out)
-  df_records.to_csv(fp_out)
-  # done
-  log.info(f'wrote rows: {len(df_records)} to {fp_out}')
\ No newline at end of file
```
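
A note on the globbing step: `file_utils.glob_multi` is internal to the repo and not part of this diff, so its matching rules are not visible here. The following is only a hypothetical stand-in that matches how the deleted script calls it (a root path, a list of extensions, and a `recursive` flag):

```python
# Hypothetical stand-in for the repo's file_utils.glob_multi; the real
# helper is not shown in this diff, so the matching rules are assumptions.
from pathlib import Path

def glob_multi(root, exts, recursive=False):
    """Collect files with any of the given extensions under root."""
    files = []
    for ext in exts:
        # '**/' recurses into subdirectories; plain '*.' stays at the top level
        pattern = f'**/*.{ext}' if recursive else f'*.{ext}'
        files.extend(str(p) for p in Path(root).glob(pattern))
    return sorted(files)
```

Called as `glob_multi('/data/images', ['jpg', 'png'], recursive=True)`, this would return a sorted list of image paths, consistent with the "No images. Try with `--recursive`" error path when images live in subdirectories.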
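The hashing step pairs `multiprocessing.dummy.Pool` (a thread pool) with a shared tqdm progress bar. Here is a minimal self-contained sketch of the same pattern, assuming plain `hashlib` in place of the repo's `file_utils.sha256` helper:

```python
import hashlib
from multiprocessing.dummy import Pool as ThreadPool

from tqdm import tqdm

def sha256_of_file(path, chunk_size=65536):
    # read in chunks so large images are never fully loaded into memory
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

def hash_files(paths, n_threads=12):
    # imap yields results lazily, so tqdm advances as each file finishes;
    # threads help because the work is I/O-bound and hashlib releases the
    # GIL while digesting large buffers
    with ThreadPool(n_threads) as pool:
        return list(tqdm(pool.imap(sha256_of_file, paths), total=len(paths)))
```

Wrapping `pool.imap` in tqdm avoids calling `pbar.update()` from worker threads, which is the part the deleted script handled with a closure over the bar.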
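The identity-index block sorts by `identity_key` and then walks every row of every group in Python. pandas can compute the same 0-based group numbering in one call with `groupby(...).ngroup()`; a sketch with made-up records in the shape the deleted script builds per image:

```python
import uuid

import pandas as pd

# made-up records; real ones carry subdir, fn, ext, sha256, uuid, identity_key
records = [
    {'fn': 'img_001', 'ext': 'jpg', 'sha256': 'aa...', 'uuid': uuid.uuid4(), 'identity_key': 'alice'},
    {'fn': 'img_002', 'ext': 'jpg', 'sha256': 'bb...', 'uuid': uuid.uuid4(), 'identity_key': 'bob'},
    {'fn': 'img_003', 'ext': 'png', 'sha256': 'cc...', 'uuid': uuid.uuid4(), 'identity_key': 'alice'},
]
df_records = pd.DataFrame.from_dict(records)
# ngroup() numbers each identity_key group 0..n-1 (in sorted key order by
# default), replacing the per-row Python loop over groupby groups
df_records['identity_index'] = df_records.groupby('identity_key').ngroup()
df_records.index.name = 'index'
print(df_records[['fn', 'identity_key', 'identity_index']])
```

Because `ngroup()` is vectorized, it also removes the need to pre-sort and then `reset_index` the frame just to assign contiguous identity numbers.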
