| author | jules@lens <julescarbon@gmail.com> | 2019-10-10 13:33:31 +0200 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2019-10-10 13:33:31 +0200 |
| commit | 7d72cbb935ec53ce66c6a0c5cdc68f157be1d35f (patch) | |
| tree | a44049683c3c5e44449fe2698bb080329ecf7e61 /megapixels/commands/datasets | |
| parent | 488a65aa5caba91c1384e7bcb2023056e913fc22 (diff) | |
| parent | cdc0c7ad21eb764cfe36d7583e126660d87fe02d (diff) | |
Merge branch 'master' of asdf.us:megapixels_dev
Diffstat (limited to 'megapixels/commands/datasets')
| -rw-r--r-- | megapixels/commands/datasets/file_record.py | 234 |
| -rw-r--r-- | megapixels/commands/datasets/megaface_age_from_orig.py | 62 |
2 files changed, 62 insertions, 234 deletions
diff --git a/megapixels/commands/datasets/file_record.py b/megapixels/commands/datasets/file_record.py
deleted file mode 100644
index 41a5df28..00000000
--- a/megapixels/commands/datasets/file_record.py
+++ /dev/null
@@ -1,234 +0,0 @@
-'''
-
-'''
-import click
-
-from app.settings import types
-from app.utils import click_utils
-from app.settings import app_cfg as cfg
-from app.utils.logger_utils import Logger
-
-log = Logger.getLogger()
-
-# Choose part of the filepath that will be used for the person identity
-# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_tail] --> "barack_obama"
-# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir_head] --> "batch_1"
-# eg subdirectory "lfw/media/original/batch_1/train/barack_obama/001.jpg" --> [subdir] --> "barack_obama"
-
-identity_sources = ['subdir', 'numeric']
-
-@click.command()
-@click.option('-i', '--input', 'opt_fp_in', default=None,
-  help='Override enum input filename CSV')
-@click.option('-o', '--output', 'opt_fp_out', default=None,
-  help='Override enum output filename CSV')
-@click.option('-m', '--media', 'opt_dir_media', default=None,
-  help='Override enum media directory')
-@click.option('--data_store', 'opt_data_store',
-  type=cfg.DataStoreVar,
-  default=click_utils.get_default(types.DataStore.HDD),
-  show_default=True,
-  help=click_utils.show_help(types.Dataset))
-@click.option('--dataset', 'opt_dataset',
-  type=cfg.DatasetVar,
-  required=True,
-  show_default=True,
-  help=click_utils.show_help(types.Dataset))
-@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
-  help='Slice list of files')
-@click.option('-t', '--threads', 'opt_threads', default=12,
-  help='Number of threads')
-@click.option('-f', '--force', 'opt_force', is_flag=True,
-  help='Force overwrite file')
-@click.option('--identity', 'opt_identity', type=click.Choice(identity_sources),
-  required=True,
-  help='Identity source key')
-@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
-  help='Use glob recursion (slower)')
-@click.option('--max-depth', 'opt_max_depth', default=None, type=int,
-  help='Max number of images per subdirectory')
-@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_dataset, opt_data_store, opt_dir_media, opt_slice, opt_threads,
-  opt_identity, opt_force, opt_recursive, opt_max_depth):
-  """Generates sha256, uuid, and identity index CSV file"""
-
-  import sys, os
-  from glob import glob
-  from os.path import join
-  from pathlib import Path
-  import time
-  from multiprocessing.dummy import Pool as ThreadPool
-  import random
-  import uuid
-
-  from PIL import Image
-  import cv2 as cv
-  import pandas as pd
-  from tqdm import tqdm
-  from glob import glob
-  from operator import itemgetter
-
-  from app.models.data_store import DataStore
-  from app.utils import file_utils, im_utils
-
-
-  # set data_store
-  data_store = DataStore(opt_data_store, opt_dataset)
-  # get filepath out
-  fp_out = data_store.metadata(types.Metadata.FILE_RECORD) if opt_fp_out is None else opt_fp_out
-  # exit if exists
-  if not opt_force and Path(fp_out).exists():
-    log.error('File exists. Use "-f / --force" to overwite')
-    return
-
-  # ----------------------------------------------------------------
-  # glob files
-
-  fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original()
-  log.info(f'Globbing {fp_in}')
-  fp_ims = file_utils.glob_multi(fp_in, ['jpg', 'png'], recursive=opt_recursive)
-
-  log.info('Found {:,} images'.format(len(fp_ims)))
-  subdir_groups = {}
-  if opt_max_depth:
-    log.debug(f'using max depth: {opt_max_depth}')
-    for fp_im in fp_ims:
-      fpp_im = Path(fp_im)
-
-      subdir = fp_im.split('/')[-2]
-      if not subdir in subdir_groups.keys():
-        subdir_groups[subdir] = []
-      else:
-        subdir_groups[subdir].append(fp_im)
-    # for each subgroup, limit number of files
-    fp_ims = []
-    for subdir_name, items in subdir_groups.items():
-      ims = items[0:opt_max_depth]
-      fp_ims += ims
-
-    log.debug(f'num subdirs: {len(subdir_groups.keys())}')
-  # fail if none
-  if not fp_ims:
-    log.error('No images. Try with "--recursive"')
-    return
-  # slice to reduce
-  if opt_slice:
-    fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]
-  log.info('Found {:,} images'.format(len(fp_ims)))
-
-  # ----------------------------------------------------------------
-  # multithread process into SHA256
-
-  pbar = tqdm(total=len(fp_ims))
-
-  def pool_mapper(fp_im):
-    pbar.update(1)
-    try:
-      sha256 = file_utils.sha256(fp_im)
-      im = Image.open(fp_im)
-      im.verify()  # throws error if bad file
-      assert(im.size[0] > 60 and im.size[1] > 60)
-    except Exception as e:
-      log.warn(f'skipping file: {fp_im}')
-      return None
-    im = cv.imread(fp_im)
-    w, h = im.shape[:2][::-1]
-    file_size_kb = os.stat(fp_im).st_size // 1000
-    num_channels = im_utils.num_channels(im)
-    return {
-      'width': w,
-      'height': h,
-      'sha256': sha256,
-      'file_size_kb': file_size_kb,
-      'num_channels': num_channels
-    }
-
-  # convert to thread pool
-  pool_maps = []  # ?
-  pool = ThreadPool(opt_threads)
-  with tqdm(total=len(fp_ims)) as pbar:
-    pool_maps = pool.map(pool_mapper, fp_ims)
-  pbar.close()
-
-
-  # ----------------------------------------------------------------
-  # convert data to dict
-
-  data = []
-  indentity_count = 0
-  for pool_map, fp_im in zip(pool_maps, fp_ims):
-    if pool_map is None:
-      log.warn(f'skipping file: {fp_im}')
-      continue  # skip error files
-    fpp_im = Path(fp_im)
-    subdir = str(fpp_im.parent.relative_to(fp_in))
-
-    if opt_identity:
-      subdirs = subdir.split('/')
-      if not len(subdirs) > 0:
-        log.error(f'Could not split subdir: "{subdir}. Try different option for "--identity"')
-        log.error('exiting')
-        return
-      if opt_identity == 'subdir':
-        identity = subdirs[-1]  # use last part of subdir path
-      elif opt_identity == 'numeric':
-        identity = indentity_count  # use incrementing number
-        indentity_count += 1
-    else:
-      identity = ''
-
-    data.append({
-      'subdir': subdir,
-      'num_channels': pool_map['num_channels'],
-      'fn': fpp_im.stem,
-      'ext': fpp_im.suffix.replace('.',''),
-      'sha256': pool_map['sha256'],
-      'uuid': uuid.uuid4(),
-      'identity_key': identity,
-      'width': pool_map['width'],
-      'height': pool_map['height']
-    })
-
-  # create dataframe
-  df_records = pd.DataFrame.from_dict(data)
-
-  df_records.index.name = 'index'  # reassign 'index' as primary key column
-  # write to CSV
-  file_utils.mkdirs(fp_out)
-  df_records.to_csv(fp_out)
-  # done
-  log.info(f'wrote {len(df_records)} rows to "{fp_out}"')
-  # save script
-  cmd_line = ' '.join(sys.argv)
-  file_utils.write_text(cmd_line, '{}.sh'.format(fp_out))
-
-
-'''
-# create dataframe
-  df_records = pd.DataFrame.from_dict(data)
-
-  # add identity key (used for associating identity)
-  if opt_identity:
-    log.info(f'adding identity index using: "{opt_identity}" subdirectory')
-    # convert dict to DataFrame
-    # sort based on identity_key
-    df_records = df_records.sort_values(by=['identity_key'], ascending=True)
-    # add new column for identity
-    df_records['identity_index'] = [-1] * len(df_records)
-    # populate the identity_index
-    df_records_identity_groups = df_records.groupby('identity_key')
-    # enumerate groups to create identity indices
-    log.info(f'updating records with identity_key. This may take a while...')
-    st = time.time()
-    for identity_index, df_records_identity_group_tuple in enumerate(df_records_identity_groups):
-      identity_key, df_records_identity_group = df_records_identity_group_tuple
-      for ds_record in df_records_identity_group.itertuples():
-        df_records.at[ds_record.Index, 'identity_index'] = identity_index
-    # reset index after being sorted
-    df_records = df_records.reset_index(drop=True)
-    log.debug('update time: {:.2f}s'.format(time.time() - st))
-  else:
-    # name everyone person 1, 2, 3...
-    df_records = df_records.sort_values(by=['subdir'], ascending=True)
-    pass
-'''
\ No newline at end of file
diff --git a/megapixels/commands/datasets/megaface_age_from_orig.py b/megapixels/commands/datasets/megaface_age_from_orig.py
new file mode 100644
index 00000000..489bebf3
--- /dev/null
+++ b/megapixels/commands/datasets/megaface_age_from_orig.py
@@ -0,0 +1,62 @@
+import click
+
+@click.command()
+@click.option('-i', '--input', 'opt_fp_in', required=True,
+  help='Input path to metadata directory')
+@click.option('-o', '--output', 'opt_fp_out',
+  help='Output path to age CSV')
+@click.pass_context
+def cli(ctx, opt_fp_in, opt_fp_out):
+  """Creates CSV of MegaFace ages from original BBoxes"""
+
+  import os
+  from os.path import join
+  from pathlib import Path
+  from glob import glob
+
+  import dlib
+  import pandas as pd
+  from tqdm import tqdm
+
+  from app.settings import types
+  from app.utils import click_utils
+  from app.settings import app_cfg
+
+  from PIL import Image, ImageOps, ImageFilter
+  from app.utils import file_utils, im_utils, logger_utils
+
+  log = logger_utils.Logger.getLogger()
+
+  # -------------------------------------------------
+  # process
+  fp_im_dirs = glob(join(opt_fp_in, '**/'), recursive=True)
+
+  log.info('Found {} directories'.format(len(fp_im_dirs)))
+
+  identities = {}
+
+  for fp_im_dir in tqdm(fp_im_dirs):
+    # 1234567@N05_identity_1
+    try:
+      dir_id_name = Path(fp_im_dir).name
+      nsid = dir_id_name.split('_')[0]
+      identity_num = dir_id_name.split('_')[2]
+      id_key = '{}_{}'.format(nsid, identity_num)
+      num_images = len(glob(join(fp_im_dir, '*.jpg')))
+      if not id_key in identities.keys():
+        identities[id_key] = {'nsid': nsid, 'identity': identity_num, 'images': num_images}
+      else:
+        identities[id_key]['images'] += num_images
+    except Exception as e:
+      continue
+
+  # convert to dict
+  identities_list = [v for k, v in identities.items()]
+  df = pd.DataFrame.from_dict(identities_list)
+
+  file_utils.mkdirs(opt_fp_out)
+
+  log.info('Wrote {} lines to {}'.format(len(df), opt_fp_out))
+  df.to_csv(opt_fp_out, index=False)
+
+
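For reference, a minimal sketch of the directory-name parsing the new `megaface_age_from_orig.py` command relies on is shown below. The sample folder name `1234567@N05_identity_1` comes from the comment in the diff; the standalone helper `parse_identity_dir` and the example path are hypothetical and not part of the committed module.

```python
from pathlib import Path

def parse_identity_dir(fp_im_dir):
    # e.g. "1234567@N05_identity_1" -> nsid "1234567@N05", identity number "1"
    dir_id_name = Path(fp_im_dir).name
    parts = dir_id_name.split('_')
    nsid, identity_num = parts[0], parts[2]
    id_key = '{}_{}'.format(nsid, identity_num)
    return id_key, nsid, identity_num

# hypothetical directory path, mirroring the glob('**/') results used in the diff
print(parse_identity_dir('/data/megaface/1234567@N05_identity_1/'))
# -> ('1234567@N05_1', '1234567@N05', '1')
```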
