diff options
Diffstat (limited to 'megapixels/commands/datasets/gen_filepath.py')
| -rw-r--r-- | megapixels/commands/datasets/gen_filepath.py | 102 |
1 files changed, 102 insertions, 0 deletions
"""
Begin with this file to process folder of images
- Converts folders and subdirectories into CSV with file attributes split
"""
import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

log = Logger.getLogger()


@click.command()
@click.option('-i', '--input', 'opt_fp_in',
              help='Override enum input directory of images')
@click.option('-o', '--output', 'opt_fp_out',
              help='Override enum output filename CSV')
@click.option('--data_store', 'opt_data_store',
              type=cfg.DataStoreVar,
              default=click_utils.get_default(types.DataStore.NAS),
              show_default=True,
              help=click_utils.show_help(types.DataStore))
@click.option('--dataset', 'opt_dataset',
              type=cfg.DatasetVar,
              required=True,
              show_default=True,
              help=click_utils.show_help(types.Dataset))
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
              help='Slice list of files')
@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
              help='Use glob recursion (slower)')
@click.option('-t', '--threads', 'opt_threads', default=4,
              help='Number of threads')
@click.option('-f', '--force', 'opt_force', is_flag=True,
              help='Force overwrite file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_data_store, opt_dataset, opt_slice,
        opt_recursive, opt_threads, opt_force):
    """Generate a filepath CSV (subdir, fn, ext) from a folder of images"""
    # Parameters (kept out of the docstring because click prints it as help):
    #   ctx          click context; not used by this command.
    #   opt_fp_in    image directory to glob; falls back to the data store's
    #                original-images location when None.
    #   opt_fp_out   CSV path to write; falls back to the data store's
    #                FILEPATH metadata location when None.
    #   opt_slice    (start, stop) applied to the globbed file list.
    #   opt_threads  accepted for CLI compatibility; not used here.
    #   opt_force    allow overwriting an existing output file.

    # Heavy imports stay function-local so that merely registering the
    # command does not pay for them.
    from glob import glob
    from os.path import join
    from pathlib import Path

    import pandas as pd
    from tqdm import tqdm

    from app.models import DataStore
    from app.utils import file_utils

    data_store = DataStore(opt_data_store, opt_dataset)
    if opt_fp_out is not None:
        fp_out = opt_fp_out
    else:
        fp_out = data_store.metadata(types.Metadata.FILEPATH)
    if not opt_force and Path(fp_out).exists():
        log.error('File exists. Use "-f / --force" to overwite')
        return

    # glob files
    if opt_fp_in is not None:
        fp_in = opt_fp_in
    else:
        fp_in = data_store.media_images_original()
    log.info(f'Globbing {fp_in}')
    # '**' only descends into subdirectories when recursive=True is passed
    pattern_prefix = '**/*.' if opt_recursive else '*.'
    fp_ims = []
    for ext in ['jpg', 'png']:
        fp_glob = join(fp_in, pattern_prefix + ext)
        fp_ims += glob(fp_glob, recursive=opt_recursive)

    if not fp_ims:
        # NOTE(review): `warn` is a deprecated alias on stdlib loggers; switch
        # to `warning` if the project Logger wraps `logging` - confirm.
        log.warn('No images. Try with "--recursive"')
        return

    # The default (None, None) is a truthy tuple and yields a full-list slice.
    if opt_slice:
        fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]

    if not fp_ims:
        # Without this guard the DataFrame below would have no 'subdir'
        # column and sort_values() would raise KeyError.
        log.error('Slice {} selected no images'.format(opt_slice))
        return

    log.info('Found {:,} images'.format(len(fp_ims)))

    # convert data to dict: one record per image, path split into parts
    data = []
    for fp_im in tqdm(fp_ims):
        fpp_im = Path(fp_im)
        data.append({
            'subdir': str(fpp_im.parent.relative_to(fp_in)),
            'fn': fpp_im.stem,
            'ext': fpp_im.suffix.replace('.', ''),
        })

    # save to CSV
    file_utils.mkdirs(fp_out)
    df_filepath = pd.DataFrame(data)
    df_filepath = df_filepath.sort_values(by=['subdir'], ascending=True)
    # renumber rows after sorting so the CSV index is contiguous
    df_filepath = df_filepath.reset_index(drop=True)
    df_filepath.index.name = 'index'
    df_filepath.to_csv(fp_out)
\ No newline at end of file |
