summaryrefslogtreecommitdiff
path: root/megapixels/commands/datasets/gen_filepath.py
blob: 5db405c046cb6321b8661eb442215ce7eaa0922e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Begin with this file to process a folder of images
- Converts folders and subdirectories into a CSV with file attributes split
"""
import click

from app.settings import types
from app.utils import click_utils
from app.settings import app_cfg as cfg
from app.utils.logger_utils import Logger

# Module-level logger used by the command below for info/warning/error output
log = Logger.getLogger()

@click.command()
@click.option('-i', '--input', 'opt_fp_in',
  help='Override enum input directory of images')
@click.option('-o', '--output', 'opt_fp_out',
  help='Override enum output filename CSV')
@click.option('--data_store', 'opt_data_store',
  type=cfg.DataStoreVar,
  default=click_utils.get_default(types.DataStore.NAS),
  show_default=True,
  help=click_utils.show_help(types.DataStore))
@click.option('--dataset', 'opt_dataset',
  type=cfg.DatasetVar,
  required=True,
  show_default=True,
  help=click_utils.show_help(types.Dataset))
@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
  help='Slice list of files')
@click.option('--recursive/--no-recursive', 'opt_recursive', is_flag=True, default=False,
  help='Use glob recursion (slower)')
@click.option('-t', '--threads', 'opt_threads', default=4,
  help='Number of threads')
@click.option('-f', '--force', 'opt_force', is_flag=True,
  help='Force overwrite file')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_data_store, opt_dataset, opt_slice,
  opt_recursive, opt_threads, opt_force):
  """Generate a CSV of image filepaths split into subdir, filename and extension

  Globs the input directory for jpg/png files and writes one row per image
  (columns: subdir, fn, ext) to the output CSV, sorted by subdir.
  """
  # NOTE: opt_threads is accepted for interface compatibility but is not used;
  # the filepath split below runs in a single thread.

  # imports are kept local to the command, following the file's existing style
  from glob import glob
  from os.path import join
  from pathlib import Path

  import pandas as pd
  from tqdm import tqdm

  from app.models.data_store import DataStore
  from app.utils import file_utils

  # resolve the output CSV path: explicit override first, else the data store's
  # filepath metadata location
  data_store = DataStore(opt_data_store, opt_dataset)
  fp_out = opt_fp_out if opt_fp_out is not None else data_store.metadata(types.Metadata.FILEPATH)
  if not opt_force and Path(fp_out).exists():
    log.error('File exists. Use "-f / --force" to overwrite')
    return

  # glob files
  fp_in = opt_fp_in if opt_fp_in is not None else data_store.media_images_original()
  fp_ims = []
  log.info(f'Globbing {fp_in}')
  for ext in ['jpg', 'png']:
    if opt_recursive:
      # "**" only descends into subdirectories when recursive=True is passed
      fp_glob = join(fp_in, '**/*.{}'.format(ext))
      fp_ims += glob(fp_glob, recursive=True)
    else:
      fp_glob = join(fp_in, '*.{}'.format(ext))
      fp_ims += glob(fp_glob)

  if not fp_ims:
    log.warn('No images. Try with "--recursive"')
    return

  # the default (None, None) is a non-empty tuple, so this branch also runs when
  # "--slice" is omitted; slicing with [None:None] keeps the full list
  if opt_slice:
    fp_ims = fp_ims[opt_slice[0]:opt_slice[1]]

  log.info('Found {:,} images'.format(len(fp_ims)))

  # convert data to dict
  data = []
  for fp_im in tqdm(fp_ims):
    fpp_im = Path(fp_im)
    # directory of the image relative to the globbed root ("." for top-level files)
    subdir = str(fpp_im.parent.relative_to(fp_in))
    data.append({
      'subdir': subdir,
      'fn': fpp_im.stem,
      'ext': fpp_im.suffix.replace('.', '')
      })

  # save to CSV
  file_utils.mkdirs(fp_out)
  df_filepath = pd.DataFrame.from_dict(data)
  df_filepath = df_filepath.sort_values(by=['subdir'], ascending=True)
  # drop=True discards the pre-sort index; without it the old index is kept as a
  # data column named "index", which duplicates the index name set below and
  # yields two "index" header columns in the CSV
  df_filepath = df_filepath.reset_index(drop=True)
  df_filepath.index.name = 'index'
  df_filepath.to_csv(fp_out)