# NOTE(review): lines of stray line-number residue ("1".."60") and a bare "|"
# (a syntax error) were removed here — artifacts of a bad copy/paste or export.
import click
# Default input: IMDb-Face index CSV (columns used downstream: url, index, image).
fp_in = '/data_store/datasets/people/imdb_face/downloads/IMDb-Face.csv'
# Default output: root directory where downloaded media files are written.
fp_out = '/data_store_hdd/datasets/people/imdb_face/media/'
@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
  help='Input')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
  help='Output')
@click.option('-t', '--threads', 'opt_threads', default=8,
  help='Number of threads')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
  """Download IMDb-Face URLs"""
  from os.path import join
  from multiprocessing.dummy import Pool as ThreadPool
  # Bug fix: plain `import urllib` does not expose the `urllib.request` submodule.
  import urllib.request
  import pandas as pd
  from tqdm import tqdm
  from app.utils import file_utils
  from app.utils.logger_utils import Logger

  log = Logger.getLogger()

  def pool_process(item):
    """Threaded worker: download one URL to item['fp']; sets item['status']."""
    try:
      file_utils.mkdirs(item['fp'])
      # Bug fix: urlretrieve() takes no `timeout` argument (the original call
      # raised TypeError on every item). Use urlopen(timeout=...) and write
      # the response body ourselves.
      with urllib.request.urlopen(item['url'], timeout=20) as resp:
        with open(item['fp'], 'wb') as f:
          f.write(resp.read())
      item['status'] = True
    except Exception as e:
      # Best-effort download: log and mark the item failed, keep going.
      log.debug(f'Error: {e}')
      item['status'] = False
    pbar.update(1)
    return item

  # Build the worklist from the CSV.
  # NOTE(review): assumes columns 'url', 'index' (identity id) and 'image'
  # (filename) — matches the IMDb-Face CSV layout; confirm against the file.
  log.debug(f'loading {opt_fp_in}')
  records = pd.read_csv(opt_fp_in).to_dict('records')
  pool_items = [{'url': x['url'], 'fp': join(opt_fp_out, x['index'], x['image'])} for x in records]
  num_items = len(pool_items)
  log.info(f'processing {num_items:,} items')

  # Run the downloads with a single shared progress bar (the original
  # created two tqdm bars and never closed the first). The pool is now
  # closed and joined so worker threads are always released.
  with tqdm(total=num_items) as pbar:
    pool = ThreadPool(opt_threads)
    try:
      pool_results = pool.map(pool_process, pool_items)
    finally:
      pool.close()
      pool.join()