import click

fp_in = '/data_store/datasets/people/imdb_face/downloads/IMDb-Face.csv'
fp_out = '/data_store_hdd/datasets/people/imdb_face/media/'

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
  help='Path to the IMDb-Face CSV file')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
  help='Output directory for downloaded images')
@click.option('-t', '--threads', 'opt_threads', default=8,
  help='Number of download threads')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
  """Download IMDb-Face URLs"""

  from os.path import join
  from multiprocessing.dummy import Pool as ThreadPool
  import urllib.request

  import pandas as pd
  from tqdm import tqdm

  from app.utils import file_utils
  from app.utils.logger_utils import Logger

  log = Logger.getLogger()

  # threaded worker: download one image and record success/failure
  def pool_process(item):
    try:
      # ensure the output directory exists
      file_utils.mkdirs(item['fp'])
      # urllib.request.urlretrieve does not accept a timeout argument,
      # so read via urlopen with a timeout and write the bytes to disk
      with urllib.request.urlopen(item['url'], timeout=20) as resp, \
           open(item['fp'], 'wb') as fh:
        fh.write(resp.read())
      item['status'] = True
    except Exception as e:
      log.debug(f'Error: {e}')
      item['status'] = False
    pbar.update(1)
    return item

  # build the list of download tasks from the CSV
  log.debug(f'loading {opt_fp_in}')
  records = pd.read_csv(opt_fp_in).to_dict('records')
  pool_items = [{'url': x['url'], 'fp': join(opt_fp_out, x['index'], x['image'])}
                for x in records]
  num_items = len(pool_items)
  log.info(f'processing {num_items:,} items')

  # run the downloads across the thread pool with a progress bar
  pool = ThreadPool(opt_threads)
  with tqdm(total=num_items) as pbar:
    pool_results = pool.map(pool_process, pool_items)
  pool.close()
  pool.join()
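

# Usage sketch (assumption: this module is run directly as a standalone
# script; in the original repo the `cli` command may instead be registered
# on a click group, in which case this guard is unnecessary). The filename
# below is hypothetical:
#
#   python download_imdb_face.py \
#     -i /data_store/datasets/people/imdb_face/downloads/IMDb-Face.csv \
#     -o /data_store_hdd/datasets/people/imdb_face/media/ \
#     -t 16
if __name__ == '__main__':
  cli()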