diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-02-13 02:00:57 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-02-13 02:00:57 +0100 |
| commit | dc7d9cbba842472efb33186e97ee55751e4d50ca (patch) | |
| tree | ca12f9b2f381fee7590bc8587ee08b59ccff6487 | |
| parent | dc1889f15ab1b1338c557cda0b1bcd989e1fdf9b (diff) | |
| parent | 6b3923624f352b13633e83ac18ac2f7fd74be34a (diff) | |
Merge branch 'master' of github.com:adamhrv/megapixels_dev
| -rw-r--r-- | megapixels/commands/datasets/imdb_face_download.py | 60 | ||||
| -rw-r--r-- | megapixels/commands/templates/multithreaded.py | 17 |
2 files changed, 68 insertions, 9 deletions
import click

# Default locations for the IMDb-Face metadata CSV and the download target dir.
fp_in = '/data_store/datasets/people/imdb_face/downloads/IMDb-Face.csv'
fp_out = '/data_store_hdd/datasets/people/imdb_face/media/'

@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True, default=fp_in,
    help='Input')
@click.option('-o', '--output', 'opt_fp_out', required=True, default=fp_out,
    help='Output')
@click.option('-t', '--threads', 'opt_threads', default=8,
    help='Number of threads')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
    """Download IMDb-Face URLs

    Reads the IMDb-Face CSV (columns used: url, index, image), then downloads
    each URL to <output>/<index>/<image> using a thread pool, with a tqdm
    progress bar. Each item dict gains a 'status' key (True on success).
    """

    from os.path import join
    from multiprocessing.dummy import Pool as ThreadPool
    # NOTE: import the submodule explicitly -- a bare `import urllib` does not
    # guarantee `urllib.request` is bound.
    import urllib.request

    import pandas as pd
    from tqdm import tqdm
    from app.utils import file_utils
    from app.utils.logger_utils import Logger

    log = Logger.getLogger()

    # threaded worker: download one item, record success/failure in item['status']
    def pool_process(item):
        try:
            file_utils.mkdirs(item['fp'])
            # BUGFIX: urlretrieve() has no `timeout` kwarg (its 3rd positional
            # arg is a reporthook), so the original call raised TypeError.
            # Use urlopen() with a timeout and stream the body to disk instead.
            with urllib.request.urlopen(item['url'], timeout=20) as resp, \
                 open(item['fp'], 'wb') as f:
                f.write(resp.read())
            item['status'] = True
        except Exception as e:
            log.debug(f'Error: {e}')
            item['status'] = False
        pbar.update(1)
        return item

    # load CSV into a list of per-row dicts
    log.debug(f'loading {opt_fp_in}')
    records = pd.read_csv(opt_fp_in).to_dict('records')
    # str() guards against the CSV `index` column parsing as a non-string
    # (os.path.join rejects non-str components)
    pool_items = [{'url': x['url'], 'fp': join(opt_fp_out, str(x['index']), x['image'])}
                  for x in records]
    num_items = len(pool_items)
    log.info(f'processing {num_items:,} items')

    # run the multithreading with a single progress bar (the original built
    # two tqdm bars and leaked the first; it also wrapped pool_process in a
    # no-op functools.partial and never closed the pool)
    pool = ThreadPool(opt_threads)
    with tqdm(total=num_items) as pbar:
        pool_results = pool.map(pool_process, pool_items)
    pool.close()
    pool.join()
    return pool_results
pbar.close() + + diff --git a/megapixels/commands/templates/multithreaded.py b/megapixels/commands/templates/multithreaded.py index fec3dac4..a9b287f8 100644 --- a/megapixels/commands/templates/multithreaded.py +++ b/megapixels/commands/templates/multithreaded.py @@ -2,9 +2,9 @@ import click @click.command() @click.option('-i', '--input', 'opt_fp_in', required=True, - help='Input file') + help='Input') @click.option('-o', '--output', 'opt_fp_out', required=True, - help='Output file') + help='Output') @click.option('-t', '--threads', 'opt_threads', default=4, help='Number of threads') @click.pass_context @@ -22,28 +22,27 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads): log.info('multithreaded template') # setup multithreading function - def pool_process(data_obj): + def pool_process(item): # threaded function - global parse_yt_page results = [] try: - # do something here with data_obj + # do something here with item except Exception as e: log.debug(f'Error: {e}') pbar.update(1) return results # setup multithreading data holds - items = [] # list of dicts to process - results = [] - num_items = len(items) + pool_items = [] # list of dicts to process + pool_results = [] + num_items = len(pool_items) # run the multithreading with progress bar pbar = tqdm(total=num_items) pool_process = partial(pool_process) pool = ThreadPool(opt_threads) with tqdm(total=num_items) as pbar: - results = pool.map(pool_process, media_items) + pool_results = pool.map(pool_process, pool_items) pbar.close() |
