diff options
| author | adamhrv <adam@ahprojects.com> | 2019-03-13 17:47:58 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-03-13 17:47:58 +0100 |
| commit | b2dcfb3ef79557b7fccfdd94aa8ac4991552d7e1 (patch) | |
| tree | 1571e1d9d47ffe63b93670ae5870faca29b1bd3f /megapixels/commands/datasets/download_images.py | |
| parent | 1fe0e5c79c3cbc4b13083116980e62b449866100 (diff) | |
add downloader
Diffstat (limited to 'megapixels/commands/datasets/download_images.py')
| -rw-r--r-- | megapixels/commands/datasets/download_images.py | 82 |
1 file changed, 82 insertions, 0 deletions
import click


@click.command()
@click.option('-i', '--input', 'opt_fp_in', required=True,
              help='Input')
@click.option('-o', '--output', 'opt_fp_out', required=True,
              help='Output')
@click.option('-t', '--threads', 'opt_threads', default=8,
              help='Number of threads')
@click.pass_context
def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
    """Threaded image downloader

    CSV should be formatted as

    |url|filepath|
    |---|---|
    |https://site.com/photo.jpg|myfolder/myname.jpg|

    Downloads each url to ``<output>/<filepath>``. Failed downloads leave an
    empty ``<filepath>_error.txt`` sidecar so they are skipped on re-runs.
    Already-downloaded and previously-errored files are also skipped.
    """

    from os.path import join
    from pathlib import Path
    from multiprocessing.dummy import Pool as ThreadPool
    # BUG FIX: `import urllib` alone does not reliably expose the
    # `urllib.request` submodule; import it explicitly.
    import urllib.request

    import pandas as pd
    from tqdm import tqdm
    from app.utils import file_utils
    from app.utils.logger_utils import Logger

    log = Logger.getLogger()

    def pool_process(item):
        # Threaded worker: download one image, record success/failure on the
        # item dict. `pbar` is resolved from the enclosing scope at call time.
        fp_out = item['filepath']
        try:
            # ensure the destination directory exists, then fetch the image
            file_utils.mkdirs(item['filepath'])
            urllib.request.urlretrieve(item['url'], fp_out)
            item['status'] = True
        except Exception as e:
            log.debug(f'Error: {e}')
            # write an empty error sidecar so this URL is skipped next run
            fp_error = f'{fp_out}_error.txt'
            with open(fp_error, 'w') as fp:
                fp.write('')
            item['status'] = False
        pbar.update(1)
        return item

    # load the url/filepath records from the input CSV
    log.debug(f'loading {opt_fp_in}')
    records = pd.read_csv(opt_fp_in).to_dict('records')

    # build the work list, skipping files already downloaded or errored
    pool_items = []
    for x in tqdm(records):
        fp_dst = join(opt_fp_out, x['filepath'])
        fp_dst_is_file = Path(fp_dst).is_file()
        fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
        if not fp_dst_is_file and not fp_dst_is_err:
            pool_items.append({'url': x['url'], 'filepath': fp_dst})

    num_items = len(pool_items)
    log.info(f'processing {num_items:,} items')

    # run the downloads on a thread pool with a single progress bar
    # BUG FIX: the original created two tqdm bars (leaking the first),
    # never closed/joined the ThreadPool, and wrapped pool_process in a
    # no-op functools.partial. Context managers handle cleanup here.
    with tqdm(total=num_items) as pbar:
        with ThreadPool(opt_threads) as pool:
            pool_results = pool.map(pool_process, pool_items)
