diff options
| author | adamhrv <adam@ahprojects.com> | 2019-03-19 12:20:38 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-03-19 12:20:38 +0100 |
| commit | 53f6e26015e65b8696ed1a6e5c74bdfef14b3ac2 (patch) | |
| tree | 8bf8b0019ff604b2165bc66e3b5deaba355b46af /megapixels/commands/datasets/download_images.py | |
| parent | 389f1f162720b577fcc652c95620eadd5e77ec43 (diff) | |
add cmds
Diffstat (limited to 'megapixels/commands/datasets/download_images.py')
| -rw-r--r-- | megapixels/commands/datasets/download_images.py | 14 |
1 files changed, 11 insertions, 3 deletions
```diff
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
index f1519c61..c64afbba 100644
--- a/megapixels/commands/datasets/download_images.py
+++ b/megapixels/commands/datasets/download_images.py
@@ -8,8 +8,10 @@ import click
   help='Output')
 @click.option('-t', '--threads', 'opt_threads', default=8,
   help='Number of threads')
+@click.option('--wayback', 'opt_wayback', is_flag=True,
+  help='Check Wayback archive for URL and download cached image')
 @click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
   """Threaded image downloader"""
 
   """
@@ -38,6 +40,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
   # setup multithreading function
   def pool_process(item):
     # threaded function
+    url_wayback_base = 'https://archive.org/wayback/available?url='
     fp_out = item['filepath']
     try:
       # download image
@@ -45,7 +48,12 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
       urllib.request.urlretrieve(item['url'], fp_out)
       item['status'] = True
     except Exception as e:
-      log.debug(f'Error: {e}')
+      log.debug(f'Error: {e}, url: {item["url"]}')
+      estr = str(e)
+      if item['opt_wayback']:
+        if 'HTTP Error' in estr:
+          # check
+          url_wayback = url_wayback_base + item['url']
       fp_error = f'{fp_out}_error.txt'
       with open(fp_error, 'w') as fp:
         fp.write('')
@@ -64,7 +72,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
     fp_dst_is_file = Path(fp_dst).is_file()
     fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
     if not fp_dst_is_file and not fp_dst_is_err:
-      pool_items.append({'url':x['url'], 'filepath': fp_dst})
+      pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
 
   num_items = len(pool_items)
   log.info(f'processing {num_items:,} items')
```
