summary | refs | log | tree | commit | diff
path: root/megapixels/commands/datasets/download_images.py
diff options
context:
space:
mode:
author: adamhrv <adam@ahprojects.com> 2019-03-19 12:20:38 +0100
committer: adamhrv <adam@ahprojects.com> 2019-03-19 12:20:38 +0100
commit: 53f6e26015e65b8696ed1a6e5c74bdfef14b3ac2 (patch)
tree: 8bf8b0019ff604b2165bc66e3b5deaba355b46af /megapixels/commands/datasets/download_images.py
parent: 389f1f162720b577fcc652c95620eadd5e77ec43 (diff)
add cmds
Diffstat (limited to 'megapixels/commands/datasets/download_images.py')
-rw-r--r-- megapixels/commands/datasets/download_images.py | 14
1 file changed, 11 insertions, 3 deletions
diff --git a/megapixels/commands/datasets/download_images.py b/megapixels/commands/datasets/download_images.py
index f1519c61..c64afbba 100644
--- a/megapixels/commands/datasets/download_images.py
+++ b/megapixels/commands/datasets/download_images.py
@@ -8,8 +8,10 @@ import click
help='Output')
@click.option('-t', '--threads', 'opt_threads', default=8,
help='Number of threads')
+@click.option('--wayback', 'opt_wayback', is_flag=True,
+ help='Check Wayback archive for URL and download cached image')
@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_wayback):
"""Threaded image downloader"""
"""
@@ -38,6 +40,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
# setup multithreading function
def pool_process(item):
# threaded function
+ url_wayback_base = 'https://archive.org/wayback/available?url='
fp_out = item['filepath']
try:
# download image
@@ -45,7 +48,12 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
urllib.request.urlretrieve(item['url'], fp_out)
item['status'] = True
except Exception as e:
- log.debug(f'Error: {e}')
+ log.debug(f'Error: {e}, url: {item["url"]}')
+ estr = str(e)
+ if item['opt_wayback']:
+ if 'HTTP Error' in estr:
+ # check
+ url_wayback = url_wayback_base + item['url']
fp_error = f'{fp_out}_error.txt'
with open(fp_error, 'w') as fp:
fp.write('')
@@ -64,7 +72,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads):
fp_dst_is_file = Path(fp_dst).is_file()
fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
if not fp_dst_is_file and not fp_dst_is_err:
- pool_items.append({'url':x['url'], 'filepath': fp_dst})
+ pool_items.append({'url':x['url'], 'filepath': fp_dst, 'opt_wayback': opt_wayback})
num_items = len(pool_items)
log.info(f'processing {num_items:,} items')