summary | refs | log | tree | commit | diff
path: root/megapixels/commands/datasets/download_ibmdif.py
diff options
context:
space:
mode:
Diffstat (limited to 'megapixels/commands/datasets/download_ibmdif.py')
-rw-r--r-- megapixels/commands/datasets/download_ibmdif.py | 11
1 files changed, 8 insertions, 3 deletions
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
index ed717662..0b81fef6 100644
--- a/megapixels/commands/datasets/download_ibmdif.py
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -9,9 +9,11 @@ fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.t
help='Output path')
@click.option('-t', '--threads', 'opt_threads', default=8,
help='Number of threads')
+@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None),
+ help='Slice list of files')
@click.option('--agents', 'opt_fp_agents', default=fp_user_agents)
@click.pass_context
-def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
+def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_threads, opt_fp_agents):
"""Threaded image/file downloader"""
"""
@@ -56,6 +58,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
urllib.request.urlretrieve(item['url'], fp_out)
item['status'] = True
except Exception as e:
+ log.debug(f'Failed: user: {item["username"]}, url: {url}')
if str(e) != 'HTTP Error 403: Forbidden':
log.debug(f'Error: {e}')
fp_error = f'{fp_out}_error.txt'
@@ -68,6 +71,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
# setup multithreading data holders
log.debug(f'loading {opt_fp_in}')
df_records = pd.read_csv(opt_fp_in)
+ if opt_slice:
+ df_records = df_records[opt_slice[0]:opt_slice[1]]
log.debug(f'loaded {len(df_records):,} csv records')
log.debug('deduplicating')
df_records = df_records.drop_duplicates(subset='sha256', keep="last")
@@ -82,7 +87,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
for x in tqdm(records):
sha256 = x['sha256']
-
+ username = x['username']
fp_dst = join(opt_fp_out, f"{sha256}.json")
fp_dst_is_file = Path(fp_dst).is_file()
fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
@@ -95,7 +100,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
if not (fp_dst_is_file or fp_dst_is_err):
url = url_prefix + sha256 + '.json'
user_agent = user_agents[randint(0, len(user_agents)) - 1]
- pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+ pool_items.append({'url':url, 'username': username, 'filepath': fp_dst, 'user_agent': user_agent})
else:
n_skipped += 1