summaryrefslogtreecommitdiff
path: root/megapixels
diff options
context:
space:
mode:
authoradamhrv <adam@ahprojects.com>2019-03-14 15:08:31 +0100
committeradamhrv <adam@ahprojects.com>2019-03-14 15:08:31 +0100
commitf1ceb3953fae55a06c94a058f90ed278cf0240f5 (patch)
tree2aae68c2787597aa78bcf3e010b2b92190e30e86 /megapixels
parent26646e6adf3833f6282e9515c14ad61e485440c0 (diff)
uniq sha
Diffstat (limited to 'megapixels')
-rw-r--r--megapixels/commands/datasets/download_ibmdif.py35
1 file changed, 31 insertions, 4 deletions
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
index 48aca5f0..3ac835e1 100644
--- a/megapixels/commands/datasets/download_ibmdif.py
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -67,19 +67,46 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
# setup multithreading data holders
log.debug(f'loading {opt_fp_in}')
- records = pd.read_csv(opt_fp_in).to_dict('records')
+ df_records = pd.read_csv(opt_fp_in)
+ log.debug(f'loaded {len(df_records):,} csv records')
+ records = df_records.to_dict('records')
+ log.debug(f'loaded {len(records):,} items')
pool_items = []
+ n_skipped = 0
+ n_valids = 0
+ n_errors = 0
+ sha256_list = []
+
for x in tqdm(records):
- fp_dst = join(opt_fp_out, x['sha256'] + '.json')
+ sha256 = x['sha256']
+
+ if sha256 in sha256_list:
+ continue;
+ else:
+ sha256_list.append(sha256)
+
+ fp_dst = join(opt_fp_out, f"{sha256}.json")
fp_dst_is_file = Path(fp_dst).is_file()
fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
- if not fp_dst_is_file and not fp_dst_is_err:
- url = url_prefix + x['sha256'] + '.json'
+
+ if fp_dst_is_file:
+ n_valids += 1
+ elif fp_dst_is_err:
+ n_errors += 1
+
+ if not (fp_dst_is_file or fp_dst_is_err):
+ url = url_prefix + sha256 + '.json'
user_agent = user_agents[randint(0, len(user_agents)) - 1]
pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+ else:
+ n_skipped += 1
num_items = len(pool_items)
+ log.info(f'Error files: {n_errors:,} items')
+ log.info(f'Valid files: {n_valids:,} items')
+ log.info(f'skipping {n_skipped:,} items')
+ log.info(f'Unique sha256s {len(sha256_list):,} items')
log.info(f'processing {num_items:,} items')
pool_results = []