summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- megapixels/commands/datasets/download_ibmdif.py 10
1 files changed, 3 insertions, 7 deletions
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
index 3ac835e1..ed717662 100644
--- a/megapixels/commands/datasets/download_ibmdif.py
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -69,6 +69,9 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
log.debug(f'loading {opt_fp_in}')
df_records = pd.read_csv(opt_fp_in)
log.debug(f'loaded {len(df_records):,} csv records')
+ log.debug('deduplicating')
+ df_records = df_records.drop_duplicates(subset='sha256', keep="last")
+ log.debug(f'unique records {len(df_records):,}')
records = df_records.to_dict('records')
log.debug(f'loaded {len(records):,} items')
@@ -76,16 +79,10 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
n_skipped = 0
n_valids = 0
n_errors = 0
- sha256_list = []
for x in tqdm(records):
sha256 = x['sha256']
- if sha256 in sha256_list:
- continue;
- else:
- sha256_list.append(sha256)
-
fp_dst = join(opt_fp_out, f"{sha256}.json")
fp_dst_is_file = Path(fp_dst).is_file()
fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
@@ -106,7 +103,6 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
log.info(f'Error files: {n_errors:,} items')
log.info(f'Valid files: {n_valids:,} items')
log.info(f'skipping {n_skipped:,} items')
- log.info(f'Unique sha256s {len(sha256_list):,} items')
log.info(f'processing {num_items:,} items')
pool_results = []