diff options
| author | adamhrv <adam@ahprojects.com> | 2019-03-14 15:23:29 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-03-14 15:23:29 +0100 |
| commit | 389f1f162720b577fcc652c95620eadd5e77ec43 (patch) | |
| tree | b707e30b54c0549a508ae6dade480b6ef2e6bdbd /megapixels | |
| parent | df6db850a37d1a14a852b7932bef0c6bc5a2d27c (diff) | |
drop dupes
Diffstat (limited to 'megapixels')
| -rw-r--r-- | megapixels/commands/datasets/download_ibmdif.py | 10 |
1 files changed, 3 insertions, 7 deletions
```diff
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
index 3ac835e1..ed717662 100644
--- a/megapixels/commands/datasets/download_ibmdif.py
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -69,6 +69,9 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
   log.debug(f'loading {opt_fp_in}')
   df_records = pd.read_csv(opt_fp_in)
   log.debug(f'loaded {len(df_records):,} csv records')
+  log.debug('deduplicating')
+  df_records = df_records.drop_duplicates(subset='sha256', keep="last")
+  log.debug(f'unique records {len(df_records):,}')
   records = df_records.to_dict('records')
   log.debug(f'loaded {len(records):,} items')
@@ -76,16 +79,10 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
   n_skipped = 0
   n_valids = 0
   n_errors = 0
-  sha256_list = []
   for x in tqdm(records):
     sha256 = x['sha256']
-    if sha256 in sha256_list:
-      continue;
-    else:
-      sha256_list.append(sha256)
     fp_dst = join(opt_fp_out, f"{sha256}.json")
     fp_dst_is_file = Path(fp_dst).is_file()
     fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
@@ -106,7 +103,6 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
   log.info(f'Error files: {n_errors:,} items')
   log.info(f'Valid files: {n_valids:,} items')
   log.info(f'skipping {n_skipped:,} items')
-  log.info(f'Unique sha256s {len(sha256_list):,} items')
   log.info(f'processing {num_items:,} items')
   pool_results = []
```
