| author | adamhrv <adam@ahprojects.com> | 2019-03-14 15:08:31 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-03-14 15:08:31 +0100 |
| commit | f1ceb3953fae55a06c94a058f90ed278cf0240f5 | |
| tree | 2aae68c2787597aa78bcf3e010b2b92190e30e86 | |
| parent | 26646e6adf3833f6282e9515c14ad61e485440c0 | |
uniq sha
Diffstat (limited to 'megapixels')
| -rw-r--r-- | megapixels/commands/datasets/download_ibmdif.py | 35 |
1 file changed, 31 insertions(+), 4 deletions(-)
```diff
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py
index 48aca5f0..3ac835e1 100644
--- a/megapixels/commands/datasets/download_ibmdif.py
+++ b/megapixels/commands/datasets/download_ibmdif.py
@@ -67,19 +67,46 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents):
 
   # setup multithreading data holders
   log.debug(f'loading {opt_fp_in}')
-  records = pd.read_csv(opt_fp_in).to_dict('records')
+  df_records = pd.read_csv(opt_fp_in)
+  log.debug(f'loaded {len(df_records):,} csv records')
+  records = df_records.to_dict('records')
+  log.debug(f'loaded {len(records):,} items')
   pool_items = []
 
+  n_skipped = 0
+  n_valids = 0
+  n_errors = 0
+  sha256_list = []
+
   for x in tqdm(records):
-    fp_dst = join(opt_fp_out, x['sha256'] + '.json')
+    sha256 = x['sha256']
+
+    if sha256 in sha256_list:
+      continue;
+    else:
+      sha256_list.append(sha256)
+
+    fp_dst = join(opt_fp_out, f"{sha256}.json")
     fp_dst_is_file = Path(fp_dst).is_file()
     fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file()
-    if not fp_dst_is_file and not fp_dst_is_err:
-      url = url_prefix + x['sha256'] + '.json'
+
+    if fp_dst_is_file:
+      n_valids += 1
+    elif fp_dst_is_err:
+      n_errors += 1
+
+    if not (fp_dst_is_file or fp_dst_is_err):
+      url = url_prefix + sha256 + '.json'
       user_agent = user_agents[randint(0, len(user_agents)) - 1]
       pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent})
+    else:
+      n_skipped += 1
 
   num_items = len(pool_items)
+  log.info(f'Error files: {n_errors:,} items')
+  log.info(f'Valid files: {n_valids:,} items')
+  log.info(f'skipping {n_skipped:,} items')
+  log.info(f'Unique sha256s {len(sha256_list):,} items')
   log.info(f'processing {num_items:,} items')
 
   pool_results = []
```
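The heart of the commit is a dedup-and-resume pass over the CSV: skip sha256 hashes already seen in earlier rows, and skip hashes whose JSON (or error marker) already exists on disk. Below is a minimal sketch of that same logic, not the repository's code: the function name `build_pool_items` and its parameters are made up for illustration. It swaps the commit's `sha256_list` for a set, since `in` on a list is O(n) per row and degrades on large CSVs, and uses `random.choice` instead of the `randint(0, len(user_agents)) - 1` indexing, which can yield -1 (Python maps that to the last element, so the last user agent is picked twice as often as the others).

```python
from pathlib import Path
from random import choice

import pandas as pd


def build_pool_items(fp_in, fp_out, url_prefix, user_agents):
    """Sketch (hypothetical helper): collect download jobs, skipping
    duplicate hashes and files already downloaded or marked failed."""
    records = pd.read_csv(fp_in).to_dict('records')
    seen = set()  # O(1) membership test; the commit uses a list (O(n) per row)
    pool_items = []
    for x in records:
        sha256 = x['sha256']
        if sha256 in seen:
            continue  # duplicate row in the CSV
        seen.add(sha256)
        fp_dst = Path(fp_out) / f'{sha256}.json'
        # resume support: skip hashes already fetched or already marked as errors
        if fp_dst.is_file() or Path(f'{fp_dst}_error.txt').is_file():
            continue
        pool_items.append({
            'url': f'{url_prefix}{sha256}.json',
            'filepath': str(fp_dst),
            'user_agent': choice(user_agents),  # uniform pick over agents
        })
    return pool_items
```

One design note: using the presence of `<sha256>.json` and `<sha256>.json_error.txt` as the resume state keeps the command re-runnable after an interruption without any extra bookkeeping file, at the cost of one or two `stat` calls per record.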
