diff options
| author | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-06-03 03:33:06 +0200 |
| commit | 1d8162a47bda87b38feef95cf3d5903831b6f4d6 (patch) | |
| tree | 86c37309ff5bcb62716638562489ddb747c16159 /megapixels/commands/datasets/download_ibmdif.py | |
| parent | e5773e7fffc11265c86bf1dcfa05df236193f4a1 (diff) | |
add msc working utils
Diffstat (limited to 'megapixels/commands/datasets/download_ibmdif.py')
| -rw-r--r-- | megapixels/commands/datasets/download_ibmdif.py | 11 |
1 file changed, 8 insertions, 3 deletions
diff --git a/megapixels/commands/datasets/download_ibmdif.py b/megapixels/commands/datasets/download_ibmdif.py index ed717662..0b81fef6 100644 --- a/megapixels/commands/datasets/download_ibmdif.py +++ b/megapixels/commands/datasets/download_ibmdif.py @@ -9,9 +9,11 @@ fp_user_agents = '/data_store_hdd/datasets/people/ibm_dif/research/user-agents.t help='Output path') @click.option('-t', '--threads', 'opt_threads', default=8, help='Number of threads') +@click.option('--slice', 'opt_slice', type=(int, int), default=(None, None), + help='Slice list of files') @click.option('--agents', 'opt_fp_agents', default=fp_user_agents) @click.pass_context -def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): +def cli(ctx, opt_fp_in, opt_fp_out, opt_slice, opt_threads, opt_fp_agents): """Threaded image/file downloader""" """ @@ -56,6 +58,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): urllib.request.urlretrieve(item['url'], fp_out) item['status'] = True except Exception as e: + log.debug(f'Failed: user: {item["username"]}, url: {url}') if str(e) != 'HTTP Error 403: Forbidden': log.debug(f'Error: {e}') fp_error = f'{fp_out}_error.txt' @@ -68,6 +71,8 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): # setup multithreading data holders log.debug(f'loading {opt_fp_in}') df_records = pd.read_csv(opt_fp_in) + if opt_slice: + df_records = df_records[opt_slice[0]:opt_slice[1]] log.debug(f'loaded {len(df_records):,} csv records') log.debug('deduplicating') df_records = df_records.drop_duplicates(subset='sha256', keep="last") @@ -82,7 +87,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): for x in tqdm(records): sha256 = x['sha256'] - + username = x['username'] fp_dst = join(opt_fp_out, f"{sha256}.json") fp_dst_is_file = Path(fp_dst).is_file() fp_dst_is_err = Path(f'{fp_dst}_error.txt').is_file() @@ -95,7 +100,7 @@ def cli(ctx, opt_fp_in, opt_fp_out, opt_threads, opt_fp_agents): if not (fp_dst_is_file or fp_dst_is_err): 
url = url_prefix + sha256 + '.json' user_agent = user_agents[randint(0, len(user_agents)) - 1] - pool_items.append({'url':url, 'filepath': fp_dst, 'user_agent': user_agent}) + pool_items.append({'url':url, 'username': username, 'filepath': fp_dst, 'user_agent': user_agent}) else: n_skipped += 1 |
