diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-05-30 14:03:34 +0200 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-05-30 14:03:34 +0200 |
| commit | 10c38b6b5916b2c7f84ca65fa471dda963dd9b5d (patch) | |
| tree | 479d5dab47ab7bf4d02fb8d233a220a233989ae5 /scraper/s2-dump-missing-paper-ids.py | |
| parent | 2963cd2ec73860e3bf3a5e4d469b4e573ce4817c (diff) | |
s2 fetch missing verified papers
Diffstat (limited to 'scraper/s2-dump-missing-paper-ids.py')
| -rw-r--r-- | scraper/s2-dump-missing-paper-ids.py | 14 |
1 file changed, 12 insertions, 2 deletions
diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py index b30fe167..47dd4238 100644 --- a/scraper/s2-dump-missing-paper-ids.py +++ b/scraper/s2-dump-missing-paper-ids.py @@ -4,7 +4,7 @@ import glob import click from util import * -DB_PAPER_DIR = './datasets/s2/db_papers' +# DB_PAPER_DIR = './datasets/s2/db_papers' RAW_PAPER_DIR = './datasets/s2/raw_papers' @click.command() @@ -20,15 +20,25 @@ def load_missing_ids(fn): found_count = 0 missing_count = 0 for paper_id in ids: - db_paper_path = make_db_paper_path(paper_id) + # db_paper_path = make_db_paper_path(paper_id) raw_paper_path = make_raw_paper_path(paper_id) # if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path): if os.path.exists(raw_paper_path): lookup[paper_id] = True found_count += 1 else: + print(">> {} {}".format(dataset paper_id)) missing_lookup[paper_id] = True missing_count += 1 + + verified_lookup, verified_totals = fetch_verified_paper_lookup() + rows = [] + for dataset, lookup in verified_lookup.items(): + for paper_id in lookup.keys(): + paper_path = data_path('raw_papers', paper_id): + if not os.path.exists(paper_path): + print(">> {} {}".format(dataset paper_id)) + print("{} papers found, {} must be fetched".format(found_count, missing_count)) return missing_lookup.keys() |
