diff options
| author | adamhrv <adam@ahprojects.com> | 2019-05-30 17:20:11 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-05-30 17:20:11 +0200 |
| commit | dfaa5b28c6874bc70715c94aa9918f1b08a92b9c (patch) | |
| tree | 511ec5100abc8497a649d4f9ca8e12d31ae601c7 /scraper/s2-dump-missing-paper-ids.py | |
| parent | ca0d3ed1a451ce65960ff2e0f44fd5a9008eeaf4 (diff) | |
| parent | a7b940665c82b4710c73099d22f347fc30017e3c (diff) | |
Merge branch 'master' of github.com:adamhrv/megapixels_dev
Diffstat (limited to 'scraper/s2-dump-missing-paper-ids.py')
| -rw-r--r-- | scraper/s2-dump-missing-paper-ids.py | 18 |
1 file changed, 16 insertions, 2 deletions
diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py index b30fe167..6f7eb8ba 100644 --- a/scraper/s2-dump-missing-paper-ids.py +++ b/scraper/s2-dump-missing-paper-ids.py @@ -4,7 +4,7 @@ import glob import click from util import * -DB_PAPER_DIR = './datasets/s2/db_papers' +# DB_PAPER_DIR = './datasets/s2/db_papers' RAW_PAPER_DIR = './datasets/s2/raw_papers' @click.command() @@ -20,15 +20,29 @@ def load_missing_ids(fn): found_count = 0 missing_count = 0 for paper_id in ids: - db_paper_path = make_db_paper_path(paper_id) + # db_paper_path = make_db_paper_path(paper_id) raw_paper_path = make_raw_paper_path(paper_id) # if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path): if os.path.exists(raw_paper_path): lookup[paper_id] = True found_count += 1 else: + # print(">> {} {}".format(dataset, paper_id)) missing_lookup[paper_id] = True missing_count += 1 + + verified_lookup, verified_totals = fetch_verified_paper_lookup() + rows = [] + for dataset, lookup in verified_lookup.items(): + for paper_id in lookup.keys(): + if dataset == 'brainwash': + print('>> {} {}'.format(dataset, paper_id)) + paper_path = make_raw_paper_path(paper_id) + if not os.path.exists(paper_path) and paper_id not in missing_lookup: + print(">> {} {}".format(dataset, paper_id)) + missing_count += 1 + missing_lookup[paper_id] = True + print("{} papers found, {} must be fetched".format(found_count, missing_count)) return missing_lookup.keys() |
