diff options
Diffstat (limited to 'scraper/s2-dump-missing-paper-ids.py')
| -rw-r--r-- | scraper/s2-dump-missing-paper-ids.py | 12 |
1 files changed, 8 insertions, 4 deletions
diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py
index 47dd4238..6f7eb8ba 100644
--- a/scraper/s2-dump-missing-paper-ids.py
+++ b/scraper/s2-dump-missing-paper-ids.py
@@ -27,7 +27,7 @@ def load_missing_ids(fn):
         lookup[paper_id] = True
         found_count += 1
       else:
-        print(">> {} {}".format(dataset paper_id))
+        # print(">> {} {}".format(dataset, paper_id))
         missing_lookup[paper_id] = True
         missing_count += 1
 
@@ -35,9 +35,13 @@ def load_missing_ids(fn):
   rows = []
   for dataset, lookup in verified_lookup.items():
     for paper_id in lookup.keys():
-      paper_path = data_path('raw_papers', paper_id):
-      if not os.path.exists(paper_path):
-        print(">> {} {}".format(dataset paper_id))
+      if dataset == 'brainwash':
+        print('>> {} {}'.format(dataset, paper_id))
+      paper_path = make_raw_paper_path(paper_id)
+      if not os.path.exists(paper_path) and paper_id not in missing_lookup:
+        print(">> {} {}".format(dataset, paper_id))
+        missing_count += 1
+        missing_lookup[paper_id] = True
   print("{} papers found, {} must be fetched".format(found_count, missing_count))
   return missing_lookup.keys()
 
