summary | refs | log | tree | commit | diff
path: root/scraper/s2-dump-missing-paper-ids.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/s2-dump-missing-paper-ids.py')
-rw-r--r-- scraper/s2-dump-missing-paper-ids.py 14
1 files changed, 12 insertions, 2 deletions
diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py
index b30fe167..47dd4238 100644
--- a/scraper/s2-dump-missing-paper-ids.py
+++ b/scraper/s2-dump-missing-paper-ids.py
@@ -4,7 +4,7 @@ import glob
import click
from util import *
-DB_PAPER_DIR = './datasets/s2/db_papers'
+# DB_PAPER_DIR = './datasets/s2/db_papers'
RAW_PAPER_DIR = './datasets/s2/raw_papers'
@click.command()
@@ -20,15 +20,25 @@ def load_missing_ids(fn):
found_count = 0
missing_count = 0
for paper_id in ids:
- db_paper_path = make_db_paper_path(paper_id)
+ # db_paper_path = make_db_paper_path(paper_id)
raw_paper_path = make_raw_paper_path(paper_id)
# if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
if os.path.exists(raw_paper_path):
lookup[paper_id] = True
found_count += 1
else:
+ print(">> {} {}".format(dataset paper_id))
missing_lookup[paper_id] = True
missing_count += 1
+
+ verified_lookup, verified_totals = fetch_verified_paper_lookup()
+ rows = []
+ for dataset, lookup in verified_lookup.items():
+ for paper_id in lookup.keys():
+ paper_path = data_path('raw_papers', paper_id):
+ if not os.path.exists(paper_path):
+ print(">> {} {}".format(dataset paper_id))
+
print("{} papers found, {} must be fetched".format(found_count, missing_count))
return missing_lookup.keys()