summary | refs | log | tree | commit | diff
path: root/scraper/s2-dump-missing-paper-ids.py
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2019-05-30 14:03:34 +0200
committer Jules Laplace <julescarbon@gmail.com> 2019-05-30 14:03:34 +0200
commit 10c38b6b5916b2c7f84ca65fa471dda963dd9b5d (patch)
tree 479d5dab47ab7bf4d02fb8d233a220a233989ae5 /scraper/s2-dump-missing-paper-ids.py
parent 2963cd2ec73860e3bf3a5e4d469b4e573ce4817c (diff)
s2 fetch missing verified papers
Diffstat (limited to 'scraper/s2-dump-missing-paper-ids.py')
-rw-r--r-- scraper/s2-dump-missing-paper-ids.py 14
1 file changed, 12 insertions, 2 deletions
diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py
index b30fe167..47dd4238 100644
--- a/scraper/s2-dump-missing-paper-ids.py
+++ b/scraper/s2-dump-missing-paper-ids.py
@@ -4,7 +4,7 @@ import glob
import click
from util import *
-DB_PAPER_DIR = './datasets/s2/db_papers'
+# DB_PAPER_DIR = './datasets/s2/db_papers'
RAW_PAPER_DIR = './datasets/s2/raw_papers'
@click.command()
@@ -20,15 +20,25 @@ def load_missing_ids(fn):
found_count = 0
missing_count = 0
for paper_id in ids:
- db_paper_path = make_db_paper_path(paper_id)
+ # db_paper_path = make_db_paper_path(paper_id)
raw_paper_path = make_raw_paper_path(paper_id)
# if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
if os.path.exists(raw_paper_path):
lookup[paper_id] = True
found_count += 1
else:
+ print(">> {} {}".format(dataset paper_id))
missing_lookup[paper_id] = True
missing_count += 1
+
+ verified_lookup, verified_totals = fetch_verified_paper_lookup()
+ rows = []
+ for dataset, lookup in verified_lookup.items():
+ for paper_id in lookup.keys():
+ paper_path = data_path('raw_papers', paper_id):
+ if not os.path.exists(paper_path):
+ print(">> {} {}".format(dataset paper_id))
+
print("{} papers found, {} must be fetched".format(found_count, missing_count))
return missing_lookup.keys()