summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- scraper/s2-dump-missing-paper-ids.py | 14
-rw-r--r-- scraper/s2-fetch-pdf.py | 5
2 files changed, 14 insertions, 5 deletions
diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py
index b30fe167..47dd4238 100644
--- a/scraper/s2-dump-missing-paper-ids.py
+++ b/scraper/s2-dump-missing-paper-ids.py
@@ -4,7 +4,7 @@ import glob
import click
from util import *
-DB_PAPER_DIR = './datasets/s2/db_papers'
+# DB_PAPER_DIR = './datasets/s2/db_papers'
RAW_PAPER_DIR = './datasets/s2/raw_papers'
@click.command()
@@ -20,15 +20,25 @@ def load_missing_ids(fn):
found_count = 0
missing_count = 0
for paper_id in ids:
- db_paper_path = make_db_paper_path(paper_id)
+ # db_paper_path = make_db_paper_path(paper_id)
raw_paper_path = make_raw_paper_path(paper_id)
# if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
if os.path.exists(raw_paper_path):
lookup[paper_id] = True
found_count += 1
else:
+ print(">> {} {}".format(dataset paper_id))
missing_lookup[paper_id] = True
missing_count += 1
+
+ verified_lookup, verified_totals = fetch_verified_paper_lookup()
+ rows = []
+ for dataset, lookup in verified_lookup.items():
+ for paper_id in lookup.keys():
+ paper_path = data_path('raw_papers', paper_id):
+ if not os.path.exists(paper_path):
+ print(">> {} {}".format(dataset paper_id))
+
print("{} papers found, {} must be fetched".format(found_count, missing_count))
return missing_lookup.keys()
diff --git a/scraper/s2-fetch-pdf.py b/scraper/s2-fetch-pdf.py
index 72ca4ca8..61574b90 100644
--- a/scraper/s2-fetch-pdf.py
+++ b/scraper/s2-fetch-pdf.py
@@ -25,13 +25,12 @@ def fetch_pdf(paper_id, url):
pdf_fn = make_pdf_fn(paper_id)
txt_fn = make_txt_fn(paper_id)
if os.path.exists(pdf_fn) or os.path.exists(txt_fn):
- # return read_json(pdf_fn)
- return
+ return None
size = s2.fetch_file(url, pdf_fn)
if size is None:
print("{} empty?".format(paper_id))
return None
- print("{} {} kb".format(paper_id, int(size / 1024)))
+ print("{} {} kb {}".format(paper_id, int(size / 1024), url))
def make_pdf_path(paper_id):
    """Build the PDF path for a paper id.

    The path is rooted at ``./datasets/s2/pdf`` and uses the first two
    characters of ``paper_id`` as an intermediate path component,
    followed by the full ``paper_id``.

    :param paper_id: paper identifier string.
    :return: path string of the form ``./datasets/s2/pdf/<id[0:2]>/<id>``.
    """
    return './datasets/s2/pdf/{}/{}'.format(paper_id[0:2], paper_id)