From 4f1d44719221bb8195e32b8f1e97feb4c3e14991 Mon Sep 17 00:00:00 2001 From: "jules@lens" Date: Thu, 30 May 2019 14:30:39 +0200 Subject: fetching verified papers --- scraper/s2-fetch-pdf.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'scraper/s2-fetch-pdf.py') diff --git a/scraper/s2-fetch-pdf.py b/scraper/s2-fetch-pdf.py index 61574b90..c1b767b0 100644 --- a/scraper/s2-fetch-pdf.py +++ b/scraper/s2-fetch-pdf.py @@ -24,12 +24,14 @@ def fetch_pdf(paper_id, url): os.makedirs(make_pdf_path(paper_id), exist_ok=True) pdf_fn = make_pdf_fn(paper_id) txt_fn = make_txt_fn(paper_id) - if os.path.exists(pdf_fn) or os.path.exists(txt_fn): - return None + empty_fn = make_empty_fn(paper_id) + if os.path.exists(pdf_fn) or os.path.exists(txt_fn) or os.path.exists(empty_fn): + return size = s2.fetch_file(url, pdf_fn) if size is None: print("{} empty?".format(paper_id)) - return None + write_json(empty_fn, { 'paper_id': paper_id, 'url': url }) + return print("{} {} kb {}".format(paper_id, int(size / 1024), url)) def make_pdf_path(paper_id): @@ -38,6 +40,8 @@ def make_pdf_fn(paper_id): return './datasets/s2/pdf/{}/{}/paper.pdf'.format(paper_id[0:2], paper_id) def make_txt_fn(paper_id): return './datasets/s2/pdf/{}/{}/paper.txt'.format(paper_id[0:2], paper_id) +def make_empty_fn(paper_id): + return './datasets/s2/pdf/{}/{}/pdf.empty'.format(paper_id[0:2], paper_id) if __name__ == '__main__': fetch_pdfs() -- cgit v1.2.3-70-g09d2