summary | refs | log | tree | commit | diff
path: root/scraper/s2-fetch-pdf.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/s2-fetch-pdf.py')
-rw-r--r-- scraper/s2-fetch-pdf.py 11
1 file changed, 7 insertions, 4 deletions
diff --git a/scraper/s2-fetch-pdf.py b/scraper/s2-fetch-pdf.py
index 72ca4ca8..c1b767b0 100644
--- a/scraper/s2-fetch-pdf.py
+++ b/scraper/s2-fetch-pdf.py
@@ -24,14 +24,15 @@ def fetch_pdf(paper_id, url):
os.makedirs(make_pdf_path(paper_id), exist_ok=True)
pdf_fn = make_pdf_fn(paper_id)
txt_fn = make_txt_fn(paper_id)
- if os.path.exists(pdf_fn) or os.path.exists(txt_fn):
- # return read_json(pdf_fn)
+ empty_fn = make_empty_fn(paper_id)
+ if os.path.exists(pdf_fn) or os.path.exists(txt_fn) or os.path.exists(empty_fn):
return
size = s2.fetch_file(url, pdf_fn)
if size is None:
print("{} empty?".format(paper_id))
- return None
- print("{} {} kb".format(paper_id, int(size / 1024)))
+ write_json(empty_fn, { 'paper_id': paper_id, 'url': url })
+ return
+ print("{} {} kb {}".format(paper_id, int(size / 1024), url))
def make_pdf_path(paper_id):
return './datasets/s2/pdf/{}/{}'.format(paper_id[0:2], paper_id)
@@ -39,6 +40,8 @@ def make_pdf_fn(paper_id):
return './datasets/s2/pdf/{}/{}/paper.pdf'.format(paper_id[0:2], paper_id)
def make_txt_fn(paper_id):
return './datasets/s2/pdf/{}/{}/paper.txt'.format(paper_id[0:2], paper_id)
+def make_empty_fn(paper_id):
+ return './datasets/s2/pdf/{}/{}/pdf.empty'.format(paper_id[0:2], paper_id)
if __name__ == '__main__':
fetch_pdfs()