Diffstat (limited to 's2-dump-pdf-urls.py')
 s2-dump-pdf-urls.py | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)
diff --git a/s2-dump-pdf-urls.py b/s2-dump-pdf-urls.py
index 111ed830..b833d0fc 100644
--- a/s2-dump-pdf-urls.py
+++ b/s2-dump-pdf-urls.py
@@ -12,26 +12,24 @@ def s2_dump_pdf_urls():
     # get all the PDF urls, pick the best one
     # store it and the paper id
     # another script will fetch the urls from this process
-    ids = {}
+    lookups = {}
     for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-        process_paper(fn, ids)
-    id_list = list(ids.keys())
+        process_paper(fn, lookups)
+    lookups_list = list(lookups.keys())
print("Wrote {} ids".format(len(id_list)))
write_csv('pdf_list.csv', id_list)
-    for line in lines:
-        label = line[0]
-        title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
-        entry_fn = './datasets/s2/entries/{}.json'.format(title)
-        if not os.path.exists(entry_fn):
-            results = s2.search(title)
-            write_json(dump_fn, results)
-            if len(results['results']) == 0:
-                print("No results for {}".format(title))
-            else:
-                print(title)
-                write_json(entry_fn, results['results'][0])
-            time.sleep(random.randint(10, 20))
+def process_paper(fn, lookups):
+    paper = read_json(fn)
+    paper_id = paper['id']
+    # prefer the S2-hosted PDF, fall back to the first external url
+    pdf_url = None
+    if paper['s2PdfUrl']:
+        pdf_url = paper['s2PdfUrl']
+    elif len(paper['pdfUrls']):
+        pdf_url = paper['pdfUrls'][0]
+    # store the chosen url under the paper id, as the comment above describes
+    lookups[paper_id] = pdf_url
 
 
 if __name__ == '__main__':
     s2_dump_pdf_urls()
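
For reference, a minimal self-contained sketch of how the rewritten flow fits together. PAPER_JSON_DIR, read_json, and write_csv are defined elsewhere in the script and do not appear in this diff, so the versions below are assumptions about their shape rather than the script's real helpers:

import csv
import glob
import json

PAPER_JSON_DIR = './datasets/s2/papers'  # assumed location, not shown in the diff

def read_json(fn):
    # assumed helper: load a single paper.json record
    with open(fn) as f:
        return json.load(f)

def write_csv(fn, rows):
    # assumed helper: write one value per row
    with open(fn, 'w', newline='') as f:
        writer = csv.writer(f)
        for row in rows:
            writer.writerow([row])

def process_paper(fn, lookups):
    # pick the best PDF url: prefer the S2-hosted copy, else the first external url
    paper = read_json(fn)
    pdf_url = None
    if paper.get('s2PdfUrl'):
        pdf_url = paper['s2PdfUrl']
    elif paper.get('pdfUrls'):
        pdf_url = paper['pdfUrls'][0]
    lookups[paper['id']] = pdf_url

def s2_dump_pdf_urls():
    lookups = {}
    for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
        process_paper(fn, lookups)
    lookups_list = list(lookups.keys())
    print("Wrote {} ids".format(len(lookups_list)))
    write_csv('pdf_list.csv', lookups_list)

if __name__ == '__main__':
    s2_dump_pdf_urls()

Note that as written, pdf_list.csv holds only the paper ids (the dict keys); the downstream fetcher mentioned in the script's comment would presumably also need the urls, e.g. by writing lookups.items() instead of the keys.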