diff options
Diffstat (limited to 's2-dump-pdf-urls.py')
| -rw-r--r-- | s2-dump-pdf-urls.py | 27 |
1 file changed, 11 insertions, 16 deletions
def s2_dump_pdf_urls():
    """Collect the best available PDF URL for every crawled paper.

    Walks every ``paper.json`` under ``PAPER_JSON_DIR``, picks one PDF URL
    per paper (see ``process_paper``), and writes the resulting paper ids
    to ``pdf_list.csv`` for a separate fetcher script to consume.
    """
    # get all the PDF urls, pick the best one
    # store it and the paper id
    # another script will fetch the urls from this process
    lookups = {}
    for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
        process_paper(fn, lookups)
    lookups_list = list(lookups.keys())
    # BUG FIX: these two lines referenced the pre-rename variable `id_list`,
    # which no longer exists after `ids` -> `lookups`; that was a NameError.
    print("Wrote {} ids".format(len(lookups_list)))
    write_csv('pdf_list.csv', lookups_list)

def process_paper(fn, lookups):
    """Read one paper.json and record its best PDF URL in `lookups`.

    Preference order: the Semantic Scholar-hosted PDF (``s2PdfUrl``) first,
    otherwise the first entry of ``pdfUrls``. Papers with no PDF URL are
    skipped.

    :param fn: path to a ``paper.json`` file
    :param lookups: dict accumulator, mutated in place (paper id -> PDF URL)
    """
    paper = read_json(fn)
    paper_id = paper['id']
    pdf_url = None
    if paper['s2PdfUrl']:
        pdf_url = paper['s2PdfUrl']
    elif len(paper['pdfUrls']):
        pdf_url = paper['pdfUrls'][0]
    # BUG FIX: the original computed pdf_url but never stored it, so
    # `lookups` stayed empty and the CSV would always contain zero rows.
    if pdf_url is not None:
        lookups[paper_id] = pdf_url

if __name__ == '__main__':
    s2_dump_pdf_urls()
