begin client

author: Jules Laplace <julescarbon@gmail.com> 2018-11-03 17:29:49 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-03 17:29:49 +0100
commit: aa0470a3076f5ac65a0311c76e58254547f3eae0 (patch)
tree: 001173d70a0ae93d367773453cbdc9091bbc9fb7 /s2-dump-pdf-urls.py
parent: 3e534be7b919bf402d3602fde5b45809201f06b1 (diff)
1 file changed, 11 insertions, 16 deletions
diff --git a/s2-dump-pdf-urls.py b/s2-dump-pdf-urls.py
index 111ed830..b833d0fc 100644
--- a/s2-dump-pdf-urls.py
+++ b/s2-dump-pdf-urls.py
@@ -12,26 +12,21 @@ def s2_dump_pdf_urls():
   # get all the PDF urls, pick the best one
   # store it and the paper id
   # another script will fetch the urls from this process
-  ids = {}
+  lookups = {}
   for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-    process_paper(fn, ids)
-  id_list = list(ids.keys())
+    process_paper(fn, lookups)
+  lookups_list = list(lookups.keys())
   print("Wrote {} ids".format(len(id_list)))
   write_csv('pdf_list.csv', id_list)
 
-  for line in lines:
-    label = line[0]
-    title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
-    entry_fn = './datasets/s2/entries/{}.json'.format(title)
-    if not os.path.exists(entry_fn):
-      results = s2.search(title)
-      write_json(dump_fn, results)
-      if len(results['results']) == 0:
-        print("No results for {}".format(title))
-      else:
-        print(title)
-        write_json(entry_fn, results['results'][0])
-      time.sleep(random.randint(10, 20))
+def process_paper(fn, lookups):
+  paper = read_json(fn)
+  paper_id = paper['id']
+  pdf_url = None
+  if paper['s2PdfUrl']:
+    pdf_url = paper['s2PdfUrl']
+  elif len(paper['pdfUrls']):
+    pdf_url = paper['pdfUrls'][0]
 
 if __name__ == '__main__':
   s2_dump_pdf_urls()
author	Jules Laplace <julescarbon@gmail.com>	2018-11-03 17:29:49 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-11-03 17:29:49 +0100
commit	aa0470a3076f5ac65a0311c76e58254547f3eae0 (patch)
tree	001173d70a0ae93d367773453cbdc9091bbc9fb7 /s2-dump-pdf-urls.py
parent	3e534be7b919bf402d3602fde5b45809201f06b1 (diff)