s2-dump-db-pdf-urls.py

author: Jules Laplace <julescarbon@gmail.com> 2018-11-03 18:41:23 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-03 18:41:23 +0100
commit: 2278adead1ff16115f8b989dc316bdf9efe9e37d (patch)
tree: 96fae400f35025e2565b9e0e8d7c6a2d020d822b /s2-dump-pdf-urls.py
parent: fde14c19ef77f1bbe67f4cac7cadddbd9d3129b3 (diff)
1 files changed, 0 insertions, 32 deletions
diff --git a/s2-dump-pdf-urls.py b/s2-dump-pdf-urls.py
deleted file mode 100644
index b833d0fc..00000000
--- a/s2-dump-pdf-urls.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-import glob
-import simplejson as json
-import click
-from util import *
-
-PAPER_JSON_DIR = 'datasets/s2/db_papers'
-
-@click.command()
-def s2_dump_pdf_urls():
-  # loop over all the papers in db_papers
-  # get all the PDF urls, pick the best one
-  # store it and the paper id
-  # another script will fetch the urls from this process
-  lookups = {}
-  for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-    process_paper(fn, lookups)
-  lookups_list = list(lookups.keys())
-  print("Wrote {} ids".format(len(id_list)))
-  write_csv('pdf_list.csv', id_list)
-
-def process_paper(fn, lookups):
-  paper = read_json(fn)
-  paper_id = paper['id']
-  pdf_url = None
-  if paper['s2PdfUrl']:
-    pdf_url = paper['s2PdfUrl']
-  elif len(paper['pdfUrls']):
-    pdf_url = paper['pdfUrls'][0]
-
-if __name__ == '__main__':
-  s2_dump_pdf_urls()
author	Jules Laplace <julescarbon@gmail.com>	2018-11-03 18:41:23 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-11-03 18:41:23 +0100
commit	2278adead1ff16115f8b989dc316bdf9efe9e37d (patch)
tree	96fae400f35025e2565b9e0e8d7c6a2d020d822b /s2-dump-pdf-urls.py
parent	fde14c19ef77f1bbe67f4cac7cadddbd9d3129b3 (diff)