summaryrefslogtreecommitdiff
path: root/s2-dump-pdf-urls.py
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2018-11-03 18:41:23 +0100
committerJules Laplace <julescarbon@gmail.com>2018-11-03 18:41:23 +0100
commit2278adead1ff16115f8b989dc316bdf9efe9e37d (patch)
tree96fae400f35025e2565b9e0e8d7c6a2d020d822b /s2-dump-pdf-urls.py
parentfde14c19ef77f1bbe67f4cac7cadddbd9d3129b3 (diff)
s2-dump-db-pdf-urls.py
Diffstat (limited to 's2-dump-pdf-urls.py')
-rw-r--r--s2-dump-pdf-urls.py32
1 files changed, 0 insertions, 32 deletions
diff --git a/s2-dump-pdf-urls.py b/s2-dump-pdf-urls.py
deleted file mode 100644
index b833d0fc..00000000
--- a/s2-dump-pdf-urls.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-import glob
-import simplejson as json
-import click
-from util import *
-
-PAPER_JSON_DIR = 'datasets/s2/db_papers'
-
-@click.command()
-def s2_dump_pdf_urls():
- # loop over all the papers in db_papers
- # get all the PDF urls, pick the best one
- # store it and the paper id
- # another script will fetch the urls from this process
- lookups = {}
- for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
- process_paper(fn, lookups)
- lookups_list = list(lookups.keys())
- print("Wrote {} ids".format(len(id_list)))
- write_csv('pdf_list.csv', id_list)
-
-def process_paper(fn, lookups):
- paper = read_json(fn)
- paper_id = paper['id']
- pdf_url = None
- if paper['s2PdfUrl']:
- pdf_url = paper['s2PdfUrl']
- elif len(paper['pdfUrls']):
- pdf_url = paper['pdfUrls'][0]
-
-if __name__ == '__main__':
- s2_dump_pdf_urls()