summary refs log tree commit diff
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2018-11-03 18:41:23 +0100
committer Jules Laplace <julescarbon@gmail.com> 2018-11-03 18:41:23 +0100
commit 2278adead1ff16115f8b989dc316bdf9efe9e37d (patch)
tree 96fae400f35025e2565b9e0e8d7c6a2d020d822b
parent fde14c19ef77f1bbe67f4cac7cadddbd9d3129b3 (diff)
s2-dump-db-pdf-urls.py
-rw-r--r-- s2-dump-db-pdf-urls.py (renamed from s2-dump-pdf-urls.py) 21
1 files changed, 13 insertions, 8 deletions
diff --git a/s2-dump-pdf-urls.py b/s2-dump-db-pdf-urls.py
index b833d0fc..520b513e 100644
--- a/s2-dump-pdf-urls.py
+++ b/s2-dump-db-pdf-urls.py
@@ -12,21 +12,26 @@ def s2_dump_pdf_urls():
# get all the PDF urls, pick the best one
# store it and the paper id
# another script will fetch the urls from this process
- lookups = {}
- for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
- process_paper(fn, lookups)
- lookups_list = list(lookups.keys())
- print("Wrote {} ids".format(len(id_list)))
- write_csv('pdf_list.csv', id_list)
+ rows = [process_paper(fn) for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True)]
+ print("Wrote {} rows".format(len(rows)))
+ write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'Extra URL'], rows=rows)
def process_paper(fn, lookups):
paper = read_json(fn)
paper_id = paper['id']
pdf_url = None
+ ieee_url = None
+ extra_url = None
if paper['s2PdfUrl']:
pdf_url = paper['s2PdfUrl']
- elif len(paper['pdfUrls']):
- pdf_url = paper['pdfUrls'][0]
+ for url in paper['pdfUrls']:
+ if 'ieeexplore.ieee.org' in url:
+ ieee_url = url
+ elif pdf_url is None and 'pdf' in url:
+ pdf_url = url
+ else:
+ extra_url = url
+ return [paper_id, pdf_url, ieee_url, extra_url]
if __name__ == '__main__':
s2_dump_pdf_urls()