-rw-r--r--  s2-dump-db-pdf-urls.py (renamed from s2-dump-pdf-urls.py) | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/s2-dump-pdf-urls.py b/s2-dump-db-pdf-urls.py
index b833d0fc..520b513e 100644
--- a/s2-dump-pdf-urls.py
+++ b/s2-dump-db-pdf-urls.py
@@ -12,21 +12,26 @@ def s2_dump_pdf_urls():
     # get all the PDF urls, pick the best one
     # store it and the paper id
     # another script will fetch the urls from this process
-    lookups = {}
-    for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-        process_paper(fn, lookups)
-    lookups_list = list(lookups.keys())
-    print("Wrote {} ids".format(len(id_list)))
-    write_csv('pdf_list.csv', id_list)
+    rows = [process_paper(fn) for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True)]
+    print("Wrote {} rows".format(len(rows)))
+    write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'Extra URL'], rows=rows)
 
-def process_paper(fn, lookups):
+def process_paper(fn):
     paper = read_json(fn)
     paper_id = paper['id']
     pdf_url = None
+    ieee_url = None
+    extra_url = None
     if paper['s2PdfUrl']:
         pdf_url = paper['s2PdfUrl']
-    elif len(paper['pdfUrls']):
-        pdf_url = paper['pdfUrls'][0]
+    for url in paper['pdfUrls']:
+        if 'ieeexplore.ieee.org' in url:
+            ieee_url = url
+        elif pdf_url is None and 'pdf' in url:
+            pdf_url = url
+        else:
+            extra_url = url
+    return [paper_id, pdf_url, ieee_url, extra_url]
 
 if __name__ == '__main__':
     s2_dump_pdf_urls()
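The script relies on two helpers that do not appear in this hunk, read_json and write_csv. A minimal sketch of what they might look like, with signatures inferred from the call sites above (the keys= and rows= keyword arguments are assumptions, not the project's actual implementation):

import csv
import json

def read_json(fn):
    # Parse a single paper.json file into a dict.
    with open(fn) as f:
        return json.load(f)

def write_csv(fn, keys, rows):
    # Write a header row (keys) followed by one row per paper;
    # each row is the [paper_id, pdf_url, ieee_url, extra_url]
    # list returned by process_paper().
    with open(fn, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(keys)
        writer.writerows(rows)

Note the precedence in the new process_paper: an s2PdfUrl always wins, an ieeexplore.ieee.org link is captured separately in ieee_url, and any other URL containing 'pdf' fills pdf_url only if nothing better was found; everything else lands in extra_url, with later matches overwriting earlier ones.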
