diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-03 18:41:23 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-03 18:41:23 +0100 |
| commit | 2278adead1ff16115f8b989dc316bdf9efe9e37d (patch) | |
| tree | 96fae400f35025e2565b9e0e8d7c6a2d020d822b /s2-dump-pdf-urls.py | |
| parent | fde14c19ef77f1bbe67f4cac7cadddbd9d3129b3 (diff) | |
s2-dump-db-pdf-urls.py
Diffstat (limited to 's2-dump-pdf-urls.py')
| -rw-r--r-- | s2-dump-pdf-urls.py | 32 |
1 files changed, 0 insertions, 32 deletions
diff --git a/s2-dump-pdf-urls.py b/s2-dump-pdf-urls.py
deleted file mode 100644
index b833d0fc..00000000
--- a/s2-dump-pdf-urls.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-import glob
-import simplejson as json
-import click
-from util import *
-
-PAPER_JSON_DIR = 'datasets/s2/db_papers'
-
-@click.command()
-def s2_dump_pdf_urls():
-  # loop over all the papers in db_papers
-  # get all the PDF urls, pick the best one
-  # store it and the paper id
-  # another script will fetch the urls from this process
-  lookups = {}
-  for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
-    process_paper(fn, lookups)
-  lookups_list = list(lookups.keys())
-  print("Wrote {} ids".format(len(id_list)))
-  write_csv('pdf_list.csv', id_list)
-
-def process_paper(fn, lookups):
-  paper = read_json(fn)
-  paper_id = paper['id']
-  pdf_url = None
-  if paper['s2PdfUrl']:
-    pdf_url = paper['s2PdfUrl']
-  elif len(paper['pdfUrls']):
-    pdf_url = paper['pdfUrls'][0]
-
-if __name__ == '__main__':
-  s2_dump_pdf_urls()
