blob: b833d0fc6f4c9ef5d05eb7b735a2344cc6f33f44 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
import os
import glob
import simplejson as json
import click
from util import *
# Root directory that is searched recursively for per-paper `paper.json` files.
PAPER_JSON_DIR = 'datasets/s2/db_papers'
@click.command()
def s2_dump_pdf_urls():
    """Scan every paper.json under PAPER_JSON_DIR and write pdf_list.csv."""
    # Loop over all the papers in db_papers, get all the PDF urls and pick
    # the best one, keyed by paper id.  Another script will fetch the urls
    # produced by this process.
    lookups = {}
    pattern = '{}/**/paper.json'.format(PAPER_JSON_DIR)
    for fn in glob.iglob(pattern, recursive=True):
        # process_paper receives the shared dict so it can record its result.
        process_paper(fn, lookups)

    # Previously the keys were bound to `lookups_list` while the code below
    # read an undefined `id_list`, which raised NameError on every run.
    id_list = list(lookups)

    # NOTE(review): only the paper ids (dict keys) are handed to write_csv,
    # not the urls stored as values -- confirm the row shape write_csv expects.
    write_csv('pdf_list.csv', id_list)
    # Report only after the file has actually been written.
    print("Wrote {} ids".format(len(id_list)))
def process_paper(fn, lookups):
    """Record the preferred PDF URL of one paper in ``lookups``.

    Args:
        fn: Path of the ``paper.json`` file, loaded with ``read_json``.
        lookups: Dict mutated in place; when a URL is found it gains the
            entry ``paper['id'] -> url``.

    The ``s2PdfUrl`` field wins when it is non-empty; otherwise the first
    entry of ``pdfUrls`` is used.  Papers with neither are left out of
    ``lookups``.
    """
    paper = read_json(fn)
    paper_id = paper['id']

    # .get() so a record lacking either field is skipped instead of raising.
    pdf_url = paper.get('s2PdfUrl')
    if not pdf_url:
        pdf_urls = paper.get('pdfUrls') or []
        pdf_url = pdf_urls[0] if pdf_urls else None

    # The chosen URL used to be computed and then dropped; store it so the
    # caller's dict is actually populated.
    if pdf_url:
        lookups[paper_id] = pdf_url
# Script entry point: run the click command only when executed directly.
if __name__ == '__main__':
    s2_dump_pdf_urls()
|