1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
import os
import sys
import csv
import subprocess
import time
import random
import re
import simplejson as json
import click
from multiprocessing import Pool
from s2 import SemanticScholarAPI
from util import *
# Module-level Semantic Scholar API client; fetch_pdf() calls its fetch_file()
# method to download each paper's PDF.
s2 = SemanticScholarAPI()
# CLI entry point: read the rows of the CSV named by --fn / -i, hand them to
# util's parallelize() with fetch_pdf as the worker, then report the row count.
# Documented with comments rather than a docstring on purpose: click would
# surface a docstring as the command's --help text, changing visible output.
@click.command()
@click.option(
    '--fn', '-i',
    default='db_paper_pdf.csv',
    help='Filename of CSV (id, url,)',
)
def fetch_pdfs(fn):
    # presumably each row is a (paper_id, url) pair matching fetch_pdf's
    # signature -- verify against util.read_csv / util.parallelize.
    rows = read_csv(fn, keys=False)
    parallelize(fetch_pdf, rows)
    total = len(rows)
    print("{} papers processed".format(total))
def fetch_pdf(paper_id, url):
    """Download one paper's PDF unless a PDF or TXT file already exists for it.

    The paper's directory (make_pdf_path) is created if missing. When either
    the PDF path (make_pdf_fn) or the TXT path (make_txt_fn) already exists,
    nothing is fetched. Otherwise ``s2.fetch_file(url, pdf_path)`` is called
    and a one-line status is printed: the size in kb on success, or an
    "empty?" notice when fetch_file returns None.

    Args:
        paper_id: Paper identifier used to build the on-disk paths.
        url: Location passed to ``s2.fetch_file``.

    Returns:
        Always None; results are reported on stdout only.
    """
    os.makedirs(make_pdf_path(paper_id), exist_ok=True)

    pdf_path = make_pdf_fn(paper_id)
    existing_candidates = (pdf_path, make_txt_fn(paper_id))
    if any(map(os.path.exists, existing_candidates)):
        # Already downloaded (or a TXT file is present): skip the fetch.
        return None

    num_bytes = s2.fetch_file(url, pdf_path)
    if num_bytes is not None:
        print("{} {} kb {}".format(paper_id, int(num_bytes / 1024), url))
    else:
        print("{} empty?".format(paper_id))
    return None
def make_pdf_path(paper_id):
    """Return the directory path for *paper_id*.

    The directory sits under ``./datasets/s2/pdf`` in a subdirectory named
    after the first two characters of the id.
    """
    shard = paper_id[:2]
    layout = './datasets/s2/pdf/{}/{}'
    return layout.format(shard, paper_id)
def make_pdf_fn(paper_id):
    """Return the path of the ``paper.pdf`` file for *paper_id*.

    The file lives in ``./datasets/s2/pdf/<first two chars of id>/<id>/``.
    """
    prefix = paper_id[:2]
    template = './datasets/s2/pdf/{}/{}/paper.pdf'
    return template.format(prefix, paper_id)
def make_txt_fn(paper_id):
    """Return the path of the ``paper.txt`` file for *paper_id*.

    The file lives in ``./datasets/s2/pdf/<first two chars of id>/<id>/``.
    """
    leading = paper_id[:2]
    pattern = './datasets/s2/pdf/{}/{}/paper.txt'
    return pattern.format(leading, paper_id)
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    fetch_pdfs()
|