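"""Fetch DOI landing pages for a list of papers.

Reads a CSV of (paper_id, url) rows and, for each paper that has a URL,
downloads the page behind that URL via SemanticScholarAPI.fetch_doi() and
records the resolved domain next to it on disk.

Note: the exact behaviour of SemanticScholarAPI.fetch_doi() and of the
helpers imported from util is assumed from how they are called in this file.
"""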
import os
import sys
import csv
import subprocess
import time
import random
import re
import simplejson as json
import click
from urllib.parse import urlparse
from s2 import SemanticScholarAPI
from util import *
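
# Helpers such as read_csv(), parallelize(), and write_json() are expected to
# come from util; their exact signatures are assumed from the call sites below.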

s2 = SemanticScholarAPI()


@click.command()
@click.option('--fn', '-i', default='db_paper_doi.csv',
              help='CSV file of (paper_id, url) rows')
def fetch_doi_list(fn):
    lines = read_csv(fn, keys=False)
    urls = []
    for line in lines:
        paper_id, url = line
        if url:
            # Only papers with a non-empty URL are fetched.
            urls.append(line)
    parallelize(fetch_doi, urls)
    print("{} of {} papers had a URL and were queued".format(len(urls), len(lines)))


def fetch_doi(paper_id, url, replace=False):
    os.makedirs(make_doi_path(paper_id), exist_ok=True)
    doi_fn = make_doi_fn(paper_id)
    url_fn = make_url_fn(paper_id)
    txt_fn = make_txt_fn(paper_id)
    if replace and os.path.exists(doi_fn):
        # Keep the previous download around under the ".doi2"/".url2" names.
        os.rename(doi_fn, old_doi_fn(paper_id))
        if os.path.exists(url_fn):
            os.rename(url_fn, old_url_fn(paper_id))
    if os.path.exists(doi_fn) or os.path.exists(txt_fn):
        # Already fetched (or plain text already extracted); skip.
        # return read_json(doi_fn)
        return None
    size, final_url = s2.fetch_doi(url, doi_fn)
    if size is None:
        print("{} empty?".format(paper_id))
        time.sleep(random.randint(2, 5))
        return None
    print("{} {} kb".format(paper_id, int(size / 1024)))
    domain = urlparse(final_url).netloc
    write_json(url_fn, {
        'paper_id': paper_id,
        'domain': domain,
    })
    # Throttle requests to be polite to the remote servers.
    time.sleep(random.randint(2, 5))
    return domain
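

# The helpers below shard per-paper files into
# ./datasets/s2/{doi,pdf}/<first two characters of the id>/<paper id>/,
# presumably to keep any single directory from growing too large
# (that rationale is an assumption, not documented in the original source).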
def make_doi_path(paper_id):
    return './datasets/s2/doi/{}/{}'.format(paper_id[0:2], paper_id)


def make_doi_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.doi'.format(paper_id[0:2], paper_id)


def make_url_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.url'.format(paper_id[0:2], paper_id)


def make_txt_fn(paper_id):
    return './datasets/s2/pdf/{}/{}/paper.txt'.format(paper_id[0:2], paper_id)


def old_doi_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.doi2'.format(paper_id[0:2], paper_id)


def old_url_fn(paper_id):
    return './datasets/s2/doi/{}/{}/paper.url2'.format(paper_id[0:2], paper_id)


if __name__ == '__main__':
    fetch_doi_list()
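
# Example invocation (assuming this file is saved as fetch_doi.py):
#   python fetch_doi.py --fn db_paper_doi.csv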