import glob
import operator
from urllib.parse import urlparse

import click

from util import *
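
# Note: read_json() and write_csv() come from util (not shown here). This
# script assumes read_json(path) returns the parsed JSON object, or None if
# the file cannot be read, and that write_csv(path, keys=..., rows=...)
# writes an optional header row (keys) followed by the data rows.
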
@click.command()
def s2_dump_pdf_urls():
    # Loop over every paper.json under ./datasets/s2, pick the best URL for
    # each paper (direct PDF, then IEEE, then DOI, then any other link), and
    # write the paper id / URL pairs to CSV. A separate script downloads the
    # URLs collected here.
    rows = []
    pdf_count = 0
    ieee_count = 0
    url_count = 0
    doi_count = 0
    empty_count = 0
    domains = {}
    pdf = []
    doi = []
    for fn in glob.iglob('./datasets/s2/*_papers/**/paper.json', recursive=True):
        if 'db_paper' in fn:
            row = process_db_paper(fn)
        elif 'raw_paper' in fn:
            row = process_raw_paper(fn)
        else:
            # Neither a db_paper nor a raw_paper path: skip it so `row` is
            # never carried over from a previous iteration.
            continue
        if row is not None:
            rows.append(row)
            if row[1] is not None:  # direct PDF URL
                pdf.append([row[0], row[1]])
                pdf_count += 1
            elif row[2] is not None:  # IEEE URL
                doi.append([row[0], row[2]])
                ieee_count += 1
            elif row[3] is not None:  # DOI URL
                doi.append([row[0], row[3]])
                doi_count += 1
            elif row[4] is not None:  # any other URL
                if 'pdf' not in row[4]:
                    doi.append([row[0], row[4]])
                url_count += 1
                domain = urlparse(row[4]).netloc
                if domain in domains:
                    domains[domain] += 1
                else:
                    domains[domain] = 1
            else:
                empty_count += 1

    print("Wrote {} rows".format(len(rows)))
    print("pdf count: {}".format(pdf_count))
    print("ieee count: {}".format(ieee_count))
    print("doi count: {}".format(doi_count))
    print("url count: {}".format(url_count))
    for domain, count in sorted(domains.items(), key=operator.itemgetter(1)):
        print(" -- {} - {}".format(domain, count))
    print("empty count: {}".format(empty_count))

    write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
    write_csv('db_paper_pdf.csv', keys=None, rows=pdf)
    write_csv('db_paper_doi.csv', keys=None, rows=doi)
    # write_csv('raw_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'DOI URL', 'Extra URL'], rows=rows)
    # write_csv('raw_paper_pdf.csv', keys=None, rows=pdf)
    # write_csv('raw_paper_doi.csv', keys=None, rows=doi)

def process_db_paper(fn):
    # Extract [paper_id, pdf_url, ieee_url, doi_url, extra_url] from a
    # db_paper record, preferring the Semantic Scholar PDF link.
    paper = read_json(fn)
    if paper is None:
        return None
    paper_id = paper['id']
    pdf_url = None
    ieee_url = None
    doi_url = None
    extra_url = None
    if paper['s2PdfUrl']:
        pdf_url = paper['s2PdfUrl']
    for url in paper['pdfUrls']:
        if 'ieeexplore.ieee.org' in url:
            ieee_url = url
        elif 'doi.org' in url:
            doi_url = url
        elif pdf_url is None and 'pdf' in url:
            pdf_url = url
        else:
            extra_url = url
    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
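
# For reference, a db_paper paper.json is assumed to look roughly like the
# following; only the fields read above are shown, and the values are
# illustrative rather than real records:
#
#   {
#       "id": "0123456789abcdef",
#       "s2PdfUrl": "https://example.org/0123/paper.pdf",
#       "pdfUrls": [
#           "https://ieeexplore.ieee.org/document/1234567",
#           "https://doi.org/10.1000/example",
#           "https://example.org/mirror/paper.pdf"
#       ]
#   }
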
def process_raw_paper(fn):
    # Same idea as process_db_paper(), but raw_paper records wrap the paper
    # in a top-level "paper" key and list their URLs under "links".
    data = read_json(fn)
    if data is None or 'paper' not in data:
        print(data)
        return None
    paper = data['paper']
    if paper is None:
        return None
    paper_id = paper['id']
    pdf_url = None
    ieee_url = None
    doi_url = None
    extra_url = None
    # primaryPaperLink may be missing or null, so guard before indexing it.
    if paper.get('primaryPaperLink') and 'url' in paper['primaryPaperLink']:
        primary_url = paper['primaryPaperLink']['url']
        if 'pdf' in primary_url:
            pdf_url = primary_url
        elif 'doi' in primary_url:
            doi_url = primary_url
    for link in paper['links']:
        url = link['url']
        if 'ieeexplore.ieee.org' in url:
            ieee_url = url
        elif 'doi.org' in url:
            doi_url = url
        elif pdf_url is None and 'pdf' in url:
            pdf_url = url
        else:
            extra_url = url
    return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
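
# Similarly, a raw_paper paper.json is assumed to have roughly this shape
# (illustrative values only):
#
#   {
#       "paper": {
#           "id": "0123456789abcdef",
#           "primaryPaperLink": {"url": "https://example.org/paper.pdf"},
#           "links": [
#               {"url": "https://ieeexplore.ieee.org/document/1234567"},
#               {"url": "https://doi.org/10.1000/example"}
#           ]
#       }
#   }
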
if __name__ == '__main__':
    s2_dump_pdf_urls()
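
# Typical invocation, assuming this file is saved as s2_dump_pdf_urls.py and
# run from the repository root so that ./datasets/s2 resolves:
#
#   python s2_dump_pdf_urls.py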