1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
import os
import csv
import codecs
import simplejson as json
def read_citation_list(index=0):
    """Load a citations CSV and split it into its header row and data rows.

    Args:
        index: Selects the file to read. ``0`` (the default) reads
            ``./datasets/citations.csv``; a positive value ``n`` reads
            ``./datasets/citations-n.csv`` instead.

    Returns:
        A ``(keys, lines)`` tuple: ``keys`` is the first row of the file and
        ``lines`` is a list of every remaining row (each row a list of
        strings).

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the file contains no rows at all.
    """
    filename = './datasets/citations.csv'
    if index > 0:
        fn, ext = os.path.splitext(filename)
        filename = fn + '-' + str(index) + ext
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are returned intact.  The explicit
    # encoding matches read_csv()/write_csv() instead of the platform default.
    with open(filename, 'r', newline='', encoding='utf-8') as f:
        lines = list(csv.reader(f))
    keys = lines[0]
    return keys, lines[1:]
def unfussy_reader(reader):
    """Yield rows from ``reader``, skipping any row that raises ``csv.Error``.

    Args:
        reader: An iterator (for example a ``csv.reader``) that is advanced
            with ``next()``.

    Yields:
        Every row the reader produces without error, in order.  When
        advancing the reader raises ``csv.Error`` the error is printed and
        iteration carries on with the following row.
    """
    while True:
        try:
            row = next(reader)
        except StopIteration:
            return
        except csv.Error as err:
            # Print the error that actually occurred; printing the exception
            # class itself says nothing about what went wrong.
            print(err)
            continue
        # Yield outside the try block so that only failures raised while
        # advancing the reader are treated as skippable rows.
        yield row
def read_csv(fn, keys=True, create=False):
    """Read a UTF-8 CSV file, skipping rows the csv module cannot parse.

    Args:
        fn: Path of the CSV file to read.
        keys: When true (the default) the first row is treated as a header
            and returned separately from the data rows.
        create: When true, a file that cannot be opened -- or, with ``keys``
            set, a file that contains no rows -- yields ``[]`` instead of
            raising.

    Returns:
        ``(header, rows)`` when ``keys`` is true, otherwise the list of all
        rows.  In the ``create`` fallback case the result is a bare ``[]``
        even when ``keys`` is true.

    Raises:
        OSError: If the file cannot be opened and ``create`` is false.
        IndexError: If ``keys`` is true, the file has no rows and ``create``
            is false.
    """
    try:
        with open(fn, 'r', newline='', encoding='utf-8') as f:
            lines = list(unfussy_reader(csv.reader(f)))
    except OSError:
        # Only a file that cannot be opened or read counts as "nothing there
        # yet".  Other failures (decode errors, KeyboardInterrupt, ...) are
        # no longer reported as an empty result; they propagate.
        if create:
            return []
        raise
    if not keys:
        return lines
    if not lines and create:
        # Existing but empty file: same fallback as a missing one.
        return []
    return lines[0], lines[1:]
def write_csv(fn, keys, rows):
    """Write ``rows`` to a UTF-8 CSV file at ``fn``, replacing its contents.

    Args:
        fn: Destination path.
        keys: Header row written first, or ``None`` to write no header.
        rows: Iterable of rows, each written as one CSV record.
    """
    with open(fn, 'w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        if keys is not None:
            csv_out.writerow(keys)
        csv_out.writerows(rows)
def read_json(fn):
    """Parse the JSON document stored at ``fn`` and return the result.

    Args:
        fn: Path of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened.  Errors raised by
        ``json.load`` for malformed input propagate unchanged.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(fn, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
def write_json(fn, data):
    """Serialise ``data`` as JSON into the file at ``fn``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(fn, 'w') as sink:
        json.dump(data, fp=sink)
def write_report(fn, title=None, keys=None, rows=None):
    """Write an HTML report consisting of a single table to ``fn``.

    Values are inserted into the markup as-is (no HTML escaping); callers
    passing untrusted text must escape it themselves.

    Args:
        fn: Output path; any existing content is replaced.
        title: Optional text used for both the ``<title>`` and an ``<h2>``.
        keys: Optional iterable of column headings, one ``<th>`` each.
        rows: Iterable of rows, each an iterable of cells.  A list or tuple
            cell is rendered with its items joined by ``<br/>``.  A ``None``
            row ends the table early.  Defaults to no rows.

    Side effects:
        Prints ``"<fn> <number of rows written>"`` when done.
    """
    if rows is None:
        # Avoid a shared mutable default argument.
        rows = ()
    count = 0
    with open(fn, 'w') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        if title is not None:
            f.write("<title>{}</title>".format(title))
        f.write("<link rel='stylesheet' href='reports.css'>")
        f.write("</head>")
        f.write("<body>")
        if title is not None:
            f.write("<h2>{}</h2>".format(title))
        f.write("<table border='1' cellpadding='3' cellspacing='3'>")
        if keys is not None:
            for key in keys:
                f.write("<th>{}</th>".format(key))
        for row in rows:
            if row is None:
                # Stop emitting rows but still close the document and print
                # the summary; returning here left the HTML unterminated.
                break
            count += 1
            f.write("<tr>")
            for cell in row:
                if isinstance(cell, (list, tuple)):
                    cell = '<br/>'.join(str(x) for x in cell)
                f.write("<td>{}</td>".format(cell))
            f.write("</tr>")
        f.write("</table>")
        f.write("</body>")
        f.write("</html>")
    print("{} {}".format(fn, count))
def paper_path(key='papers', paper_id=''):
    """Build the path of a paper's JSON file under ``./datasets/s2``.

    The layout is ``./datasets/s2/<key>/<first two characters of
    paper_id>/<paper_id>/paper.json``.
    """
    shard = paper_id[:2]
    return './datasets/s2/{}/{}/{}/paper.json'.format(key, shard, paper_id)
class DbPaper(object):
    """A paper whose JSON record lives in the ``db_papers`` tree."""

    def __init__(self, paper_id):
        """Read and keep the ``db_papers`` JSON record for ``paper_id``."""
        self.paper_id = paper_id
        json_path = paper_path('db_papers', paper_id)
        self.data = read_json(json_path)

    @property
    def title(self):
        """The record's ``title`` value."""
        return self.data['title']

    @property
    def journal(self):
        """The record's ``journalName`` value."""
        return self.data['journalName']

    @property
    def authors(self):
        """List of ``(first id or '', name)`` tuples, one per author entry."""
        pairs = []
        for entry in self.data['authors']:
            ids = entry['ids']
            first_id = ids[0] if len(ids) else ''
            pairs.append((first_id, entry['name']))
        return pairs
class RawPaper(object):
    """A paper whose JSON record lives in the ``raw_papers`` tree.

    The usable record is the value stored under the ``'paper'`` key of the
    JSON file.  When that key is missing, ``data`` is set to ``None`` so
    callers can detect the unusable record.
    """

    def __init__(self, paper_id):
        """Read the ``raw_papers`` JSON record for ``paper_id``."""
        self.paper_id = paper_id
        data = read_json(paper_path('raw_papers', paper_id))
        if 'paper' not in data:
            print(data)
            # Always define the attribute; leaving it unset made any later
            # access fail with AttributeError.
            self.data = None
            return
        self.data = data['paper']

    @property
    def title(self):
        """The text of the record's title."""
        return self.data['title']['text']

    @property
    def journal(self):
        """The name of the record's journal."""
        return self.data['journal']['name']

    @property
    def authors(self):
        """List of ``(first id or '', name)`` tuples, one per author entry.

        Each author entry is a sequence whose first element holds the
        ``ids`` and ``name`` values.  An author without ids yields ``''``
        for the id, the same fallback ``DbPaper.authors`` uses.
        """
        pairs = []
        for author in self.data['authors']:
            info = author[0]
            ids = info['ids']
            pairs.append((ids[0] if ids else '', info['name']))
        return pairs
def load_paper(paper_id):
    """Load the stored record for ``paper_id``, preferring ``db_papers``.

    Returns:
        A ``DbPaper`` if a ``db_papers`` file exists for the id, otherwise a
        ``RawPaper`` if a ``raw_papers`` file exists; otherwise prints
        ``'no paper'`` and returns ``None``.
    """
    sources = (('db_papers', DbPaper), ('raw_papers', RawPaper))
    for key, paper_cls in sources:
        if os.path.exists(paper_path(key, paper_id)):
            return paper_cls(paper_id)
    print('no paper')
    return None
|