1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
import re
import os
import gzip
import glob
import simplejson as json
import click
import math
import string
from util import *
# Root directory of extracted PDF text, sharded as <PDF_DIR>/<id[:2]>/<paper_id>/.
PDF_DIR = 'datasets/s2/pdf'
@click.command()
def report_first_pages():
    """Scan every extracted first-page ``*.txt`` under ``PDF_DIR`` and emit
    HTML reports of first pages, detected institutions, and papers with no
    detected institution, plus a CSV of deduplicated institution names.
    """
    rows = []
    institution_names = []
    institutions = []
    no_institutions = []
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        data = process_paper(fn)
        if data is None:
            # process_paper returns None when no paper metadata exists for
            # this file; skip it instead of crashing on data['first_pages'].
            continue
        rows.append(data['first_pages'])
        if data['institutions']:
            for institution in data['institutions']:
                institutions.append(institution)
                # institution is [paper_id, name, department]
                institution_names.append(institution[1])
        if data['no_institutions']:
            no_institutions.append(data['no_institutions'])
    deduped_institutions = dedupe(institution_names)
    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    # Sort institutions report by institution name (second column).
    write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
    write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
    write_csv('reports/institution_names_extracted.csv', keys=None, rows=[(name,) for name in deduped_institutions])
    print("{} deduped institutions".format(len(deduped_institutions)))
def dedupe(a):
    """Return the distinct values of *a* in ascending sorted order."""
    return sorted(set(a))
def _clean_institution(line):
    """Normalize one affiliation line; return ``(institution, department)``.

    Strips leading/trailing punctuation and digits, collapses whitespace,
    expands "Dept." to "Department", drops honorific/metadata parts, and
    replaces non-ASCII characters (PDF extraction artifacts) with spaces.
    Returns ``('', '')`` when the normalized line is 160+ chars (too long
    to plausibly be an affiliation).
    """
    inst = re.sub(r'^[\W\d]+', '', line)
    inst = re.sub(r'[\W\d]+$', '', inst)
    inst = re.sub(r'\s+', ' ', inst)
    # Bug fix: the original pattern r'Dept.' left the '.' unescaped (it
    # matched "Dept" plus ANY character, clobbering e.g. "Depts"/"Depth")
    # and the replacement carried a trailing space, producing
    # "Department  of". Escape the dot and drop the extra space.
    inst = re.sub(r'Dept\.', 'Department', inst)
    if len(inst) >= 160:
        return '', ''
    inst = inst.replace('&', 'and')
    kept_parts = []
    department = ''
    for part in inst.split(','):
        part = part.strip()
        low = part.lower()
        # Skip honorifics ("Prof...") and article/metadata fragments.
        if 'prof' in low or 'article ' in low:
            continue
        if 'department' in low:
            department = part
        else:
            kept_parts.append(part)
    inst = ', '.join(kept_parts)
    # Replace non-ASCII bytes left over from PDF text extraction.
    inst = ''.join(c if ord(c) < 128 else ' ' for c in inst).strip()
    return inst, department


def process_paper(fn):
    """Parse one extracted first-page text file.

    Reads lines up to the abstract, classifying each as an e-mail, an
    author-name line, an institution line, or plain text.

    Side effect: writes ``institutions.json`` into the paper's directory.

    Returns a dict with keys ``first_pages``, ``institutions`` (or None
    when none were found) and ``no_institutions`` (or None when some were
    found), or ``None`` when no paper metadata exists for *fn*.
    """
    # Path layout is <PDF_DIR>/<shard>/<paper_id>/...; index 2 after
    # stripping the prefix is the paper id.
    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None
    inst_keywords = ('university', 'universiteit', 'research center',
                     'research lab', 'college', ', inc', 'institute')
    with open(fn, 'r') as f:
        lines = []
        emails = []
        institutions = []
        # (original_first, original_second, lowercased_second) per author.
        authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
        # NOTE(review): assumes paper.journal is always a string — would
        # raise AttributeError if it can be None; confirm upstream.
        journal = paper.journal.lower()
        found_authors = []
        for line in f:
            l = line.lower()
            if 'abstract' in l:
                break  # header region ends at the abstract
            if len(line) < 3:
                continue  # blank / single-char noise
            if journal and journal in l:
                continue  # running journal header/footer
            if '@' in line:
                emails.append(line)
                continue
            # Split on commas and " and " so each token can be tested as
            # an author name.
            names = [s.strip() for s in re.split(',| and ', l)]
            was_found = False
            for name in names:
                found = find_authors(authors, name)
                if found:
                    was_found = True
                    if found[0]:
                        found_authors.append(found)
            if was_found:
                continue
            if any(k in l for k in inst_keywords):
                inst, department = _clean_institution(line)
                if inst:
                    institutions.append([paper_id, inst, department])
                    lines.append(BoldLine(inst))
                continue
            lines.append(line)
    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
    return {
        'first_pages': [
            paper_id,
            lines,
            found_authors,
            emails,
        ],
        'institutions': None if not len(institutions) else institutions,
        'no_institutions': None if len(institutions) else [
            paper_id,
            lines,
        ],
    }
def find_authors(authors, line):
    """Return the first author tuple whose lowercased name (element 2)
    occurs as a substring of *line*, or ``None`` if no author matches."""
    return next((author for author in authors if author[2] in line), None)
def paper_path(paper_id):
    """Return the on-disk directory for *paper_id*, sharded by its first
    two characters: ``<PDF_DIR>/<id[:2]>/<paper_id>``."""
    return '/'.join([PDF_DIR, paper_id[:2], paper_id])
if __name__ == '__main__':
    # click parses CLI arguments and dispatches to the command.
    report_first_pages()
|