1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
import glob
import gzip
import json
import operator
import os
import re
from collections import Counter

import click

from util import *
# Root directory that doi_report() searches recursively for per-paper `*.url` files.
DOI_DIR = 'datasets/s2/doi'
def _is_geocoded(name, geocode_lookup):
    """Return True if *name* or one of its first two comma-separated segments is in *geocode_lookup*."""
    if name in geocode_lookup:
        return True
    # Only the first two segments are tried, as before.  Each one is also
    # tried stripped: splitting on ',' leaves the blank that follows the comma
    # attached to the next segment, which would otherwise never equal a clean
    # lookup key.
    for segment in name.split(',')[:2]:
        if segment in geocode_lookup or segment.strip() in geocode_lookup:
            return True
    return False


@click.command()
def doi_report():
    """Write the DOI domain report and the IEEE institution reports.

    Every ``*.url`` file below ``DOI_DIR`` is read with ``read_json`` and
    counted under its ``domain``.  For files whose sibling
    ``institutions.json`` does not exist and whose domain contains ``ieee``,
    the affiliations returned by ``load_ieee`` are counted as well.

    Files written:
        reports/doi_domains.html       (domain, count) rows, highest count first.
        reports/doi_institutions.html  (affiliation, count) rows; affiliations
                                       found in the geocode lookup are wrapped
                                       in ``BoldLine``.
        reports/doi_institutions.csv   the same rows, unformatted.
    """
    domains = Counter()
    institutions = Counter()
    geocode_lookup = load_geocode_lookup()

    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
        url_info = read_json(fn)
        domain = url_info['domain']
        paper_id = url_info['paper_id']
        doi_fn = fn.replace('.url', '.doi')
        institutions_fn = fn.replace('paper.url', 'institutions.json')

        # Every paper counts towards its domain, including the ones skipped below.
        domains[domain] += 1

        # Papers that already have an institutions.json are left out of the
        # affiliation tally (presumably handled by another step -- TODO confirm).
        if os.path.exists(institutions_fn):
            continue
        if 'ieee' in domain:
            # Drop None / empty affiliations: they carry no institution name
            # and a None key would crash the ``split(',')`` lookup further down.
            institutions.update(
                affiliation
                for affiliation in load_ieee(paper_id, doi_fn)
                if affiliation
            )

    # reversed(sorted(...)) instead of sorted(..., reverse=True): the latter
    # would change the relative order of entries with equal counts.
    by_count = operator.itemgetter(1)
    domain_list = reversed(sorted(domains.items(), key=by_count))
    raw_institution_list = list(reversed(sorted(institutions.items(), key=by_count)))

    display_institution_list = [
        (BoldLine(name), count) if _is_geocoded(name, geocode_lookup) else (name, count)
        for name, count in raw_institution_list
    ]

    write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
    write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
    write_csv('reports/doi_institutions.csv', keys=None, rows=raw_institution_list)
def dedupe(a):
    """Return the distinct items of *a* as a sorted list.

    Items must be hashable and mutually orderable.
    """
    # A dict keyed by the items collapses duplicates; sorting a dict sorts its keys.
    unique = dict.fromkeys(a)
    return sorted(unique)
def load_ieee(paper_id, fn):
    """Extract the author affiliations embedded in a saved IEEE page.

    The text of *fn* between ``global.document.metadata=`` and the next
    ``</script>`` is parsed as JSON.  The parsed metadata is also saved with
    ``write_json`` to *fn* with ``paper.doi`` replaced by ``ieee.json``.

    Args:
        paper_id: Identifier handed to ``load_paper``; nothing is read from
            *fn* when that lookup returns None.
        fn: Path of the saved page.

    Returns:
        A list holding the non-empty ``affiliation`` value of each entry in
        the metadata's ``authors`` list.  The list is empty when the paper is
        unknown, when the metadata cannot be extracted, parsed or saved, or
        when no author carries an affiliation.
    """
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return []

    with open(fn, 'r') as f:
        try:
            page = f.read()
            # Keep what sits between the assignment and the closing tag; the
            # last character is dropped (presumably the statement's trailing
            # ';' -- TODO confirm against a sample page).
            raw = page.split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
            data = json.loads(raw)
            write_json(fn.replace('paper.doi', 'ieee.json'), data)
        except Exception:
            # Deliberately best-effort: one unparsable page must not abort the
            # whole report.  ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt and SystemExit still propagate.
            print('could not read data')
            return []

    # Metadata without an ``authors`` list, or authors without an
    # ``affiliation``, are skipped rather than raising KeyError.
    authors = data.get('authors') if isinstance(data, dict) else None
    return [
        author['affiliation']
        for author in (authors or [])
        if isinstance(author, dict) and author.get('affiliation')
    ]
def load_geocode_lookup():
    """Build a membership table from ``reports/institutions_found.csv``.

    Returns:
        A dict mapping the values found in columns 0 and 3 of every CSV row to
        ``True`` (presumably an institution name and an alternate form of it
        -- confirm against the CSV).
    """
    found_rows = read_csv('reports/institutions_found.csv', keys=None)
    # Column 0 first, then column 3, row by row.
    names = (row[column] for row in found_rows for column in (0, 3))
    return dict.fromkeys(names, True)
class NameLine:
    """A piece of text that renders as a ``<span class="name">`` HTML element."""

    def __init__(self, s):
        # Surrounding whitespace is removed once, at construction time.
        trimmed = s.strip()
        self.s = trimmed

    def __str__(self):
        return ''.join(('<span class="name">', self.s, '</span>'))
class BoldLine:
    """A piece of text that renders as a ``<b>`` HTML element."""

    def __init__(self, s):
        # Surrounding whitespace is removed once, at construction time.
        trimmed = s.strip()
        self.s = trimmed

    def __str__(self):
        return ''.join(('<b>', self.s, '</b>'))
def find_authors(authors, line):
    """Return the first entry of *authors* whose item at index 2 occurs in *line*.

    Args:
        authors: Iterable of indexable records; index 2 is the value searched
            for (presumably a lower-cased author name -- verify with callers).
        line: Container tested with ``in``, e.g. a line of text.

    Returns:
        The first matching record, or None when nothing matches.
    """
    matching = (record for record in authors if record[2] in line)
    return next(matching, None)
# Script entry point: invoke the click command when this file is executed directly.
if __name__ == '__main__':
    doi_report()
|