1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
import re
import os
import gzip
import glob
import json
import click
import operator
from util import *
DOI_DIR = 'datasets/s2/doi'
@click.command()
def doi_report():
    """Build HTML/CSV reports of DOI URL domains and IEEE author institutions.

    Walks every ``*.url`` file under ``DOI_DIR``, tallies how many papers
    resolve to each domain, and for IEEE-hosted papers extracts author
    affiliations (via :func:`load_ieee`) and tallies those too.  Writes:

    - ``reports/doi_domains.html``      -- domain counts, most frequent first
    - ``reports/doi_institutions.html`` -- institution counts; names already
      present in the geocode lookup are wrapped in ``BoldLine``
    - ``reports/doi_institutions.csv``  -- raw (name, count) rows
    """
    domains = {}
    institutions = {}
    geocode_lookup = load_geocode_lookup()
    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
        url_info = read_json(fn)
        domain = url_info['domain']
        paper_id = url_info['paper_id']
        doi_fn = fn.replace('.url', '.doi')
        institutions_fn = fn.replace('paper.url', 'institutions.json')
        # Count the domain before any skip so every paper is tallied.
        domains[domain] = domains.get(domain, 0) + 1
        # NOTE(review): papers that already have an institutions.json are
        # skipped entirely, so their previously-extracted affiliations never
        # reach this report's tallies -- confirm this is an intentional
        # resume/cache guard for the scrape step only.
        if os.path.exists(institutions_fn):
            continue
        if 'ieee' in domain:
            for affiliation in load_ieee(paper_id, doi_fn):
                institutions[affiliation] = institutions.get(affiliation, 0) + 1
    # Most-frequent first.
    domain_list = sorted(domains.items(), key=operator.itemgetter(1), reverse=True)
    institution_list = sorted(institutions.items(), key=operator.itemgetter(1), reverse=True)
    display_institution_list = []
    raw_institution_list = []
    for inst in institution_list:
        raw_institution_list.append(inst)
        name, count = inst
        parts = name.split(',')
        # Bold any institution we can geocode: match on the full name, or on
        # its first or second comma-separated component.
        if (name in geocode_lookup
                or parts[0] in geocode_lookup
                or (len(parts) > 1 and parts[1] in geocode_lookup)):
            display_institution_list.append((BoldLine(name), count,))
        else:
            display_institution_list.append(inst)
    write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
    write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
    write_csv('reports/doi_institutions.csv', keys=None, rows=raw_institution_list)
def dedupe(a):
    """Return the unique items of *a* as a new list, sorted ascending.

    Items must be hashable and mutually orderable.
    """
    # set() collapses duplicates; sorted() gives a deterministic order --
    # same result as the former dict-of-keys + sort, in one expression.
    return sorted(set(a))
def load_ieee(paper_id, fn):
    """Extract author affiliations from a scraped IEEE page.

    Parses the ``global.document.metadata`` JSON blob embedded in the saved
    IEEE HTML at *fn*, caches the parsed blob next to the paper as
    ``ieee.json``, writes an ``institutions.json`` summary into the paper's
    directory, and returns the list of affiliation strings (one per author).

    Returns an empty list if the paper is unknown or the page cannot be
    parsed (best-effort: parse failures are reported, not raised).
    """
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return []
    with open(fn, 'r') as f:
        try:
            # The metadata lives in an inline script tag:
            #   global.document.metadata={...};</script>
            # [:-1] strips the trailing ';' before JSON parsing.
            raw = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
            data = json.loads(raw)
        except (IndexError, ValueError):
            # IndexError: marker not found in the page.
            # ValueError: covers json.JSONDecodeError on a malformed blob.
            # (Was a bare except, which also hid write errors.)
            print('could not read data')
            return []
    write_json(fn.replace('paper.doi', 'ieee.json'), data)
    affiliations = [author['affiliation'] for author in data['authors']]
    institutions = [[paper_id, author['affiliation'], author['affiliation']] for author in data['authors']]
    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), {'institutions': institutions})
    return affiliations
def load_geocode_lookup():
    """Build a membership table of institution names already geocoded.

    Reads ``reports/institutions_found.csv`` and maps both the name in
    column 0 and the name in column 3 of every row to ``True``, so callers
    can use ``name in lookup`` tests.
    """
    lookup = {}
    for row in read_csv('reports/institutions_found.csv', keys=None):
        raw_name, resolved_name = row[0], row[3]
        lookup[raw_name] = True
        lookup[resolved_name] = True
    return lookup
def find_authors(authors, line):
    """Return the first author tuple whose lowercased name (index 2) occurs
    as a substring of *line*, or ``None`` if no author matches."""
    return next((author for author in authors if author[2] in line), None)
def paper_path(paper_id):
    """Return the on-disk directory for *paper_id*: ``DOI_DIR`` sharded by
    the id's first two characters."""
    shard = paper_id[:2]
    return '/'.join([DOI_DIR, shard, paper_id])
# Script entry point: click parses argv and invokes the command.
if __name__ == '__main__':
    doi_report()
|