summaryrefslogtreecommitdiff
path: root/s2-doi-report.py
diff options
context:
space:
mode:
Diffstat (limited to 's2-doi-report.py')
-rw-r--r--s2-doi-report.py80
1 files changed, 31 insertions, 49 deletions
diff --git a/s2-doi-report.py b/s2-doi-report.py
index d2facb92..611b6391 100644
--- a/s2-doi-report.py
+++ b/s2-doi-report.py
@@ -14,27 +14,45 @@ def doi_report():
rows = []
domains = {}
institutions = {}
- geocode_lookup = load_geocode_lookup()
+ # geocode_lookup = load_geocode_lookup()
+ addresses = AddressBook()
+
+ geocoded_papers = []
+ unknown_papers = []
+ unattributed_papers = []
for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
url_info = read_json(fn)
domain = url_info['domain']
paper_id = url_info['paper_id']
+ paper = load_paper(paper_id)
doi_fn = fn.replace('.url', '.doi')
- institutions_fn = fn.replace('paper.url', 'institutions.json')
+ # institutions_fn = fn.replace('paper.url', 'institutions.json')
+ address = None
if domain in domains:
domains[domain] += 1
else:
domains[domain] = 1
- if os.path.exists(institutions_fn):
- continue
+ # if not os.path.exists(institutions_fn):
+ # continue
+ paper_affiliation_count = 0
if 'ieee' in domain:
affiliations = load_ieee(paper_id, doi_fn)
for affiliation in affiliations:
- if affiliation in institutions:
- institutions[affiliation] += 1
- else:
- institutions[affiliation] = 1
+ if affiliation:
+ paper_affiliation_count += 1
+ if affiliation in institutions:
+ institutions[affiliation] += 1
+ else:
+ institutions[affiliation] = 1
+ address = addresses.find(affiliation)
+ if not address:
+ unknown_papers.append([paper.paper_id, paper.title, affiliation])
+ if paper_affiliation_count == 0:
+ unattributed_papers.append([paper.paper_id, paper.title])
+ if address:
+ geocoded_papers.append([paper.paper_id, paper.title] + address)
+
domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1)))
# for domain, count in domain_list:
# print('{}\t{}'.format(count, domain))
@@ -44,43 +62,16 @@ def doi_report():
display_institution_list = []
raw_institution_list = []
for inst in institution_list:
- raw_institution_list.append(inst)
- if inst[0] in geocode_lookup:
- display_institution_list.append((BoldLine(inst[0]), inst[1],))
- continue
- inst_parts = inst[0].split(',')
- if inst_parts[0] in geocode_lookup:
- display_institution_list.append((BoldLine(inst[0]), inst[1],))
- elif len(inst_parts) > 1 and inst_parts[1] in geocode_lookup:
+ addr = addresses.find(inst[0])
+ if addr:
display_institution_list.append((BoldLine(inst[0]), inst[1],))
else:
display_institution_list.append(inst)
write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
- write_csv('reports/doi_institutions.csv', keys=None, rows=raw_institution_list)
-
- # print(domain_list)
- # rows.append(data['first_pages'])
- # if data['institutions']:
- # for institution in data['institutions']:
- # institutions.append(institution)
- # institution_names.append(institution[1])
- # if data['no_institutions']:
- # no_institutions.append(data['no_institutions'])
- # deduped_institutions = dedupe(institution_names)
-
- # write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
- # write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
- # write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
- # write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
- # print("{} deduped institutions".format(len(deduped_institutions)))
-
-def dedupe(a):
- p = {}
- for s in a:
- p[s] = None
- ss = sorted(p.keys())
- return ss
+ write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
+ write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
+ write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)
def load_ieee(paper_id, fn):
paper = load_paper(paper_id)
@@ -103,15 +94,6 @@ def load_ieee(paper_id, fn):
write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
return affiliations
-def load_geocode_lookup():
- insts = read_csv('reports/institutions_found.csv', keys=None)
- lookup = {}
- for inst in insts:
- # print(inst)
- lookup[inst[0]] = True
- lookup[inst[3]] = True
- return lookup
-
def find_authors(authors, line):
for a in authors:
if a[2] in line: