import re
import os
import gzip
import glob
import json
import click
import operator

from util import *

DOI_DIR = 'datasets/s2/doi'


@click.command()
def doi_report():
    rows = []
    domains = {}
    institutions = {}
    geocode_lookup = load_geocode_lookup()

    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
        url_info = read_json(fn)
        domain = url_info['domain']
        paper_id = url_info['paper_id']
        doi_fn = fn.replace('.url', '.doi')
        institutions_fn = fn.replace('paper.url', 'institutions.json')

        # Count papers per publisher domain.
        if domain in domains:
            domains[domain] += 1
        else:
            domains[domain] = 1

        # Skip papers whose institutions have already been extracted.
        if os.path.exists(institutions_fn):
            continue

        # Affiliation extraction is currently only implemented for IEEE pages.
        if 'ieee' in domain:
            affiliations = load_ieee(paper_id, doi_fn)
            for affiliation in affiliations:
                if affiliation in institutions:
                    institutions[affiliation] += 1
                else:
                    institutions[affiliation] = 1

    domain_list = reversed(sorted(domains.items(), key=operator.itemgetter(1)))
    # for domain, count in domain_list:
    #     print('{}\t{}'.format(count, domain))

    institution_list = reversed(sorted(institutions.items(), key=operator.itemgetter(1)))
    # for institution, count in institution_list:
    #     print('{}\t{}'.format(count, institution))

    # Bold any institution that already has a geocode entry, matching on the full
    # name first and then on its comma-separated parts.
    display_institution_list = []
    raw_institution_list = []
    for inst in institution_list:
        raw_institution_list.append(inst)
        if inst[0] in geocode_lookup:
            display_institution_list.append((BoldLine(inst[0]), inst[1],))
            continue
        inst_parts = inst[0].split(',')
        if inst_parts[0] in geocode_lookup:
            display_institution_list.append((BoldLine(inst[0]), inst[1],))
        elif len(inst_parts) > 1 and inst_parts[1] in geocode_lookup:
            display_institution_list.append((BoldLine(inst[0]), inst[1],))
        else:
            display_institution_list.append(inst)

    write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
    write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None, rows=display_institution_list)
    write_csv('reports/doi_institutions.csv', keys=None, rows=raw_institution_list)

    # print(domain_list)
    # rows.append(data['first_pages'])
    # if data['institutions']:
    #     for institution in data['institutions']:
    #         institutions.append(institution)
    #         institution_names.append(institution[1])
    # if data['no_institutions']:
    #     no_institutions.append(data['no_institutions'])
    # deduped_institutions = dedupe(institution_names)
    # write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    # write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
    # write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
    # write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
    # print("{} deduped institutions".format(len(deduped_institutions)))


def dedupe(a):
    # Return the unique values of a, sorted.
    p = {}
    for s in a:
        p[s] = None
    ss = sorted(p.keys())
    return ss


def load_ieee(paper_id, fn):
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return []
    with open(fn, 'r') as f:
        authors = [(a[0], a[1], a[1].lower(),) for a in paper.authors]
        try:
            # The IEEE page embeds its metadata as a JavaScript assignment inside a
            # <script> tag; take the JSON between 'global.document.metadata=' and the
            # closing </script> tag, dropping the trailing semicolon.
            data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
            data = json.loads(data)
            write_json(fn.replace('paper.doi', 'ieee.json'), data)
            # print(data)
        except (IndexError, ValueError):
            print('could not read data for {}'.format(paper_id))
            return []
    affiliations = [author['affiliation'] for author in data['authors']]
    # print(affiliations)
    return affiliations


def load_geocode_lookup():
    # Build a lookup of institution names (and their geocoded aliases) from the
    # previously generated institutions_found.csv report.
    insts = read_csv('reports/institutions_found.csv', keys=None)
    lookup = {}
    for inst in insts:
        # print(inst)
        lookup[inst[0]] = True
        lookup[inst[3]] = True
    return lookup


class NameLine(object):
    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        return self.s


class BoldLine(object):
    # Renders a value in bold in the HTML reports.
    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        return '<b>' + self.s + '</b>'


def find_authors(authors, line):
    # Return the first author whose lowercased name appears in the line.
    for a in authors:
        if a[2] in line:
            return a
    return None


if __name__ == '__main__':
    doi_report()