import os
import glob
import json
import operator

import click

from util import *

DOI_DIR = 'datasets/s2/doi'


@click.command()
def doi_report():
    """Summarize DOI landing-page domains and, for IEEE papers, author affiliations."""
    domains = {}
    institutions = {}
    # geocode_lookup = load_geocode_lookup()
    addresses = AddressBook()
    geocoded_papers = []
    unknown_papers = []
    unattributed_papers = []
    paper_count = 0
    ieee_count = 0
    unparsed_count = 0
    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
        paper_count += 1
        url_info = read_json(fn)
        domain = url_info['domain']
        paper_id = url_info['paper_id']
        paper = load_paper(paper_id)
        if paper is None:
            # Without paper metadata we cannot report a title/id below.
            continue
        doi_fn = fn.replace('.url', '.doi')
        # institutions_fn = fn.replace('paper.url', 'institutions.json')
        address = None
        domains[domain] = domains.get(domain, 0) + 1
        # if not os.path.exists(institutions_fn):
        #     continue
        paper_affiliation_count = 0
        if 'ieee' in domain:
            ieee_count += 1
            affiliations = load_ieee(paper_id, doi_fn)
            for affiliation in affiliations:
                if not affiliation:
                    continue
                paper_affiliation_count += 1
                institutions[affiliation] = institutions.get(affiliation, 0) + 1
                found = addresses.find(affiliation)
                if found:
                    # Keep the most recent affiliation that geocodes, so one
                    # unresolvable co-author does not drop the whole paper.
                    address = found
                else:
                    unknown_papers.append([paper.paper_id, paper.title, affiliation])
            if paper_affiliation_count == 0:
                unattributed_papers.append([paper.paper_id, paper.title])
        else:
            unparsed_count += 1
        if address:
            geocoded_papers.append([paper.paper_id, paper.title] + address)

    domain_list = sorted(domains.items(), key=operator.itemgetter(1), reverse=True)
    # for domain, count in domain_list:
    #     print('{}\t{}'.format(count, domain))
    institution_list = sorted(institutions.items(), key=operator.itemgetter(1), reverse=True)
    # for institution, count in institution_list:
    #     print('{}\t{}'.format(count, institution))

    # Bold the institutions that resolve to an address in the report.
    display_institution_list = []
    for name, count in institution_list:
        if addresses.find(name):
            display_institution_list.append((BoldLine(name), count))
        else:
            display_institution_list.append((name, count))

    write_report('reports/doi_domains.html', title='DOI Domains', keys=None, rows=domain_list)
    write_report('reports/doi_institutions.html', title='Institutions from IEEE', keys=None,
                 rows=display_institution_list)
    write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
    write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
    write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)

    print('total papers: {}'.format(paper_count))
    print('ieee papers: {}'.format(ieee_count))
    print('unparsed papers: {}'.format(unparsed_count))
    print('geocoded papers: {}'.format(len(geocoded_papers)))
    print('unknown papers: {}'.format(len(unknown_papers)))
    print('unattributed papers: {}'.format(len(unattributed_papers)))


def load_ieee(paper_id, fn):
    """Extract author affiliations from a saved IEEE landing page."""
    paper = load_paper(paper_id)
    if paper is None:
        print('{}: no paper found!'.format(paper_id))
        return []
    with open(fn, 'r') as f:
        # Prepared for find_authors(); currently unused here.
        authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
        try:
            # IEEE pages embed their metadata as a JSON literal assigned to
            # global.document.metadata inside a <script> tag. The closing
            # '</script>' delimiter is an assumption (the separator was lost
            # in the original); the trailing ';' is stripped before parsing.
            data = f.read().split('global.document.metadata=')[1].split('</script>')[0].strip()[:-1]
            data = json.loads(data)
            write_json(fn.replace('paper.doi', 'ieee.json'), data)
            # print(data)
        except (IndexError, ValueError):
            print('{}: could not read metadata'.format(paper_id))
            return []
        affiliations = [author['affiliation'] for author in data['authors']]
        institutions = [[paper_id, author['affiliation'], author['affiliation']]
                        for author in data['authors']]
        # print(affiliations)
        write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'),
                   {'institutions': institutions})
        return affiliations


def find_authors(authors, line):
    """Return the (first, last, lowercased last) author tuple whose last name appears in line."""
    for a in authors:
        if a[2] in line:
            return a
    return None


def paper_path(paper_id):
    return '{}/{}/{}'.format(DOI_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    doi_report()