import os
import glob
import operator
from urllib.parse import unquote  # used by the commented-out Elsevier re-fetch
from importlib import import_module

import click
import simplejson as json
from bs4 import BeautifulSoup

# provides read_json, write_json, load_paper, write_report, write_csv,
# AddressBook, BoldLine
from util import *

# The fetcher module has a dash in its name, so it can't be imported normally.
doi = import_module('s2-fetch-doi')

DOI_DIR = 'datasets/s2/doi'


@click.command()
def doi_report():
    """Walk the fetched DOI pages, extract author affiliations with a
    publisher-specific parser, and write HTML/CSV reports plus a per-paper
    institutions.json."""
    domains = {}
    institutions = {}
    # geocode_lookup = load_geocode_lookup()
    addresses = AddressBook()
    geocoded_papers = []
    unknown_papers = []
    unattributed_papers = []
    paper_count = 0
    ieee_count = 0
    springer_count = 0
    sciencedirect_count = 0
    acm_count = 0
    computerorg_count = 0
    elsevier_count = 0
    unparsed_count = 0
    for fn in glob.iglob('{}/**/*.url'.format(DOI_DIR), recursive=True):
        paper_count += 1
        url_info = read_json(fn)
        domain = url_info['domain']
        paper_id = url_info['paper_id']
        paper = load_paper(paper_id)
        if paper is None or paper.data is None:
            continue
        doi_fn = fn.replace('.url', '.doi')
        address = None
        domains[domain] = domains.get(domain, 0) + 1
        affiliations = None
        paper_affiliation_count = 0
        # Dispatch to a publisher-specific parser based on the DOI domain.
        if 'ieee.org' in domain:
            ieee_count += 1
            affiliations = load_ieee(paper, doi_fn)
        elif 'link.springer.com' in domain:
            springer_count += 1
            affiliations = load_springer(paper, doi_fn)
        elif 'sciencedirect.com' in domain:
            sciencedirect_count += 1
            affiliations = load_sciencedirect(paper, doi_fn)
        elif 'acm.org' in domain:
            acm_count += 1
            affiliations = load_acm(paper, doi_fn)
        elif 'computer.org' in domain:
            computerorg_count += 1
            affiliations = load_computerorg(paper, doi_fn)
        elif 'elsevier.com' in domain:
            elsevier_count += 1
            affiliations = load_elsevier(paper, doi_fn)
        else:
            unparsed_count += 1
        if affiliations:
            for affiliation in affiliations:
                if affiliation:
                    paper_affiliation_count += 1
                    institutions[affiliation] = institutions.get(affiliation, 0) + 1
                    address = addresses.find(affiliation)
                    if not address:
                        unknown_papers.append([paper.paper_id, paper.title, affiliation])
        if paper_affiliation_count == 0:
            unattributed_papers.append([paper.paper_id, paper.title])
        # Note: only the last affiliation's address is kept for the geocoded row.
        if address:
            geocoded_papers.append([paper.paper_id, paper.title] + address)
    domain_list = sorted(domains.items(), key=operator.itemgetter(1), reverse=True)
    # for domain, count in domain_list:
    #     print('{}\t{}'.format(count, domain))
    institution_list = sorted(institutions.items(), key=operator.itemgetter(1), reverse=True)
    # for institution, count in institution_list:
    #     print('{}\t{}'.format(count, institution))
    display_institution_list = []
    unknown_institution_list = []
    for inst in institution_list:
        addr = addresses.find(inst[0])
        if addr:
            display_institution_list.append((BoldLine(inst[0]), inst[1],))
        elif len(inst[0]) > 1:
            display_institution_list.append(inst)
            unknown_institution_list.append(inst)
    write_report('reports/doi_domains.html', title='DOI Domains',
                 keys=None, rows=domain_list)
    write_report('reports/doi_institutions.html', title='Institutions from DOI',
                 keys=None, rows=display_institution_list)
    write_report('reports/doi_institutions_unknown.html', title='Unknown Institutions from DOI',
                 keys=None, rows=unknown_institution_list)
    write_csv('reports/doi_institutions_geocoded.csv', keys=None, rows=geocoded_papers)
    write_csv('reports/doi_institutions_unknown.csv', keys=None, rows=unknown_papers)
    write_csv('reports/doi_institutions_unattributed.csv', keys=None, rows=unattributed_papers)
    print("total papers: {}".format(paper_count))
    print(".. ieee: {}".format(ieee_count))
    print(".. springer: {}".format(springer_count))
    print(".. acm: {}".format(acm_count))
    print(".. computerorg: {}".format(computerorg_count))
    print(".. sciencedirect: {}".format(sciencedirect_count))
    print(".. elsevier: {}".format(elsevier_count))
    print(".. unparsed: {}".format(unparsed_count))
    print("geocoded papers: {}".format(len(geocoded_papers)))
    print("unknown papers: {}".format(len(unknown_papers)))
    print("unattributed papers: {}".format(len(unattributed_papers)))
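
# For reference: each `*.url` sidecar read above is JSON with at least
# 'domain' and 'paper_id' (any further fields are an assumption), e.g.
#
#     {"paper_id": "0a1b2c...", "domain": "ieee.org"}
#
# and each per-paper institutions.json written by the load_* parsers below
# has the shape
#
#     {"institutions": [["<paper_id>", "<affiliation>", ...], ...]}
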
computerorg: {}".format(computerorg_count)) print(".. sciencedirect: {}".format(sciencedirect_count)) print(".. elsevier: {}".format(elsevier_count)) print(".. unparsed: {}".format(unparsed_count)) print("geocoded papers: {}".format(len(geocoded_papers))) print("unknown papers: {}".format(len(unknown_papers))) print("unattributed papers: {}".format(len(unattributed_papers))) def load_ieee(paper, fn): with open(fn, 'r') as f: try: data = f.read().split('global.document.metadata=')[1].split('')[0].strip()[:-1] data = json.loads(data) write_json(fn.replace('paper.doi', 'ieee.json'), data) # print(data) except: print('ieee: could not read data') return None if 'authors' in data: affiliations = [ author['affiliation'] for author in data['authors'] if 'affiliation' in author ] institutions = [ [ paper.paper_id, author['affiliation'], author['affiliation'] ] for author in data['authors'] if 'affiliation' in author ] # print(affiliations) write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions }) return affiliations return None def load_springer(paper, fn): # print('springer: {}'.format(paper.paper_id)) with open(fn, 'r') as f: try: soup = BeautifulSoup(f.read(), 'html.parser') except: print('springer: could not read data') return None items = soup.find_all(class_='affiliation__item') affiliations = [ ', '.join(item.strings) for item in items ] institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ] write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions }) return affiliations def load_sciencedirect(paper, fn): # print('sciencedirect: {}'.format(paper.paper_id)) with open(fn, 'r') as f: try: soup = BeautifulSoup(f.read(), 'html.parser') except: print('sciencedirect: could not read data') return None items = soup.find_all("script", type='application/json', limit=1) if len(items) == 0: return None try: data = json.loads(items[0].string) write_json(fn.replace('paper.doi', 'sciencedirect.json'), data) # print(data) except: print('sciencedirect: json error') return None affiliations = [value['$$'][0]['_'] for value in data['authors']['affiliations'].values()] institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ] write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions }) return affiliations def load_acm(paper, fn): # print('acm: {}'.format(paper.paper_id)) with open(fn, 'r') as f: try: soup = BeautifulSoup(f.read(), 'html.parser') except: print('acm: could not read data') return None items = soup.find_all("a", title='Institutional Profile Page') affiliations = [ item.string for item in items ] # print(affiliations) institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ] write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 'institutions': institutions }) return affiliations def load_computerorg(paper, fn): # print('computerorg: {}'.format(paper.paper_id)) # if not os.path.exists(doi.old_doi_fn(fn)): pass # with open(fn, 'r') as f: # try: # soup = BeautifulSoup(f.read(), 'html.parser') # except: # print('computerorg: could not read data') # return None # items = soup.find_all("a", title='Institutional Profile Page') # affiliations = [ item.string for item in items ] # print(affiliations) # institutions = [ [ paper.paper_id, affiliation ] for affiliation in affiliations ] # write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'), { 
def load_elsevier(paper, fn):
    """Elsevier DOI pages are redirect interstitials: re-fetch via redirectURL.
    Affiliation parsing is still a commented-out draft mirroring load_acm."""
    print('elsevier: {}'.format(paper.paper_id))
    if not os.path.exists(doi.old_doi_fn(paper.paper_id)):
        with open(fn, 'r') as f:
            try:
                soup = BeautifulSoup(f.read(), 'html.parser')
            except Exception:
                print('elsevier: could not read data')
                return None
        items = soup.find_all("input", attrs={"name": 'redirectURL'})
        if not items:
            # No interstitial form found; nothing to follow.
            return None
        # new_url = unquote(items[0]['value'])
        # if new_url:
        #     print(new_url)
        #     doi.fetch_doi(paper.paper_id, new_url, replace=True)
        # else:
        #     print("missing redirect url: {}".format(paper.paper_id))
    # Draft affiliation parsing, identical to load_acm, to enable once the
    # redirected pages have been fetched:
    # with open(fn, 'r') as f:
    #     try:
    #         soup = BeautifulSoup(f.read(), 'html.parser')
    #     except Exception:
    #         print('elsevier: could not read data')
    #         return None
    # items = soup.find_all("a", title='Institutional Profile Page')
    # affiliations = [item.string for item in items]
    # # print(affiliations)
    # institutions = [[paper.paper_id, affiliation] for affiliation in affiliations]
    # write_json('{}/{}'.format(paper_path(paper.paper_id), 'institutions.json'),
    #            {'institutions': institutions})
    # return affiliations
    return None


def find_authors(authors, line):
    """Return the first author row whose affiliation (index 2) appears in line."""
    for a in authors:
        if a[2] in line:
            return a
    return None


def paper_path(paper_id):
    """Per-paper directory, sharded by the first two characters of the id."""
    return '{}/{}/{}'.format(DOI_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    doi_report()
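
# Usage sketch (the script filename is an assumption; run from the repo root so
# the relative 'datasets/s2/doi' and 'reports/' paths resolve):
#
#     $ python s2-doi-report.py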