"""Tally the ``domain`` field of every ``*.url`` record and print the counts."""

import glob
import gzip
import json
import operator
import os
import re
from collections import Counter

import click

from util import *

DOI_DIR = 'datasets/s2/doi'


@click.command()
def doi_report():
    """Print (domain, count) pairs for all ``*.url`` files, rarest first."""
    # NOTE(review): DOI_DIR is defined above but the glob walks PDF_DIR
    # (presumably supplied by ``from util import *``) -- confirm which
    # directory this report is meant to scan.
    pattern = '{}/**/*.url'.format(PDF_DIR)

    # Each file is loaded with util's read_json; only its 'domain' key is used.
    domains = Counter(
        read_json(fn)['domain']
        for fn in glob.iglob(pattern, recursive=True)
    )

    # Ascending by count; ties keep first-seen order (stable sort over an
    # insertion-ordered mapping), exactly as the hand-rolled dict tally did.
    domain_list = sorted(domains.items(), key=operator.itemgetter(1))
    print(domain_list)

    # Disabled report section, kept as found. It refers to ``data``, ``rows``,
    # ``institutions``, ``institution_names`` and ``no_institutions``, none of
    # which are defined in this function any more.
    # rows.append(data['first_pages'])
    # if data['institutions']:
    #     for institution in data['institutions']:
    #         institutions.append(institution)
    #         institution_names.append(institution[1])
    # if data['no_institutions']:
    #     no_institutions.append(data['no_institutions'])
    # deduped_institutions = dedupe(institution_names)
    # write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    # write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
    # write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
    # write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
    # print("{} deduped institutions".format(len(deduped_institutions)))


def dedupe(a):
    """Return the distinct items of *a* as a sorted list.

    Items must be hashable and mutually orderable.
    """
    return sorted(set(a))


def process_paper(fn):
    """Look up the paper that *fn* belongs to and open the file for scanning.

    The paper id is the third ``/``-separated component of *fn* once PDF_DIR
    has been removed from it. Returns ``None`` when ``load_paper`` finds
    nothing for that id.

    NOTE(review): the per-line loop at the end has no body in the source as
    received, so nothing is done with the lines and the function currently
    always returns ``None``. The locals set up before the loop are kept for
    whatever that body is supposed to do.
    """
    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None
    with open(fn, 'r') as f:
        lines = []
        emails = []
        institutions = []
        # (a[0], a[1], lower-cased a[1]); find_authors matches on the third
        # slot, so a[1] is presumably the author's display name.
        authors = [(a[0], a[1], a[1].lower(),) for a in paper.authors]
        journal = paper.journal.lower()
        found_authors = []
        for line in f.readlines():
            # NOTE(review): loop body missing in the source -- restore it.
            pass


class NameLine(object):
    """A stripped line of text, rendered through ``str()``."""

    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        # NOTE(review): the empty prefix/suffix look like markup that was
        # stripped from the source; reproduced exactly as found.
        return '' + self.s + ''


class BoldLine(object):
    """A stripped line of text, rendered through ``str()``."""

    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        # NOTE(review): the empty prefix/suffix look like markup that was
        # stripped from the source; reproduced exactly as found.
        return '' + self.s + ''


def find_authors(authors, line):
    """Return the first author tuple whose third element occurs in *line*.

    Matching is a plain substring test on ``a[2]``; callers are expected to
    pass *line* in the same case as that element. Returns ``None`` when no
    author matches.
    """
    for a in authors:
        if a[2] in line:
            return a
    return None


def paper_path(paper_id):
    """Return ``DATA_DIR/<first two chars of id>/<id>`` for *paper_id*."""
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    doi_report()