import re
import os
import gzip
import glob
import json
import click
import operator
from util import *
# Relative path (resolved against the current working directory) of the DOI
# data directory.
DOI_DIR = 'datasets/s2/doi'
@click.command()
def doi_report():
    """Print how many ``*.url`` files were found for each domain.

    Recursively walks ``DOI_DIR`` for files ending in ``.url``, loads each one
    with ``read_json`` and tallies the value stored under its ``'domain'`` key.
    The tally is printed as a list of ``(domain, count)`` tuples ordered by
    ascending count.

    Raises:
        KeyError: if a loaded file has no ``'domain'`` key.
    """
    domains = {}
    # Scan DOI_DIR, the data root declared at the top of this module; PDF_DIR
    # (referenced here previously) is not defined anywhere in this file.
    pattern = '{}/**/*.url'.format(DOI_DIR)
    for fn in glob.iglob(pattern, recursive=True):
        url_info = read_json(fn)
        domain = url_info['domain']
        domains[domain] = domains.get(domain, 0) + 1

    # itemgetter(1) sorts on the count, least frequent domain first.
    domain_list = sorted(domains.items(), key=operator.itemgetter(1))
    print(domain_list)
    # TODO: the first-page / institution reports (write_report, write_csv,
    # dedupe) are not generated by this command.
def dedupe(a):
    """Return the distinct items of *a* as a new list in ascending order.

    Items must be hashable and mutually comparable.
    """
    unique_items = set(a)
    return sorted(unique_items)
# Collect per-paper values for the file `fn`: the record returned by
# load_paper(), a list of author tuples and the lower-cased journal name.
# Prints a message and returns None when load_paper() returns None.
# NOTE(review): PDF_DIR and load_paper are not defined in this file;
# presumably they come from `from util import *` -- confirm.
def process_paper(fn):
# The paper id is element 2 of the '/'-split of `fn` with PDF_DIR removed.
# assumes a layout like PDF_DIR/<prefix>/<paper_id>/... -- TODO confirm
paper_id = fn.replace(PDF_DIR, '').split('/')[2]
paper = load_paper(paper_id)
if paper is None:
print("{} no paper found!".format(paper_id))
return None
with open(fn, 'r') as f:
lines = []
emails = []
institutions = []
# Each author becomes (a[0], a[1], a[1] lower-cased); find_authors() in this
# file matches on the element at index 2.
authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
journal = paper.journal.lower()
found_authors = []
# NOTE(review): no loop body is visible after this line -- class definitions
# follow it directly. The per-line processing appears to be missing; confirm
# against the original file before relying on this function.
for line in f.readlines():
class NameLine(object):
    """Hold one line of text with its surrounding whitespace removed."""

    def __init__(self, s):
        trimmed = s.strip()
        self.s = trimmed

    def __str__(self):
        # Both wrappers are empty, so the stored text is rendered unchanged.
        prefix = ''
        suffix = ''
        return prefix + self.s + suffix
class BoldLine(object):
    """Hold one line of text with its surrounding whitespace removed."""

    def __init__(self, s):
        trimmed = s.strip()
        self.s = trimmed

    def __str__(self):
        # Both wrappers are empty, so the stored text is rendered unchanged.
        prefix = ''
        suffix = ''
        return prefix + self.s + suffix
def find_authors(authors, line):
    """Return the first entry of *authors* whose index-2 element occurs in *line*.

    Entries are checked in order; ``None`` is returned when none matches.
    """
    matches = (author for author in authors if author[2] in line)
    return next(matches, None)
def paper_path(paper_id):
    """Build ``DATA_DIR/<first two characters of paper_id>/<paper_id>``."""
    prefix = paper_id[:2]
    template = '{}/{}/{}'
    return template.format(DATA_DIR, prefix, paper_id)
# Script entry point: invoke the click command when run directly.
if __name__ == "__main__":
    doi_report()