| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-06 23:01:37 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-06 23:01:37 +0100 |
| commit | f56705020fa22527ee3494793a3817dffd25fa87 | |
| tree | 17679e6b4e66d3b109a0e4345b800577fbe4a837 | |
| parent | aa3db56d3c665a3a3139180dc11dc765a2a08520 | |
starting doi report while pdfs process
| -rw-r--r-- | s2-doi-report.py | 87 |
1 file changed, 87 insertions, 0 deletions
diff --git a/s2-doi-report.py b/s2-doi-report.py
new file mode 100644
index 00000000..e322b531
--- /dev/null
+++ b/s2-doi-report.py
@@ -0,0 +1,87 @@
+import re
+import os
+import gzip
+import glob
+import json
+import click
+import operator
+from util import *
+
+DOI_DIR = 'datasets/s2/doi'
+
+@click.command()
+def doi_report():
+    rows = []
+    domains = {}
+    institution_names = []
+    institutions = []
+    no_institutions = []
+    for fn in glob.iglob('{}/**/*.url'.format(PDF_DIR), recursive=True):
+        url_info = read_json(fn)
+        domain = url_info['domain']
+        if domain in domains:
+            domains[domain] += 1
+        else:
+            domains[domain] = 1
+    domain_list = sorted(domains.items(), key=operator.itemgetter(1))
+    print(domain_list)
+    # rows.append(data['first_pages'])
+    # if data['institutions']:
+    #     for institution in data['institutions']:
+    #         institutions.append(institution)
+    #         institution_names.append(institution[1])
+    # if data['no_institutions']:
+    #     no_institutions.append(data['no_institutions'])
+    # deduped_institutions = dedupe(institution_names)
+
+    # write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
+    # write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
+    # write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
+    # write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
+    # print("{} deduped institutions".format(len(deduped_institutions)))
+
+def dedupe(a):
+    p = {}
+    for s in a:
+        p[s] = None
+    ss = sorted(p.keys())
+    return ss
+
+def process_paper(fn):
+    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
+    paper = load_paper(paper_id)
+    if paper is None:
+        print("{} no paper found!".format(paper_id))
+        return None
+    with open(fn, 'r') as f:
+        lines = []
+        emails = []
+        institutions = []
+        authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
+        journal = paper.journal.lower()
+        found_authors = []
+        for line in f.readlines():
+            pass  # TODO: loop body not yet written; would match authors/emails per line
+
+class NameLine(object):
+    def __init__(self, s):
+        self.s = s.strip()
+    def __str__(self):
+        return '<span class="name">' + self.s + '</span>'
+
+class BoldLine(object):
+    def __init__(self, s):
+        self.s = s.strip()
+    def __str__(self):
+        return '<b>' + self.s + '</b>'
+
+def find_authors(authors, line):
+    for a in authors:
+        if a[2] in line:
+            return a
+    return None
+
+def paper_path(paper_id):
+    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
+
+if __name__ == '__main__':
+    doi_report()
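
For context, a minimal sketch (not part of the commit) of what the domain tally in `doi_report()` produces, assuming each `.url` file holds JSON with a `domain` field as the script expects; the sample domains below are hypothetical:

```python
import operator

# Hypothetical domains, standing in for the values read from .url JSON files.
sample_domains = ["arxiv.org", "nature.com", "arxiv.org",
                  "sciencedirect.com", "arxiv.org"]

domains = {}
for domain in sample_domains:
    if domain in domains:
        domains[domain] += 1
    else:
        domains[domain] = 1

# Same ascending-by-count sort the script uses, so the most
# frequent domain prints last.
domain_list = sorted(domains.items(), key=operator.itemgetter(1))
print(domain_list)
# [('nature.com', 1), ('sciencedirect.com', 1), ('arxiv.org', 3)]
```

Note that the `dedupe()` helper in the commit is equivalent to `sorted(set(names))` for strings: inserting each name as a dict key drops duplicates, and sorting the keys yields a stable ordering.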
