import re
import os
import gzip
import glob
import json
import click
from util import *
PDF_DIR = 'datasets/s2/pdf'
@click.command()
def pdf_report_first_pages():
    """Scan all extracted first-page text files under PDF_DIR and write
    HTML reports (first pages, institutions, papers missing institutions)
    plus a CSV of deduplicated institution names."""
    rows = []
    institution_names = []
    institutions = []
    no_institutions = []
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        data = process_paper(fn)
        if data is None:
            # process_paper returns None when no paper metadata exists
            # for this file; skip it instead of crashing on subscript.
            continue
        rows.append(data['first_pages'])
        if data['institutions']:
            for institution in data['institutions']:
                institutions.append(institution)
                institution_names.append(institution[1])
        if data['no_institutions']:
            no_institutions.append(data['no_institutions'])
    deduped_institutions = dedupe(institution_names)
    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    # Sort institution rows by institution name (index 1 of [paper_id, name, department]).
    write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
    write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
    write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
    print("{} deduped institutions".format(len(deduped_institutions)))
def dedupe(a):
    """Return the unique items of iterable *a*, sorted ascending.

    Equivalent to the original dict-key trick, but idiomatic:
    set() removes duplicates, sorted() orders the result.
    """
    return sorted(set(a))
def process_paper(fn):
    """Parse one extracted-text first page at path *fn*.

    Returns None when no paper metadata is found for the file's paper id,
    otherwise a dict with keys:
      'first_pages'     [paper_id, lines, found_authors, emails]
      'institutions'    list of [paper_id, institution, department] or None
      'no_institutions' [paper_id, lines] when nothing matched, else None
    """
    # Path layout is PDF_DIR/<shard>/<paper_id>/...; index 2 is the paper id.
    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None
    # PDF-extracted text can contain arbitrary bytes; decode tolerantly —
    # non-ASCII is stripped below anyway before institutions are recorded.
    with open(fn, 'r', encoding='utf-8', errors='ignore') as f:
        lines = []
        emails = []
        institutions = []
        # (original name, display name, lowercase name) per author.
        authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
        # Guard: journal may be missing/None — TODO confirm against paper schema.
        journal = (paper.journal or '').lower()
        found_authors = []
        for line in f.readlines():
            l = line.lower()
            # Everything after the abstract heading is body text; stop.
            if 'abstract' in l:
                break
            if len(line) < 3:
                continue
            # Skip the line that just repeats the journal name.
            if journal and journal in l:
                continue
            if '@' in line:
                emails.append(line)
                continue
            # Author name lines are typically comma/"and"-separated.
            names = [s.strip() for s in re.split(',| and ', l)]
            was_found = False
            for name in names:
                found = find_authors(authors, name)
                if found:
                    was_found = True
                    if found[0]:
                        found_authors.append(found)
            if was_found:
                continue
            if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l:
                # Trim leading/trailing punctuation and digits, collapse whitespace.
                inst = re.sub(r'^[\W\d]+', '', line)
                inst = re.sub(r'[\W\d]+$', '', inst)
                inst = re.sub(r'\s+', ' ', inst)
                # BUG FIX: '.' must be escaped, otherwise 'Dept' followed by
                # ANY character (e.g. 'Depts') was rewritten too.
                inst = re.sub(r'Dept\.', 'Department ', inst)
                # Overly long matches are almost certainly not a clean
                # institution line; skip them.
                if len(inst) < 160:
                    inst = inst.replace('&', 'and')
                    inst_parts = []
                    department = ''
                    for inst_part in inst.split(','):
                        inst_part = inst_part.strip()
                        inst_low = inst_part.lower()
                        # Drop titles and article boilerplate.
                        if 'prof' in inst_low:
                            continue
                        if 'article ' in inst_low:
                            continue
                        if 'department' in inst_low:
                            department = inst_part
                        else:
                            inst_parts.append(inst_part)
                    inst = ', '.join(inst_parts)
                    if inst:
                        # Replace non-ASCII characters with spaces.
                        inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip()
                        institutions.append([ paper_id, inst, department ])
                        lines.append(BoldLine(inst))
                continue
            lines.append(line)
    return {
        'first_pages': [
            paper_id,
            lines,
            found_authors,
            emails,
        ],
        'institutions': None if not len(institutions) else institutions,
        'no_institutions': None if len(institutions) else [
            paper_id,
            lines,
        ],
    }
class NameLine(object):
    """A report line holding an author-name string, stored stripped."""

    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        # Rendered as the bare stripped text.
        return self.s
class BoldLine(object):
    """A report line marked for emphasis (institution text), stored stripped."""

    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        # Rendered as the bare stripped text.
        return self.s
def find_authors(authors, line):
    """Return the first author tuple whose lowercase name (index 2) occurs
    as a substring of *line*, or None when no author matches."""
    return next((author for author in authors if author[2] in line), None)
def paper_path(paper_id):
    """Build the on-disk path for *paper_id* under DATA_DIR, sharded by
    the id's first two characters: DATA_DIR/<id[:2]>/<id>."""
    shard = paper_id[0:2]
    return '/'.join([DATA_DIR, shard, paper_id])
if __name__ == '__main__':
    # Script entry point: invoke the click command that builds all reports.
    pdf_report_first_pages()