"""Report first-page lines, authors, emails and institutions of papers.

Walks the ``*.txt`` files under ``PDF_DIR``, scans each one up to the first
line mentioning "abstract", and hands the collected rows to ``write_report``
(provided by ``util``) to produce two reports.
"""
import re
import os
import gzip
import glob
import json

import click

from util import *

PDF_DIR = 'datasets/s2/pdf'

# Separators used to break a line into candidate author names.
_NAME_SPLIT_RE = re.compile(r',| and ')

# Lowercase substrings that mark a line as naming an institution.
_INSTITUTION_KEYWORDS = ('university', 'universiteit', 'research center')


@click.command()
def pdf_report_first_pages():
    """Write the first-pages and institutions reports for every text file."""
    rows = []
    institution_rows = []
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        result = process_paper(fn)
        if result is None:
            # process_paper already printed why this file was skipped.
            continue
        row, institutions = result
        print(row)
        rows.append(row)
        # Collect one institutions row per paper instead of keeping only the
        # row of whichever paper happened to be processed last.
        institution_rows.append(institutions)
    write_report('reports/first_pages.html', title='First pages',
                 keys=None, rows=rows)
    write_report('reports/institutions.html', title='Institutions',
                 keys=None, rows=institution_rows)
    print("Wrote {} rows".format(len(rows)))


def process_paper(fn):
    """Scan one text file and classify the lines that precede its abstract.

    The paper id is the third ``/``-separated component of ``fn`` once
    ``PDF_DIR`` has been removed, i.e. ``PDF_DIR/<dir>/<paper_id>/<file>``.

    Lines are handled in order, first match wins:

    * a line containing "abstract" (any case) ends the scan;
    * lines shorter than 3 characters are dropped;
    * lines containing the paper's journal name are dropped;
    * lines containing ``@`` are collected as emails;
    * lines containing a known author name are dropped and the matching
      author tuples recorded;
    * lines containing an institution keyword are collected as institutions
      and kept, wrapped in ``BoldLine``;
    * everything else is kept unchanged.

    Returns ``None`` (after printing a message) when ``load_paper`` finds no
    paper for the id.  Otherwise returns a pair of rows:
    ``[paper_id, lines, found_authors, emails]`` and
    ``[paper_id, sorted(institutions)]``.
    """
    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None

    # Each author becomes (a[0], a[1], lowercased a[1]); a[1] is what gets
    # matched against the text. Presumably a[0] is an author id -- TODO
    # confirm against util.load_paper.
    authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
    # A missing journal is treated like an empty one rather than crashing.
    journal = (paper.journal or '').lower()

    lines = []
    emails = []
    institutions = []
    found_authors = []
    with open(fn, 'r') as f:
        for line in f:
            lowered = line.lower()
            if 'abstract' in lowered:
                break
            if len(line) < 3:
                continue
            if journal and journal in lowered:
                continue
            if '@' in line:
                emails.append(line)
                continue

            matches = []
            for name in _NAME_SPLIT_RE.split(lowered):
                found = find_authors(authors, name.strip())
                if found:
                    matches.append(found)
            if matches:
                # Author lines are left out of `lines`; only authors whose
                # first field is truthy are recorded.
                found_authors.extend(m for m in matches if m[0])
                continue

            if any(keyword in lowered for keyword in _INSTITUTION_KEYWORDS):
                institutions.append(line)
                lines.append(BoldLine(line))
                continue

            lines.append(line)

    return [
        paper_id,
        lines,
        found_authors,
        emails,
    ], [
        paper_id,
        sorted(institutions),
    ]


class NameLine(object):
    """Wrapper for a stripped line that holds an author name."""

    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        # NOTE(review): both wrappers are empty strings here, so this returns
        # the stripped text unchanged -- confirm whether markup was intended.
        return '' + self.s + ''


class BoldLine(object):
    """Wrapper for a stripped line that should stand out in the report."""

    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        # NOTE(review): both wrappers are empty strings here, so this returns
        # the stripped text unchanged -- confirm whether markup was intended.
        return '' + self.s + ''


def find_authors(authors, line):
    """Return the first author tuple whose lowercase name occurs in ``line``.

    ``authors`` holds tuples whose third item is the lowercased name. Authors
    with an empty name are skipped, because an empty string is a substring of
    every line and would match everything. Returns ``None`` when nothing
    matches.
    """
    for a in authors:
        if a[2] and a[2] in line:
            return a
    return None


def paper_path(paper_id):
    """Return ``DATA_DIR/<first two chars of paper_id>/<paper_id>``."""
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    pdf_report_first_pages()