import re
import os
import gzip
import glob
import json
import click
import math
import string

from util import *

# Root directory holding per-paper extracted PDF text, sharded as
# PDF_DIR/<first two id chars>/<paper_id>/*.txt (see paper_path()).
PDF_DIR = 'datasets/s2/pdf'


@click.group()
def s2_pdf_report():
    """Reports over the first pages of PDF-extracted paper text."""
    pass


def _paper_id_from_path(fn):
    """Derive the paper id from a text-file path under PDF_DIR.

    Paths look like PDF_DIR/<shard>/<paper_id>/<file>.txt, so after
    stripping the prefix the id is the third '/'-separated component.
    """
    return fn.replace(PDF_DIR, '').split('/')[2]


@s2_pdf_report.command()
def report_geocoded_papers():
    """Match paper heading lines against the institution address book.

    Walks every extracted .txt under PDF_DIR, reads its pre-abstract
    headings, and looks each one up in the AddressBook. Writes four CSV
    reports (empty / no-separator / geocoded / unknown papers) and prints
    summary percentages.
    """
    empty_papers = []
    no_separator_papers = []
    geocoded_papers = []
    unknown_papers = []
    found_count = 0
    total_count = 0
    addresses = AddressBook()
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        paper_id = _paper_id_from_path(fn)
        paper = load_paper(paper_id)
        if paper is None:
            # Consistent with process_paper(): text with no metadata is skipped.
            print("{} no paper found!".format(paper_id))
            continue
        total_count += 1
        headings, found_abstract = read_headings(fn, paper)
        heading_string = '\n'.join(headings[0:20])
        found_addresses = []
        if not found_abstract:
            if len(headings) == 0:
                # No usable text at all.
                empty_papers.append(paper.record())
                continue
            if len(headings) > 20:
                # Long run with no 'abstract' separator: probably a scan of
                # the whole page; still attempt matching below.
                no_separator_papers.append(paper.record())
        for heading in headings:
            address = addresses.find(heading.lower().strip())
            if address:
                found_addresses.append(address)
        if found_addresses:
            found_count += 1
            for address in found_addresses:
                geocoded_papers.append([paper.paper_id, paper.title] + address)
        else:
            unknown_papers.append([paper.paper_id, paper.title, heading_string])
    write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
    write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
    write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)
    write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers)
    print("{} {} ({}%)".format('empty', len(empty_papers), percent(len(empty_papers), total_count)))
    print("{} {} ({}%)".format('no separator', len(no_separator_papers), percent(len(no_separator_papers), total_count)))
    print("{} {} ({}%)".format('found', found_count, percent(found_count, total_count)))
    print("{} {} ({}%)".format('unknown', len(unknown_papers), percent(len(unknown_papers), total_count)))
    print("{} {} entities".format('geocoded', len(geocoded_papers)))


def percent(a, b):
    """Integer percentage a/b; returns 0 when b is 0 (empty dataset)."""
    if b == 0:
        return 0
    return round(100 * a / b)


def read_headings(fn, paper):
    """Read candidate heading lines from an extracted-text file.

    Returns (headings, found_abstract): the cleaned lines that precede the
    word 'abstract', and whether that separator was seen at all. Email
    addresses, footnote markers, ligatures, journal-name lines, and lines
    matching known author names are stripped or skipped.
    """
    headings = []
    found_abstract = False
    found_authors = []
    journal = paper.journal.lower()
    # (id, name, lowercased name) per author, for find_authors().
    authors = [(a[0], a[1], a[1].lower(),) for a in paper.authors]
    with open(fn, 'r', encoding='utf-8') as f:
        for line in f:
            # Drop email addresses anywhere in the line.
            line = re.sub(r"\S*@\S*\s?", '', line)
            l = line.lower().strip()
            if len(l) < 5:
                continue
            # Leading footnote marker (affiliation letter/number) — drop it.
            if line[0] in ('a', 'b', 'c', '1', '2', '3', '4'):
                line = line[1:]
            line = line.strip("∗†‡")
            # Undo PDF ligature glyphs.
            line = line.replace("fl", "fl").replace('ff', 'ff').replace('ffi', 'f‌f‌i').replace('ffl', 'f‌f‌l')
            line = line.strip()
            if 'abstract' in l:
                found_abstract = True
                break
            if journal and journal in l:
                continue
            names = [s.strip() for s in re.split(',| and ', l)]
            was_found = False
            for name in names:
                found = find_authors(authors, name)
                if found:
                    was_found = True
                    if found[0]:
                        found_authors.append(found)
            if was_found:
                # Author-name line, not a heading (same rule as process_paper).
                continue
            headings.append(line.strip())
    return headings, found_abstract


class AddressBook(object):
    """Lookup from normalized institution name to its CSV row."""

    def __init__(self):
        lookup = {}
        data = read_csv('reports/all_institutions_sorted.csv', keys=None)
        # Column 1 is the institution name; map its normalized form to the
        # row index so find() can return the full row.
        for index, line in enumerate(data):
            lookup[line[1].lower().strip()] = index
        self.data = data
        self.lookup = lookup

    def find(self, address):
        """Return the matching institution row for an address line, or None.

        Tries the whole normalized string first, then each comma-separated
        part of it.
        """
        address = address.lower().strip().strip(string.digits)
        if address in self.lookup:
            return self.data[self.lookup[address]]
        for part in address.split(','):
            part = part.strip().replace(' ', ' ')
            if part in self.lookup:
                return self.data[self.lookup[part]]
        return None


@s2_pdf_report.command()
def report_first_pages():
    """Render HTML/CSV reports of first pages and detected institutions."""
    rows = []
    institution_names = []
    institutions = []
    no_institutions = []
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        data = process_paper(fn)
        if data is None:
            # process_paper() returns None when no paper metadata exists.
            continue
        rows.append(data['first_pages'])
        if data['institutions']:
            for institution in data['institutions']:
                institutions.append(institution)
                institution_names.append(institution[1])
        if data['no_institutions']:
            no_institutions.append(data['no_institutions'])
    deduped_institutions = dedupe(institution_names)
    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
    write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
    write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
    print("{} deduped institutions".format(len(deduped_institutions)))


def dedupe(a):
    """Return the unique values of a, sorted."""
    return sorted(set(a))


def process_paper(fn):
    """Parse one extracted-text file into first-page lines and institutions.

    Returns a dict with 'first_pages', 'institutions' (or None), and
    'no_institutions' (or None), and writes institutions.json next to the
    paper's files. Returns None when no paper metadata can be loaded.
    """
    paper_id = _paper_id_from_path(fn)
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None
    lines = []
    emails = []
    institutions = []
    # (id, name, lowercased name) per author, for find_authors().
    authors = [(a[0], a[1], a[1].lower(),) for a in paper.authors]
    journal = paper.journal.lower()
    found_authors = []
    with open(fn, 'r', encoding='utf-8') as f:
        for line in f:
            l = line.lower()
            if 'abstract' in l:
                # Everything after the abstract marker is body text.
                break
            if len(line) < 3:
                continue
            if journal and journal in l:
                continue
            if '@' in line:
                emails.append(line)
                continue
            names = [s.strip() for s in re.split(',| and ', l)]
            was_found = False
            for name in names:
                found = find_authors(authors, name)
                if found:
                    was_found = True
                    if found[0]:
                        found_authors.append(found)
            if was_found:
                # Author-name line; recorded above, not kept as text.
                continue
            if ('university' in l or 'universiteit' in l or 'research center' in l
                    or 'research lab' in l or 'college' in l or ', inc' in l
                    or 'institute' in l):
                # Strip leading/trailing punctuation and digits, collapse
                # whitespace, expand the 'Dept.' abbreviation (dot escaped so
                # e.g. 'Depth' is not mangled).
                inst = re.sub(r'^[\W\d]+', '', line)
                inst = re.sub(r'[\W\d]+$', '', inst)
                inst = re.sub(r'\s+', ' ', inst)
                inst = re.sub(r'Dept\.', 'Department ', inst)
                if len(inst) < 160:
                    inst = inst.replace('&', 'and')
                    inst_parts = []
                    department = ''
                    for inst_part in inst.split(','):
                        inst_part = inst_part.strip()
                        inst_low = inst_part.lower()
                        if 'prof' in inst_low:
                            continue
                        if 'article ' in inst_low:
                            continue
                        if 'department' in inst_low:
                            department = inst_part
                        else:
                            inst_parts.append(inst_part)
                    inst = ', '.join(inst_parts)
                    if inst:
                        # Replace non-ASCII with spaces for the reports.
                        inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip()
                        institutions.append([paper_id, inst, department])
                        lines.append(BoldLine(inst))
                        continue
            lines.append(line)
    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), {
        'institutions': institutions,
    })
    return {
        'first_pages': [
            paper_id,
            lines,
            found_authors,
            emails,
        ],
        'institutions': None if not len(institutions) else institutions,
        'no_institutions': None if len(institutions) else [
            paper_id,
            lines,
        ],
    }


def find_authors(authors, line):
    """Return the first author tuple whose lowercased name occurs in line, else None."""
    for a in authors:
        if a[2] in line:
            return a
    return None


def paper_path(paper_id):
    """Directory for a paper's files, sharded by the first two id characters."""
    return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    s2_pdf_report()