import re
import os
import gzip
import glob
import json
import click
from util import *
PDF_DIR = 'datasets/s2/pdf'
@click.command()
def pdf_report_first_pages():
    """Scan all extracted first-page text files under PDF_DIR and write
    HTML reports (first pages, institutions, papers missing institutions)
    plus a CSV of deduplicated institution names."""
    rows = []
    institution_names = []
    institutions = []
    no_institutions = []
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        data = process_paper(fn)
        if data is None:
            # process_paper returns None when no paper metadata exists
            # for this file; skip it instead of crashing on subscript.
            continue
        rows.append(data['first_pages'])
        if data['institutions']:
            for institution in data['institutions']:
                institutions.append(institution)
                institution_names.append(institution[1])
        if data['no_institutions']:
            no_institutions.append(data['no_institutions'])
    deduped_institutions = dedupe(institution_names)
    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    # Sort institution rows by institution name (index 1 of [paper_id, name, department]).
    write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
    write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
    write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
    print("{} deduped institutions".format(len(deduped_institutions)))
def dedupe(a):
    """Return the unique items of iterable *a*, sorted ascending.

    Equivalent to the original dict-key trick, but idiomatic:
    set() removes duplicates, sorted() orders the result.
    """
    return sorted(set(a))
def process_paper(fn):
    """Parse one extracted-text first page at path *fn*.

    Returns None when no paper metadata is found for the file's paper id,
    otherwise a dict with keys:
      'first_pages'     [paper_id, lines, found_authors, emails]
      'institutions'    list of [paper_id, institution, department] or None
      'no_institutions' [paper_id, lines] when nothing matched, else None
    """
    # Path layout is PDF_DIR/<shard>/<paper_id>/...; index 2 is the paper id.
    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None
    # PDF-extracted text can contain arbitrary bytes; decode tolerantly —
    # non-ASCII is stripped below anyway before institutions are recorded.
    with open(fn, 'r', encoding='utf-8', errors='ignore') as f:
        lines = []
        emails = []
        institutions = []
        # (original name, display name, lowercase name) per author.
        authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
        # Guard: journal may be missing/None — TODO confirm against paper schema.
        journal = (paper.journal or '').lower()
        found_authors = []
        for line in f.readlines():
            l = line.lower()
            # Everything after the abstract heading is body text; stop.
            if 'abstract' in l:
                break
            if len(line) < 3:
                continue
            # Skip the line that just repeats the journal name.
            if journal and journal in l:
                continue
            if '@' in line:
                emails.append(line)
                continue
            # Author name lines are typically comma/"and"-separated.
            names = [s.strip() for s in re.split(',| and ', l)]
            was_found = False
            for name in names:
                found = find_authors(authors, name)
                if found:
                    was_found = True
                    if found[0]:
                        found_authors.append(found)
            if was_found:
                continue
            if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l:
                # Trim leading/trailing punctuation and digits, collapse whitespace.
                inst = re.sub(r'^[\W\d]+', '', line)
                inst = re.sub(r'[\W\d]+$', '', inst)
                inst = re.sub(r'\s+', ' ', inst)
                # BUG FIX: '.' must be escaped, otherwise 'Dept' followed by
                # ANY character (e.g. 'Depts') was rewritten too.
                inst = re.sub(r'Dept\.', 'Department ', inst)
                # Overly long matches are almost certainly not a clean
                # institution line; skip them.
                if len(inst) < 160:
                    inst = inst.replace('&', 'and')
                    inst_parts = []
                    department = ''
                    for inst_part in inst.split(','):
                        inst_part = inst_part.strip()
                        inst_low = inst_part.lower()
                        # Drop titles and article boilerplate.
                        if 'prof' in inst_low:
                            continue
                        if 'article ' in inst_low:
                            continue
                        if 'department' in inst_low:
                            department = inst_part
                        else:
                            inst_parts.append(inst_part)
                    inst = ', '.join(inst_parts)
                    if inst:
                        # Replace non-ASCII characters with spaces.
                        inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip()
                        institutions.append([ paper_id, inst, department ])
                        lines.append(BoldLine(inst))
                continue
            lines.append(line)
    return {
        'first_pages': [
            paper_id,
            lines,
            found_authors,
            emails,
        ],
        'institutions': None if not len(institutions) else institutions,
        'no_institutions': None if len(institutions) else [
            paper_id,
            lines,
        ],
    }
class NameLine(object):
    """A report line holding an author-name string, stored stripped."""

    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        # Rendered as the bare stripped text.
        return self.s
class BoldLine(object):
    """A report line marked for emphasis (institution text), stored stripped."""

    def __init__(self, s):
        self.s = s.strip()

    def __str__(self):
        # Rendered as the bare stripped text.
        return self.s
def find_authors(authors, line):
    """Return the first author tuple whose lowercase name (index 2) occurs
    as a substring of *line*, or None when no author matches."""
    return next((author for author in authors if author[2] in line), None)
def paper_path(paper_id):
    """Build the on-disk path for *paper_id* under DATA_DIR, sharded by
    the id's first two characters: DATA_DIR/<id[:2]>/<id>."""
    shard = paper_id[0:2]
    return '/'.join([DATA_DIR, shard, paper_id])
if __name__ == '__main__':
    # Script entry point: invoke the click command that builds all reports.
    pdf_report_first_pages()