From aacdf0fa056b51000ff88479da479ded3f36b59c Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Tue, 6 Nov 2018 15:05:40 +0100 Subject: we geocoding --- s2-pdf-report.py | 79 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 17 deletions(-) (limited to 's2-pdf-report.py') diff --git a/s2-pdf-report.py b/s2-pdf-report.py index 7c89381f..7977660f 100644 --- a/s2-pdf-report.py +++ b/s2-pdf-report.py @@ -11,13 +11,32 @@ PDF_DIR = 'datasets/s2/pdf' @click.command() def pdf_report_first_pages(): rows = [] + institution_names = [] + institutions = [] + no_institutions = [] for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True): - row, institutions = process_paper(fn) - print(row) - rows.append(row) + data = process_paper(fn) + rows.append(data['first_pages']) + if data['institutions']: + for institution in data['institutions']: + institutions.append(institution) + institution_names.append(institution[1]) + if data['no_institutions']: + no_institutions.append(data['no_institutions']) + deduped_institutions = dedupe(institution_names) + write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows) - write_report('reports/institutions.html', title='Institutions', keys=None, rows=institutions) - print("Wrote {} rows".format(len(rows))) + write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1])) + write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions) + write_csv('reports/institution_names.txt', keys=None, rows=[(name,) for name in deduped_institutions]) + print("{} deduped institutions".format(len(deduped_institutions))) + +def dedupe(a): + p = {} + for s in a: + p[s] = None + ss = sorted(p.keys()) + return ss def process_paper(fn): paper_id = fn.replace(PDF_DIR, '').split('/')[2] @@ -56,20 +75,46 @@ def process_paper(fn): if was_found: # lines.append(NameLine(line)) continue - if 'university' in l or 'universiteit' in l or 'research center' in l: - institutions.append(line) - lines.append(BoldLine(line)) + if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l: + inst = re.sub(r'^[\W\d]+', '', line) + inst = re.sub(r'[\W\d]+$', '', inst) + inst = re.sub(r'\s+', ' ', inst) + inst = re.sub(r'Dept.', 'Department ', inst) + if len(inst) < 160: + inst = inst.replace('&', 'and') + inst_parts = [] + department = '' + for inst_part in inst.split(','): + inst_part = inst_part.strip() + inst_low = inst_part.lower() + if 'prof' in inst_low: + continue + if 'article ' in inst_low: + continue + if 'department' in inst_low: + department = inst_part + else: + inst_parts.append(inst_part) + inst = ', '.join(inst_parts) + if inst: + inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip() + institutions.append([ paper_id, inst, department ]) + lines.append(BoldLine(inst)) continue lines.append(line) - return [ - paper_id, - lines, - found_authors, - emails, - ], [ - paper_id, - sorted(institutions), - ] + return { + 'first_pages': [ + paper_id, + lines, + found_authors, + emails, + ], + 'institutions': None if not len(institutions) else institutions, + 'no_institutions': None if len(institutions) else [ + paper_id, + lines, + ], + } class NameLine(object): def __init__(self, s): -- cgit v1.2.3-70-g09d2