diff options
Diffstat (limited to 'scraper/s2-pdf-first-pages.py')
| -rw-r--r-- | scraper/s2-pdf-first-pages.py | 133 |
1 file changed, 133 insertions, 0 deletions
import re
import os
import gzip
import glob
import json
import click
import math
import string
from util import *

# Root directory holding the extracted first-page text files, laid out as
# <PDF_DIR>/<first two chars of id>/<paper_id>/*.txt (see paper_path()).
PDF_DIR = 'datasets/s2/pdf'

# A header line is treated as an institution when it contains any of these
# (matched against the lowercased line).
INSTITUTION_MARKERS = ('university', 'universiteit', 'research center',
                       'research lab', 'college', ', inc', 'institute')

@click.command()
def report_first_pages():
    """Scan every extracted first-page text file under PDF_DIR, classify the
    header lines (authors / emails / institutions), and write the HTML/CSV
    reports plus a deduplicated institution-name list."""
    rows = []
    institution_names = []
    institutions = []
    no_institutions = []
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        data = process_paper(fn)
        # process_paper returns None when no paper metadata exists for the
        # file; previously this crashed below on data['first_pages'].
        if data is None:
            continue
        rows.append(data['first_pages'])
        if data['institutions']:
            for institution in data['institutions']:
                institutions.append(institution)
                institution_names.append(institution[1])
        if data['no_institutions']:
            no_institutions.append(data['no_institutions'])
    deduped_institutions = dedupe(institution_names)

    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    write_report('reports/institutions.html', title='Institutions', keys=None,
                 rows=sorted(institutions, key=lambda x: x[1]))
    write_report('reports/institutions_missing.html', title='Institutions', keys=None,
                 rows=no_institutions)
    write_csv('reports/institution_names.csv', keys=None,
              rows=[(name,) for name in deduped_institutions])
    print("{} deduped institutions".format(len(deduped_institutions)))

def dedupe(a):
    """Return the unique items of *a* as a sorted list."""
    return sorted(set(a))

def _clean_institution(line):
    """Normalise a raw institution line.

    Strips leading/trailing non-word junk (footnote digits, bullets),
    collapses whitespace, expands 'Dept.' and drops professor-title /
    boilerplate parts.  Returns (institution, department) or None when the
    line is rejected (too long, or nothing usable left).
    """
    inst = re.sub(r'^[\W\d]+', '', line)    # leading footnote marks / digits
    inst = re.sub(r'[\W\d]+$', '', inst)    # trailing junk
    inst = re.sub(r'\s+', ' ', inst)        # collapse runs of whitespace
    # Escape the dot: the original r'Dept.' also matched e.g. 'Depta', and
    # its 'Department ' replacement produced a double space before 'of'.
    inst = re.sub(r'Dept\.\s*', 'Department ', inst)
    if len(inst) >= 160:
        return None                          # implausibly long — reject
    inst = inst.replace('&', 'and')
    inst_parts = []
    department = ''
    for part in inst.split(','):
        part = part.strip()
        low = part.lower()
        # Drop professor titles and 'article ...' boilerplate fragments.
        if 'prof' in low or 'article ' in low:
            continue
        if 'department' in low:
            department = part                # keep the department separately
        else:
            inst_parts.append(part)
    inst = ', '.join(inst_parts)
    if not inst:
        return None
    # Crude transliteration: replace non-ASCII characters with spaces.
    inst = ''.join([c if ord(c) < 128 else ' ' for c in inst]).strip()
    return inst, department

def process_paper(fn):
    """Parse one first-page text file.

    Reads header lines up to the abstract, classifying each as an email,
    a known-author line, or an institution line.  Writes institutions.json
    next to the paper and returns a dict of report rows, or None when no
    paper metadata exists for *fn*.
    """
    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None
    lines = []
    emails = []
    institutions = []
    found_authors = []
    # (first?, name, lowercased name) per author; matching is lowercase.
    authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
    # Guard: paper.journal may be empty/None — .lower() would crash.
    journal = (paper.journal or '').lower()
    with open(fn, 'r') as f:
        for line in f:
            l = line.lower()
            if 'abstract' in l:
                break                        # only the pre-abstract header matters
            if len(line) < 3:
                continue                     # blank / near-blank line
            if journal and journal in l:
                continue                     # running journal title
            if '@' in line:
                emails.append(line)
                continue
            # Author lines: split on commas / ' and ' and match known authors.
            names = [s.strip() for s in re.split(',| and ', l)]
            was_found = False
            for name in names:
                found = find_authors(authors, name)
                if found:
                    was_found = True
                    if found[0]:
                        found_authors.append(found)
            if was_found:
                # lines.append(NameLine(line))
                continue
            if any(marker in l for marker in INSTITUTION_MARKERS):
                cleaned = _clean_institution(line)
                if cleaned is not None:
                    inst, department = cleaned
                    institutions.append([paper_id, inst, department])
                    lines.append(BoldLine(inst))
                    continue
            lines.append(line)
    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'),
               {'institutions': institutions})
    return {
        'first_pages': [
            paper_id,
            lines,
            found_authors,
            emails,
        ],
        'institutions': institutions if institutions else None,
        'no_institutions': None if institutions else [
            paper_id,
            lines,
        ],
    }

def find_authors(authors, line):
    """Return the first author tuple whose lowercased name (index 2) occurs
    as a substring of *line*, or None when no author matches."""
    for a in authors:
        if a[2] in line:
            return a
    return None

def paper_path(paper_id):
    """Directory for a paper, sharded by the first two id characters."""
    return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)

if __name__ == '__main__':
    report_first_pages()
