1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
import re
import os
import gzip
import glob
import simplejson as json
import click
import math
import string
from util import *
# Root directory of extracted PDF text, sharded as <PDF_DIR>/<id[:2]>/<paper_id>/.
PDF_DIR = 'datasets/s2/pdf'
@click.command()
def report_first_pages():
    """Scan every extracted first-page ``*.txt`` under ``PDF_DIR`` and emit
    HTML reports of first pages, detected institutions, and papers with no
    detected institution, plus a CSV of deduplicated institution names.
    """
    rows = []
    institution_names = []
    institutions = []
    no_institutions = []
    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
        data = process_paper(fn)
        if data is None:
            # process_paper returns None when no paper metadata exists for
            # this file; skip it instead of crashing on data['first_pages'].
            continue
        rows.append(data['first_pages'])
        if data['institutions']:
            for institution in data['institutions']:
                institutions.append(institution)
                # institution is [paper_id, name, department]
                institution_names.append(institution[1])
        if data['no_institutions']:
            no_institutions.append(data['no_institutions'])
    deduped_institutions = dedupe(institution_names)
    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    # Sort institutions report by institution name (second column).
    write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
    write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
    write_csv('reports/institution_names_extracted.csv', keys=None, rows=[(name,) for name in deduped_institutions])
    print("{} deduped institutions".format(len(deduped_institutions)))
def dedupe(a):
    """Return the distinct values of *a* in ascending sorted order."""
    return sorted(set(a))
def _clean_institution(line):
    """Normalize one affiliation line; return ``(institution, department)``.

    Strips leading/trailing punctuation and digits, collapses whitespace,
    expands "Dept." to "Department", drops honorific/metadata parts, and
    replaces non-ASCII characters (PDF extraction artifacts) with spaces.
    Returns ``('', '')`` when the normalized line is 160+ chars (too long
    to plausibly be an affiliation).
    """
    inst = re.sub(r'^[\W\d]+', '', line)
    inst = re.sub(r'[\W\d]+$', '', inst)
    inst = re.sub(r'\s+', ' ', inst)
    # Bug fix: the original pattern r'Dept.' left the '.' unescaped (it
    # matched "Dept" plus ANY character, clobbering e.g. "Depts"/"Depth")
    # and the replacement carried a trailing space, producing
    # "Department  of". Escape the dot and drop the extra space.
    inst = re.sub(r'Dept\.', 'Department', inst)
    if len(inst) >= 160:
        return '', ''
    inst = inst.replace('&', 'and')
    kept_parts = []
    department = ''
    for part in inst.split(','):
        part = part.strip()
        low = part.lower()
        # Skip honorifics ("Prof...") and article/metadata fragments.
        if 'prof' in low or 'article ' in low:
            continue
        if 'department' in low:
            department = part
        else:
            kept_parts.append(part)
    inst = ', '.join(kept_parts)
    # Replace non-ASCII bytes left over from PDF text extraction.
    inst = ''.join(c if ord(c) < 128 else ' ' for c in inst).strip()
    return inst, department


def process_paper(fn):
    """Parse one extracted first-page text file.

    Reads lines up to the abstract, classifying each as an e-mail, an
    author-name line, an institution line, or plain text.

    Side effect: writes ``institutions.json`` into the paper's directory.

    Returns a dict with keys ``first_pages``, ``institutions`` (or None
    when none were found) and ``no_institutions`` (or None when some were
    found), or ``None`` when no paper metadata exists for *fn*.
    """
    # Path layout is <PDF_DIR>/<shard>/<paper_id>/...; index 2 after
    # stripping the prefix is the paper id.
    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None
    inst_keywords = ('university', 'universiteit', 'research center',
                     'research lab', 'college', ', inc', 'institute')
    with open(fn, 'r') as f:
        lines = []
        emails = []
        institutions = []
        # (original_first, original_second, lowercased_second) per author.
        authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
        # NOTE(review): assumes paper.journal is always a string — would
        # raise AttributeError if it can be None; confirm upstream.
        journal = paper.journal.lower()
        found_authors = []
        for line in f:
            l = line.lower()
            if 'abstract' in l:
                break  # header region ends at the abstract
            if len(line) < 3:
                continue  # blank / single-char noise
            if journal and journal in l:
                continue  # running journal header/footer
            if '@' in line:
                emails.append(line)
                continue
            # Split on commas and " and " so each token can be tested as
            # an author name.
            names = [s.strip() for s in re.split(',| and ', l)]
            was_found = False
            for name in names:
                found = find_authors(authors, name)
                if found:
                    was_found = True
                    if found[0]:
                        found_authors.append(found)
            if was_found:
                continue
            if any(k in l for k in inst_keywords):
                inst, department = _clean_institution(line)
                if inst:
                    institutions.append([paper_id, inst, department])
                    lines.append(BoldLine(inst))
                continue
            lines.append(line)
    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
    return {
        'first_pages': [
            paper_id,
            lines,
            found_authors,
            emails,
        ],
        'institutions': None if not len(institutions) else institutions,
        'no_institutions': None if len(institutions) else [
            paper_id,
            lines,
        ],
    }
def find_authors(authors, line):
    """Return the first author tuple whose lowercased name (element 2)
    occurs as a substring of *line*, or ``None`` if no author matches."""
    return next((author for author in authors if author[2] in line), None)
def paper_path(paper_id):
    """Return the on-disk directory for *paper_id*, sharded by its first
    two characters: ``<PDF_DIR>/<id[:2]>/<paper_id>``."""
    return '/'.join([PDF_DIR, paper_id[:2], paper_id])
if __name__ == '__main__':
    # click parses CLI arguments and dispatches to the command.
    report_first_pages()
|