summaryrefslogtreecommitdiff
path: root/s2-pdf-first-pages.py
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2018-11-25 22:19:15 +0100
committerJules Laplace <julescarbon@gmail.com>2018-11-25 22:19:15 +0100
commitee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch)
tree41372528e78d4328bc2a47bbbabac7e809c58894 /s2-pdf-first-pages.py
parent255b8178af1e25a71fd23703d30c0d1f74911f47 (diff)
moving stuff
Diffstat (limited to 's2-pdf-first-pages.py')
-rw-r--r--s2-pdf-first-pages.py133
1 files changed, 0 insertions, 133 deletions
diff --git a/s2-pdf-first-pages.py b/s2-pdf-first-pages.py
deleted file mode 100644
index c8a34af4..00000000
--- a/s2-pdf-first-pages.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import re
-import os
-import gzip
-import glob
-import json
-import click
-import math
-import string
-from util import *
-
# Root directory of the per-paper folders. The report command globs it
# recursively for ``*.txt`` files and ``paper_path`` builds paths under it.
PDF_DIR = 'datasets/s2/pdf'
-
@click.command()
def report_first_pages():
    """Process every extracted text file and write the summary reports.

    Each ``*.txt`` file found recursively under ``PDF_DIR`` is handed to
    ``process_paper``.  The per-paper results are gathered into:

    * ``reports/first_pages.html`` -- one row per processed paper,
    * ``reports/institutions.html`` -- institution rows sorted by name,
    * ``reports/institutions_missing.html`` -- papers with no institution,
    * ``reports/institution_names.csv`` -- sorted, de-duplicated names.

    The number of distinct institution names is printed at the end.
    """
    rows = []
    institutions = []
    no_institutions = []

    pattern = '{}/**/*.txt'.format(PDF_DIR)
    for fn in glob.iglob(pattern, recursive=True):
        data = process_paper(fn)
        if data is None:
            # process_paper returns None (after printing a message) when the
            # paper cannot be loaded; subscripting it would raise TypeError.
            continue
        rows.append(data['first_pages'])
        if data['institutions']:
            institutions.extend(data['institutions'])
        if data['no_institutions']:
            no_institutions.append(data['no_institutions'])

    # Each institution row is [paper_id, name, department]; index 1 is the name.
    institution_names = [institution[1] for institution in institutions]
    deduped_institutions = dedupe(institution_names)

    write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
    write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
    write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
    write_csv('reports/institution_names.csv', keys=None, rows=[(name,) for name in deduped_institutions])
    print("{} deduped institutions".format(len(deduped_institutions)))
-
def dedupe(a):
    """Return the distinct items of *a* as a sorted list.

    Items must be hashable and mutually orderable.  An empty input yields
    an empty list.
    """
    return sorted(set(a))
-
# Lower-case substrings that mark a line as naming an institution.
_INSTITUTION_KEYWORDS = (
    'university',
    'universiteit',
    'research center',
    'research lab',
    'college',
    ', inc',
    'institute',
)

# "Dept" or "Dept." plus any following spaces.  The dot is optional and
# escaped so the pattern cannot swallow an unrelated character (for example
# the comma that separates the department from the institution).
_DEPT_RE = re.compile(r'\bDept\b\.?\s*')

# Candidate lines at least this long are not treated as institution names.
_MAX_INSTITUTION_LENGTH = 160


def _parse_institution(line):
    """Return ``(institution, department)`` parsed from *line*, or ``None``.

    ``None`` means the line is too long or nothing usable is left after
    cleaning.  ``department`` is ``''`` when no comma-separated part of the
    line mentions a department.
    """
    inst = re.sub(r'^[\W\d]+', '', line)    # leading punctuation / digits
    inst = re.sub(r'[\W\d]+$', '', inst)    # trailing punctuation / digits
    inst = re.sub(r'\s+', ' ', inst)
    inst = _DEPT_RE.sub('Department ', inst)
    if len(inst) >= _MAX_INSTITUTION_LENGTH:
        return None

    inst = inst.replace('&', 'and')
    department = ''
    kept_parts = []
    for part in inst.split(','):
        part = part.strip()
        part_low = part.lower()
        if 'prof' in part_low or 'article ' in part_low:
            continue
        if 'department' in part_low:
            department = part
        else:
            kept_parts.append(part)

    # Replace non-ASCII characters with spaces before deciding whether
    # anything is left, so a blank institution is never recorded.
    inst = ''.join(
        ch if ord(ch) < 128 else ' ' for ch in ', '.join(kept_parts)
    ).strip()
    if not inst:
        return None
    return inst, department


def process_paper(fn):
    """Extract authors, emails and institutions from a paper's first page.

    *fn* is the path of a text file below ``PDF_DIR``; the paper id is the
    second path component after ``PDF_DIR``.  The file is read line by line
    until a line containing "abstract" (case-insensitive).  For each line:

    * lines shorter than 3 characters, and lines containing the lower-cased
      journal name, are dropped;
    * lines containing ``@`` are collected as emails;
    * lines in which ``find_authors`` matches an author are dropped, and
      matched authors whose first field is truthy are collected;
    * lines containing an institution keyword are cleaned and recorded;
    * every other line is kept verbatim.

    The institutions are also written to ``institutions.json`` inside the
    paper's directory.

    Returns ``None`` (after printing a message) when ``load_paper`` finds no
    paper.  Otherwise returns a dict with keys ``first_pages``,
    ``institutions`` (``None`` when empty) and ``no_institutions`` (``None``
    unless no institution was found).

    ``load_paper``, ``write_json`` and ``BoldLine`` are not defined in this
    file -- presumably they come from ``util``.
    """
    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
    paper = load_paper(paper_id)
    if paper is None:
        print("{} no paper found!".format(paper_id))
        return None

    # (a[0], name, lower-cased name); a[0] looks like an author id -- TODO confirm.
    authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
    # Tolerate a missing journal; an empty string disables the journal filter.
    journal = (paper.journal or '').lower()

    lines = []
    emails = []
    institutions = []
    found_authors = []

    with open(fn, 'r') as f:
        # Iterate lazily: the loop normally stops at the abstract, so there
        # is no need to read the whole file.
        for line in f:
            lowered = line.lower()
            if 'abstract' in lowered:
                break
            if len(line) < 3:
                continue
            if journal and journal in lowered:
                continue
            if '@' in line:
                emails.append(line)
                continue

            fragments = (s.strip() for s in re.split(',| and ', lowered))
            matches = [
                match
                for match in (find_authors(authors, frag) for frag in fragments)
                if match
            ]
            if matches:
                found_authors.extend(m for m in matches if m[0])
                continue

            if any(keyword in lowered for keyword in _INSTITUTION_KEYWORDS):
                parsed = _parse_institution(line)
                # NOTE(review): a keyword line that yields no institution
                # falls through and is kept as plain text -- confirm this
                # nesting against the original file's indentation.
                if parsed is not None:
                    inst, department = parsed
                    institutions.append([paper_id, inst, department])
                    lines.append(BoldLine(inst))
                    continue

            lines.append(line)

    write_json(
        '{}/{}'.format(paper_path(paper_id), 'institutions.json'),
        {'institutions': institutions},
    )
    return {
        'first_pages': [
            paper_id,
            lines,
            found_authors,
            emails,
        ],
        'institutions': institutions if institutions else None,
        'no_institutions': None if institutions else [
            paper_id,
            lines,
        ],
    }
-
def find_authors(authors, line):
    """Return the first author whose lower-cased name occurs in *line*.

    *authors* is a sequence of tuples whose third element (index 2) is the
    lower-cased author name; *line* is expected to be lower-cased already.
    Authors with an empty name are skipped, because the empty string is a
    substring of every line and would match everything.

    Returns the matching author tuple, or ``None`` when nothing matches.
    """
    for author in authors:
        name = author[2]
        if name and name in line:
            return author
    return None
-
def paper_path(paper_id):
    """Return the directory path for *paper_id* below ``PDF_DIR``.

    The layout is ``<PDF_DIR>/<first two characters of the id>/<id>``.
    """
    shard = paper_id[0:2]
    return '{}/{}/{}'.format(PDF_DIR, shard, paper_id)
-
# Run the report command only when this file is executed as a script.
if __name__ == '__main__':
    report_first_pages()