summary | refs | log | tree | commit | diff
path: root/s2-pdf-report.py
diff options
context:
space:
mode:
author: Jules Laplace <julescarbon@gmail.com> 2018-11-06 15:05:40 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-06 15:05:40 +0100
commit: aacdf0fa056b51000ff88479da479ded3f36b59c (patch)
tree: de9e221ffd9bf8c67ef54607d6267f00b5233312 /s2-pdf-report.py
parent: 002e72bb172c34bb71756f9e6c23294913f1ef85 (diff)
we geocoding
Diffstat (limited to 's2-pdf-report.py')
-rw-r--r-- s2-pdf-report.py 79
1 files changed, 62 insertions, 17 deletions
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index 7c89381f..7977660f 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -11,13 +11,32 @@ PDF_DIR = 'datasets/s2/pdf'
@click.command()
def pdf_report_first_pages():
rows = []
+ institution_names = []
+ institutions = []
+ no_institutions = []
for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
- row, institutions = process_paper(fn)
- print(row)
- rows.append(row)
+ data = process_paper(fn)
+ rows.append(data['first_pages'])
+ if data['institutions']:
+ for institution in data['institutions']:
+ institutions.append(institution)
+ institution_names.append(institution[1])
+ if data['no_institutions']:
+ no_institutions.append(data['no_institutions'])
+ deduped_institutions = dedupe(institution_names)
+
write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
- write_report('reports/institutions.html', title='Institutions', keys=None, rows=institutions)
- print("Wrote {} rows".format(len(rows)))
+ write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))
+ write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)
+ write_csv('reports/institution_names.txt', keys=None, rows=[(name,) for name in deduped_institutions])
+ print("{} deduped institutions".format(len(deduped_institutions)))
+
+def dedupe(a):
+ p = {}
+ for s in a:
+ p[s] = None
+ ss = sorted(p.keys())
+ return ss
def process_paper(fn):
paper_id = fn.replace(PDF_DIR, '').split('/')[2]
@@ -56,20 +75,46 @@ def process_paper(fn):
if was_found:
# lines.append(NameLine(line))
continue
- if 'university' in l or 'universiteit' in l or 'research center' in l:
- institutions.append(line)
- lines.append(BoldLine(line))
+ if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l:
+ inst = re.sub(r'^[\W\d]+', '', line)
+ inst = re.sub(r'[\W\d]+$', '', inst)
+ inst = re.sub(r'\s+', ' ', inst)
+ inst = re.sub(r'Dept.', 'Department ', inst)
+ if len(inst) < 160:
+ inst = inst.replace('&', 'and')
+ inst_parts = []
+ department = ''
+ for inst_part in inst.split(','):
+ inst_part = inst_part.strip()
+ inst_low = inst_part.lower()
+ if 'prof' in inst_low:
+ continue
+ if 'article ' in inst_low:
+ continue
+ if 'department' in inst_low:
+ department = inst_part
+ else:
+ inst_parts.append(inst_part)
+ inst = ', '.join(inst_parts)
+ if inst:
+ inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip()
+ institutions.append([ paper_id, inst, department ])
+ lines.append(BoldLine(inst))
continue
lines.append(line)
- return [
- paper_id,
- lines,
- found_authors,
- emails,
- ], [
- paper_id,
- sorted(institutions),
- ]
+ return {
+ 'first_pages': [
+ paper_id,
+ lines,
+ found_authors,
+ emails,
+ ],
+ 'institutions': None if not len(institutions) else institutions,
+ 'no_institutions': None if len(institutions) else [
+ paper_id,
+ lines,
+ ],
+ }
class NameLine(object):
def __init__(self, s):