From aacdf0fa056b51000ff88479da479ded3f36b59c Mon Sep 17 00:00:00 2001
From: Jules Laplace <julescarbon@gmail.com>
Date: Tue, 6 Nov 2018 15:05:40 +0100
Subject: we geocoding

---
 s2-pdf-report.py | 79 ++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 62 insertions(+), 17 deletions(-)

(limited to 's2-pdf-report.py')

diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index 7c89381f..7977660f 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -11,13 +11,32 @@ PDF_DIR = 'datasets/s2/pdf'
 @click.command()
 def pdf_report_first_pages():
   rows = []
+  institution_names = []  # element 1 of every collected institution row (duplicates kept)
+  institutions = []  # institution rows gathered across all processed files
+  no_institutions = []  # the 'no_institutions' entry of each file that supplied a truthy one
   for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
-    row, institutions = process_paper(fn)
-    print(row)
-    rows.append(row)
+    data = process_paper(fn)  # dict; keys read below: 'first_pages', 'institutions', 'no_institutions'
+    rows.append(data['first_pages'])
+    if data['institutions']:
+      for institution in data['institutions']:
+        institutions.append(institution)
+        institution_names.append(institution[1])  # same element that the institutions report is sorted by
+    if data['no_institutions']:
+      no_institutions.append(data['no_institutions'])
+  deduped_institutions = dedupe(institution_names)  # distinct names in sorted order
+
   write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
-  write_report('reports/institutions.html', title='Institutions', keys=None, rows=institutions)
-  print("Wrote {} rows".format(len(rows)))
+  write_report('reports/institutions.html', title='Institutions', keys=None, rows=sorted(institutions, key=lambda x: x[1]))  # rows ordered by element 1
+  write_report('reports/institutions_missing.html', title='Institutions', keys=None, rows=no_institutions)  # NOTE(review): same title as the report above - confirm this is intended
+  write_csv('reports/institution_names.txt', keys=None, rows=[(name,) for name in deduped_institutions])  # one single-column row per distinct name
+  print("{} deduped institutions".format(len(deduped_institutions)))
+
+def dedupe(a):  # returns the distinct items of `a` as a new sorted list
+  p = {}  # used only for its keys: repeated items collapse into a single entry
+  for s in a:
+    p[s] = None  # placeholder value; it is never read
+  ss = sorted(p.keys())  # items must be hashable (dict keys) and mutually comparable (sorted)
+  return ss
 
 def process_paper(fn):
   paper_id = fn.replace(PDF_DIR, '').split('/')[2]
@@ -56,20 +75,46 @@ def process_paper(fn):
       if was_found:
         # lines.append(NameLine(line))
         continue
-      if 'university' in l or 'universiteit' in l or 'research center' in l:
-        institutions.append(line)
-        lines.append(BoldLine(line))
+      if 'university' in l or 'universiteit' in l or 'research center' in l or 'research lab' in l or 'college' in l or ', inc' in l or 'institute' in l:
+        inst = re.sub(r'^[\W\d]+', '', line)
+        inst = re.sub(r'[\W\d]+$', '', inst)
+        inst = re.sub(r'\s+', ' ', inst)
+        inst = re.sub(r'Dept.', 'Department ', inst)
+        if len(inst) < 160:
+          inst = inst.replace('&', 'and')
+          inst_parts = []
+          department = ''
+          for inst_part in inst.split(','):
+            inst_part = inst_part.strip()
+            inst_low = inst_part.lower()
+            if 'prof' in inst_low:
+              continue
+            if 'article ' in inst_low:
+              continue
+            if 'department' in inst_low:
+              department = inst_part
+            else:
+              inst_parts.append(inst_part)
+          inst = ', '.join(inst_parts)
+          if inst:
+            inst = ''.join([i if ord(i) < 128 else ' ' for i in inst]).strip()
+            institutions.append([ paper_id, inst, department ])
+        lines.append(BoldLine(inst))
         continue
       lines.append(line)
-    return [
-      paper_id,
-      lines,
-      found_authors,
-      emails,
-    ], [
-      paper_id,
-      sorted(institutions),
-    ]
+    return {
+      'first_pages': [
+        paper_id,
+        lines,
+        found_authors,
+        emails,
+      ],
+      'institutions': None if not len(institutions) else institutions,
+      'no_institutions': None if len(institutions) else [
+        paper_id,
+        lines,
+      ],
+    }
 
 class NameLine(object):
   def __init__(self, s):
-- 
cgit v1.2.3-70-g09d2