1 files changed, 122 insertions, 16 deletions
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index b22d44d5..4475f3a9 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -4,12 +4,129 @@ import gzip
 import glob
 import json
 import click
+import math
+import string
 from util import *
 
 PDF_DIR = 'datasets/s2/pdf'
 
-@click.command()
-def pdf_report_first_pages():
+@click.group()  # root command group; subcommands attach via @s2_pdf_report.command()
+def s2_pdf_report():
+  pass  # no shared setup here; the group only dispatches to its subcommands
+
+@s2_pdf_report.command()
+def report_geocoded_papers():
+  # Match the heading lines of each extracted text file under PDF_DIR against
+  # the institution address book, write one CSV per outcome to reports/stats/,
+  # and print summary counts. (Plain comments on purpose: click would show a
+  # docstring as this command's --help text.)
+  empty_papers, no_separator_papers = [], []
+  geocoded_papers, unknown_papers = [], []
+  found_count = total_count = 0
+  addresses = AddressBook()
+  pattern = '{}/**/*.txt'.format(PDF_DIR)
+  for fn in glob.iglob(pattern, recursive=True):
+    # The paper id is the third '/'-separated piece once PDF_DIR is removed.
+    paper_id = fn.replace(PDF_DIR, '').split('/')[2]
+    paper = load_paper(paper_id)
+    total_count += 1
+    headings, found_abstract = read_headings(fn, paper)
+    if not found_abstract:
+      if not headings:
+        # Nothing usable was read from this file; skip geocoding entirely.
+        empty_papers.append(paper.record())
+        continue
+      if len(headings) > 20:
+        # No abstract marker and a long preamble: recorded here, yet the
+        # paper is still geocoded below.
+        no_separator_papers.append(paper.record())
+
+    # One lookup per heading line; keep only the lines that matched.
+    candidates = (addresses.find(h.lower().strip()) for h in headings)
+    found_addresses = [hit for hit in candidates if hit]
+
+    if found_addresses:
+      found_count += 1
+      # One output row per matched line, so a paper may appear several times.
+      for hit in found_addresses:
+        geocoded_papers.append([paper.paper_id, paper.title] + hit)
+    else:
+      # Keep the first 20 heading lines to help diagnose the miss.
+      unknown_papers.append([paper.paper_id, paper.title, '\n'.join(headings[0:20])])
+
+  write_csv('reports/stats/empty_papers.csv', keys=None, rows=empty_papers)
+  write_csv('reports/stats/no_separator_papers.csv', keys=None, rows=no_separator_papers)
+  write_csv('reports/stats/geocoded_papers.csv', keys=None, rows=geocoded_papers)
+  write_csv('reports/stats/unknown_papers.csv', keys=None, rows=unknown_papers)
+
+  # Percentages are relative to every text file visited by the glob.
+  summary = "{} {} ({}%)"
+  print(summary.format('empty', len(empty_papers), percent(len(empty_papers), total_count)))
+  print(summary.format('no separator', len(no_separator_papers), percent(len(no_separator_papers), total_count)))
+  print(summary.format('found', found_count, percent(found_count, total_count)))
+  print(summary.format('unknown', len(unknown_papers), percent(len(unknown_papers), total_count)))
+  print("{} {} entities".format('geocoded', len(geocoded_papers)))
+
+def percent(a,b):  # a as a whole-number percentage of b; 0 when b is 0 (e.g. no files found)
+  return round(100 * a / b) if b else 0
+
+def read_headings(fn, paper):
+  """Return (cleaned lines of text file fn that precede the first 'abstract' line, whether one was found)."""
+  headings = []
+  found_abstract = False
+  found_authors = []
+  journal = paper.journal.lower()
+  authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
+  # Plain-ASCII expansions for the Latin ligatures (ff, fi, fl, ffi, ffl) that
+  # show up in the extracted text, so cleaned lines can match institution names.
+  ligatures = str.maketrans({'\ufb00': 'ff', '\ufb01': 'fi', '\ufb02': 'fl', '\ufb03': 'ffi', '\ufb04': 'ffl'})
+  with open(fn, 'r') as f:
+    for line in f:
+      line = re.sub(r"\S*@\S*\s?", '', line)  # drop any token containing '@'
+      l = line.lower().strip()
+      if len(l) < 5:
+        continue
+      if 'abstract' in l:
+        found_abstract = True
+        break
+      if journal and journal in l:
+        continue
+      if line[0] in 'abc1234':  # presumably a leading affiliation marker
+        line = line[1:]
+      # Trim whitespace first: the trailing newline otherwise shields a
+      # footnote symbol at the end of the line from being stripped.
+      line = line.strip().strip("∗†‡").translate(ligatures).strip()
+      # NOTE(review): author matches are collected but never used, so author
+      # lines still end up in headings -- confirm whether they should be skipped.
+      for name in re.split(',| and ', l):
+        found = find_authors(authors, name.strip())
+        if found and found[0]:
+          found_authors.append(found)
+      headings.append(line)
+  return headings, found_abstract
+
+class AddressBook (object):
+  """Exact-match index from a lowercased institution name to its CSV row."""
+  def __init__(self):
+    # Rows are kept as loaded; the index stores each row's position.
+    rows = read_csv('reports/all_institutions_sorted.csv', keys=None)
+    # Column 1 is the lookup key; if two rows normalize alike, the later wins.
+    self.lookup = {row[1].lower().strip(): pos for pos, row in enumerate(rows)}
+    self.data = rows
+  def find(self, address):
+    """Return the row matching address or its first matching comma-separated piece, else None."""
+    # The whole string (lowercased, outer whitespace then outer digits trimmed) is tried first.
+    whole = address.lower().strip().strip(string.digits)
+    candidates = [whole]
+    candidates += [piece.strip().replace('  ', ' ') for piece in whole.split(',')]
+    for candidate in candidates:
+      position = self.lookup.get(candidate)
+      if position is not None:
+        return self.data[position]
+    return None
+
+@s2_pdf_report.command()
+def report_first_pages():
   rows = []
   institution_names = []
   institutions = []
@@ -102,6 +219,7 @@ def process_paper(fn):
         lines.append(BoldLine(inst))
         continue
       lines.append(line)
+    write_json('{}/{}'.format(paper_path(paper_id), 'institutions.json'), { 'institutions': institutions })
     return {
       'first_pages': [
         paper_id,
@@ -116,18 +234,6 @@ def process_paper(fn):
       ],
     }
 
-class NameLine(object):
-  def __init__(self, s):
-    self.s = s.strip()
-  def __str__(self):
-    return '<span class="name">' + self.s + '</span>'
-
-class BoldLine(object):
-  def __init__(self, s):
-    self.s = s.strip()
-  def __str__(self):
-    return '<b>' + self.s + '</b>'
-
 def find_authors(authors, line):
   for a in authors:
     if a[2] in line:
@@ -135,7 +241,7 @@ def find_authors(authors, line):
   return None
 
 def paper_path(paper_id):
-  return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
+  return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)  # <PDF_DIR>/<first two id chars>/<paper_id>
   
 if __name__ == '__main__':
-  pdf_report_first_pages()
+  s2_pdf_report()  # invoke the click group defined in this file