1 files changed, 61 insertions, 2 deletions
diff --git a/util.py b/util.py
index d851d797..ed7ee396 100644
--- a/util.py
+++ b/util.py
@@ -1,8 +1,11 @@
+import re
 import os
 import csv
 import string
 import codecs
+import gspread
 import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
 
 def read_citation_list(index=0):
   filename = './datasets/citations.csv'
@@ -190,13 +193,55 @@ def dedupe(a):
   ss = sorted(p.keys())
   return ss
 
+def read_headings(fn, paper):
+  """Collect the cleaned text lines that precede the abstract in the file fn.
+
+  Returns (headings, found_abstract); found_abstract is True if an 'abstract' line ended the scan.
+  """
+  headings = []
+  found_abstract = False
+  found_authors = []
+  journal = paper.journal.lower()
+  authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]
+  # Plain ASCII replacements (the old 'ffi'/'ffl' strings carried zero-width non-joiners).
+  ligatures = str.maketrans({'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl'})
+  with open(fn, 'r', encoding='utf-8') as f:  # markers/ligatures below are non-ASCII
+    for line in f:
+      line = re.sub(r"\S*@\S*\s?", '', line)
+      l = line.lower().strip()
+      if len(l) < 5:
+        continue
+      if line[0] in 'abc1234':
+        line = line[1:]
+      # Trim whitespace first: a trailing newline used to shield end-of-line markers.
+      line = line.strip().strip("∗†‡").translate(ligatures).strip()
+      if 'abstract' in l:
+        found_abstract = True
+        break
+      if journal and journal in l:
+        continue
+      # NOTE(review): matched authors are only collected; the line is still kept below.
+      for name in re.split(',| and ', l):
+        found = find_authors(authors, name.strip())
+        if found and found[0]:
+          found_authors.append(found)
+      headings.append(line)
+  return headings, found_abstract
+
+def find_authors(authors, line):
+  """Return the first author tuple whose third field (lowercased name) occurs in line, else None."""
+  # Blank names are skipped: '' is a substring of every string and would match any line.
+  matches = (a for a in authors if a[2] and a[2] in line)
+  return next(matches, None)
+
 class AddressBook (object):
   def __init__(self):
     entities = {}
     lookup = {}
-    data = read_csv('reports/all_institutions_sorted.csv', keys=None)
+    data = fetch_google_sheet()
+    # keys, data = read_csv('reports/pdf_institutions_deduped.csv', keys=True)
     for index, line in enumerate(data):
-      if line[0] == line[1]:
+      if line[0] == line[1] or line[0] not in entities:
         entities[line[0]] = index
       lookup[line[1].lower().strip()] = line[0]
     self.data = data
@@ -216,3 +261,17 @@ class AddressBook (object):
         index = self.entities[entity]
         return self.data[index]
     return None
+
+def fetch_worksheet():
+  """Authorize with the service-account key file and return the 'institutions' worksheet."""
+  scopes = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+  keyfile = './.creds/Megapixels-ef28f91112a9.json'
+  creds = ServiceAccountCredentials.from_json_keyfile_name(keyfile, scopes)
+  book = gspread.authorize(creds).open_by_key("1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc")
+  return book.worksheet("institutions")
+
+def fetch_google_sheet():
+  """Return every row of the worksheet from fetch_worksheet() except the first (header) row."""
+  # Slicing alone (no rows[0] lookup) tolerates an empty result instead of raising IndexError.
+  rows = fetch_worksheet().get_all_values()
+  return rows[1:]