summaryrefslogtreecommitdiff
path: root/util.py
diff options
context:
space:
mode:
Diffstat (limited to 'util.py')
-rw-r--r--util.py63
1 files changed, 61 insertions, 2 deletions
diff --git a/util.py b/util.py
index d851d797..ed7ee396 100644
--- a/util.py
+++ b/util.py
@@ -1,8 +1,11 @@
+import re
import os
import csv
import string
import codecs
+import gspread
import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
def read_citation_list(index=0):
filename = './datasets/citations.csv'
@@ -190,13 +193,55 @@ def dedupe(a):
ss = sorted(p.keys())
return ss
# Typographic f-ligature code points mapped to their plain-letter spellings.
# Written as escapes so that editors, scrapers and Unicode normalisation
# cannot silently turn the table into a no-op.
_LIGATURES = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
})
_EMAIL_RE = re.compile(r"\S*@\S*\s?")
_NAME_SPLIT_RE = re.compile(r",| and ")
# A single leading character from this set is treated as a footnote marker.
_FOOTNOTE_PREFIXES = "abc1234"
_FOOTNOTE_SYMBOLS = "∗†‡"


def read_headings(fn, paper):
    """Collect the heading lines of a text file up to its abstract.

    Each line has e-mail-like tokens removed and f-ligatures expanded to
    plain letters.  Lines whose stripped text is shorter than five
    characters are ignored.  Reading stops at the first line containing
    the word "abstract" (case-insensitive).  Lines mentioning the paper's
    journal, or naming one of the paper's authors, are left out.

    Args:
        fn: Path of the text file to read.
        paper: Object exposing ``journal`` (a string, or a falsy value when
            unknown) and ``authors`` (an iterable of indexable records;
            element 1 of each record is matched, lowercased, against the
            text).

    Returns:
        A ``(headings, found_abstract)`` tuple: the cleaned heading lines
        in file order, and whether an "abstract" line was reached.
    """
    headings = []
    found_abstract = False
    journal = (paper.journal or "").lower()
    authors = [(a[0], a[1], a[1].lower()) for a in paper.authors]

    # Decode explicitly instead of relying on the platform default;
    # undecodable bytes are replaced rather than aborting the whole file.
    with open(fn, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            line = _EMAIL_RE.sub("", line).translate(_LIGATURES)
            lowered = line.lower().strip()
            if len(lowered) < 5:
                continue

            # `lowered` is non-empty here, so `line` has a first character.
            if line[0] in _FOOTNOTE_PREFIXES:
                line = line[1:]
            # Strip whitespace first so that symbols sitting just before the
            # trailing newline are removed as well.
            line = line.strip().strip(_FOOTNOTE_SYMBOLS).strip()

            if "abstract" in lowered:
                found_abstract = True
                break
            if journal and journal in lowered:
                continue

            names = [s.strip() for s in _NAME_SPLIT_RE.split(lowered)]
            # NOTE(review): the original computed an author-found flag that
            # was never consulted; skipping author lines is the apparent
            # intent -- confirm against callers.
            if authors and any(find_authors(authors, name) for name in names):
                continue

            headings.append(line)
    return headings, found_abstract
+
def find_authors(authors, line):
    """Return the first author whose search key occurs in *line*.

    Args:
        authors: Iterable of indexable author records; element 2 of each
            record is the key that is searched for as a substring.
        line: Text to search.  The comparison is case-sensitive, so the
            caller is expected to pass text in the same case as the keys.

    Returns:
        The first matching author record, or ``None`` when nothing matches.
    """
    for author in authors:
        key = author[2]
        # An empty key is a substring of every string; without this guard a
        # single author with a blank name would match every line.
        if key and key in line:
            return author
    return None
+
class AddressBook (object):
def __init__(self):
entities = {}
lookup = {}
- data = read_csv('reports/all_institutions_sorted.csv', keys=None)
+ data = fetch_google_sheet()
+ # keys, data = read_csv('reports/pdf_institutions_deduped.csv', keys=True)
for index, line in enumerate(data):
- if line[0] == line[1]:
+ if line[0] == line[1] or line[0] not in entities:
entities[line[0]] = index
lookup[line[1].lower().strip()] = line[0]
self.data = data
@@ -216,3 +261,17 @@ class AddressBook (object):
index = self.entities[entity]
return self.data[index]
return None
+
def fetch_worksheet(name="institutions",
                    docid="1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc",
                    keyfile='./.creds/Megapixels-ef28f91112a9.json'):
    """Open a worksheet of a Google spreadsheet using a service account.

    Called without arguments this opens the "institutions" worksheet of the
    default spreadsheet, exactly as before; the parameters allow other
    sheets, documents or credentials to be used.

    Args:
        name: Title of the worksheet to open.
        docid: Key of the spreadsheet document.
        keyfile: Path to the service-account JSON key file.

    Returns:
        The worksheet object returned by ``spreadsheet.worksheet(name)``.
    """
    scope = [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive',
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(keyfile, scope)
    client = gspread.authorize(credentials)
    spreadsheet = client.open_by_key(docid)
    return spreadsheet.worksheet(name)
+
def fetch_google_sheet(worksheet=None):
    """Return the data rows of a worksheet, without its header row.

    Args:
        worksheet: Object providing ``get_all_values()``.  When omitted,
            the worksheet returned by ``fetch_worksheet()`` is used.

    Returns:
        Every row after the first one.  An empty or header-only sheet
        yields an empty list instead of raising ``IndexError``.
    """
    if worksheet is None:
        worksheet = fetch_worksheet()
    rows = worksheet.get_all_values()
    # The first row holds the column names; callers only want the data.
    return rows[1:]