diff options
Diffstat (limited to 'util.py')
| -rw-r--r-- | util.py | 63 |
1 files changed, 61 insertions, 2 deletions
@@ -1,8 +1,11 @@ +import re import os import csv import string import codecs +import gspread import simplejson as json +from oauth2client.service_account import ServiceAccountCredentials def read_citation_list(index=0): filename = './datasets/citations.csv' @@ -190,13 +193,55 @@ def dedupe(a): ss = sorted(p.keys()) return ss +def read_headings(fn, paper): + headings = [] + found_abstract = False + found_authors = [] + journal = paper.journal.lower() + authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ] + with open(fn, 'r') as f: + for line in f.readlines(): + line = re.sub(r"\S*@\S*\s?", '', line) + l = line.lower().strip() + if len(l) < 5: + continue + if line[0] == 'a' or line[0] == 'b' or line[0] == 'c' or line[0] == '1' or line[0] == '2' or line[0] == '3' or line[0] == '4': + line = line[1:] + line = line.strip("∗†‡") + line = line.replace("ﬂ", "fl").replace('ﬀ', 'ff').replace('ﬃ', 'ffi').replace('ﬄ', 'ffl') + line = line.strip() + if 'abstract' in l: + found_abstract = True + break + if journal and journal in l: + continue + names = [s.strip() for s in re.split(',| and ', l)] + was_found = False + for name in names: + found = find_authors(authors, name) + if found: + was_found = True + # print("found {}".format(found[1])) + if found[0]: + found_authors.append(found) + continue + headings.append(line.strip()) + return headings, found_abstract + +def find_authors(authors, line): + for a in authors: + if a[2] in line: + return a + return None + class AddressBook (object): def __init__(self): entities = {} lookup = {} - data = read_csv('reports/all_institutions_sorted.csv', keys=None) + data = fetch_google_sheet() + # keys, data = read_csv('reports/pdf_institutions_deduped.csv', keys=True) for index, line in enumerate(data): - if line[0] == line[1]: + if line[0] == line[1] or line[0] not in entities: entities[line[0]] = index lookup[line[1].lower().strip()] = line[0] self.data = data @@ -216,3 +261,17 @@ class AddressBook (object): index = 
self.entities[entity] return self.data[index] return None + +def fetch_worksheet(): + scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive'] + credentials = ServiceAccountCredentials.from_json_keyfile_name('./.creds/Megapixels-ef28f91112a9.json', scope) + docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc" + client = gspread.authorize(credentials) + spreadsheet = client.open_by_key(docid) + return spreadsheet.worksheet("institutions") + +def fetch_google_sheet(): + rows = fetch_worksheet().get_all_values() + keys = rows[0] + lines = rows[1:] + return lines |
