summaryrefslogtreecommitdiff
path: root/scraper/util.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/util.py')
-rw-r--r--scraper/util.py302
1 files changed, 302 insertions, 0 deletions
diff --git a/scraper/util.py b/scraper/util.py
new file mode 100644
index 00000000..47e5a4aa
--- /dev/null
+++ b/scraper/util.py
@@ -0,0 +1,302 @@
+import re
+import os
+import csv
+import string
+import codecs
+import gspread
+import simplejson as json
+from oauth2client.service_account import ServiceAccountCredentials
+
def read_citation_list(index=0):
    """Read ./datasets/citations.csv (or citations-<index>.csv when index > 0).

    Returns (header_row, data_rows) as lists of string lists.

    Consistency fix: now opens with newline='' and encoding='utf-8' like
    read_csv/write_csv, so quoted embedded newlines and non-ASCII text are
    handled identically everywhere in this module.
    """
    filename = './datasets/citations.csv'
    if index > 0:
        # citations.csv -> citations-<index>.csv
        fn, ext = os.path.splitext(filename)
        filename = fn + '-' + str(index) + ext
    with open(filename, 'r', newline='', encoding='utf-8') as f:
        rows = list(csv.reader(f))
    return rows[0], rows[1:]
+
def unfussy_reader(reader):
    """Yield rows from a csv reader, skipping rows that raise csv.Error.

    Bug fix: the original did ``print(csv.Error)``, which prints the
    exception *class* object rather than the actual parse error; we now
    bind the exception and print its message.
    """
    while True:
        try:
            yield next(reader)
        except StopIteration:
            return
        except csv.Error as e:
            # Log the malformed row's error and keep reading.
            print(e)
            continue
+
def read_csv(fn, keys=True, create=False):
    """Read a CSV file, tolerating malformed rows (via unfussy_reader).

    fn     -- path of the CSV file
    keys   -- when truthy, treat the first row as a header and return
              (header, rows); otherwise return just the rows
    create -- when truthy, return [] instead of raising if the file
              cannot be read (treat a missing/broken file as empty)
    """
    try:
        with open(fn, 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f)
            lines = list(unfussy_reader(reader))
        if keys:
            return lines[0], lines[1:]
        return lines
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # still propagate.
        if create:
            # NOTE(review): callers that pass keys=True get a bare [] here,
            # not a (header, rows) pair -- confirm callers handle that.
            return []
        raise
+
def csv_writer(fn):
    """Open *fn* for writing and return a csv.writer bound to it.

    Bug fix: the original opened the file inside a `with` block and
    returned the writer after the block exited, so the underlying file
    was already closed and every subsequent writerow() raised
    ValueError.  The file is now left open; it is flushed and closed
    when the caller drops the writer and the file object is collected.
    """
    f = open(fn, 'w', newline='', encoding='utf-8')
    return csv.writer(f)
+
def write_csv(fn, keys, rows):
    """Write *rows* to a CSV file, preceded by a *keys* header row unless
    keys is None."""
    with open(fn, 'w', newline='', encoding='utf-8') as out:
        w = csv.writer(out)
        if keys is not None:
            w.writerow(keys)
        w.writerows(rows)
+
def read_text(fn):
    """Return the full contents of a text file, decoded as UTF-8.

    Consistency fix: read_csv/write_csv already force UTF-8; relying on
    the platform default encoding here could garble scraped text (e.g.
    cp1252 on Windows).
    """
    with open(fn, 'r', encoding='utf-8') as f:
        return f.read()
+
def read_json(fn):
    """Load and return the JSON document stored at *fn*.

    Consistency fix: reads as UTF-8 explicitly (JSON's standard
    encoding) instead of the platform default.
    """
    with open(fn, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
+
def write_json(fn, data):
    """Serialize *data* as JSON to *fn*.

    Consistency fix: writes UTF-8 explicitly (JSON's standard encoding)
    instead of the platform default.
    """
    with open(fn, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile)
+
def write_report(fn, title=None, keys=None, rows=()):
    """Write a self-contained HTML report containing a single table.

    fn    -- output file path
    title -- optional document <title>, also rendered as an <h2>
    keys  -- optional table header cells, forwarded to write_table
    rows  -- iterable of table rows, forwarded to write_table
             (fix: default changed from the mutable [] to an immutable ())

    Prints "<fn> <row count>" when done.  Also opens the file as UTF-8
    to match the <meta charset='utf-8'> the report declares.
    """
    with open(fn, 'w', encoding='utf-8') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        f.write("<meta charset='utf-8'>")
        if title is not None:
            f.write("<title>{}</title>".format(title))
        f.write("<link rel='stylesheet' href='reports.css'>")
        f.write("</head>")
        f.write("<body>")
        if title is not None:
            f.write("<h2>{}</h2>".format(title))
        count = write_table(f, keys=keys, rows=rows)
        f.write("</body>")
        f.write("</html>")
        print("{} {}".format(fn, count))
+
def percent(m, n):
    """Return m as a whole-number percentage of n (100 when n is zero)."""
    return 100 if n == 0 else round(m / n * 100)
+
class NameLine(object):
    """Renders a stripped string as an HTML <span class="name"> element."""

    def __init__(self, s):
        # Keep the text with surrounding whitespace removed.
        self.s = s.strip()

    def __str__(self):
        return ''.join(('<span class="name">', self.s, '</span>'))
+
class BoldLine(object):
    """Renders a stripped string wrapped in HTML <b> tags."""

    def __init__(self, s):
        # Keep the text with surrounding whitespace removed.
        self.s = s.strip()

    def __str__(self):
        return '<b>%s</b>' % self.s
+
class LinkLine(object):
    """Renders text as an HTML link, or a gray span when no href is given."""

    def __init__(self, href, txt):
        self.href = href       # target URL; falsy means "no link"
        self.txt = txt.strip()

    def __str__(self):
        if self.href:
            return '<a href="{}">{}</a>'.format(self.href, self.txt)
        else:
            # Bug fix: the no-href branch closed the <span> with </a>,
            # producing invalid HTML.
            return '<span class="gray">{}</span>'.format(self.txt)
+
def write_table(f, keys, rows):
    """Write *rows* as an HTML table to file-like *f*; return the row count.

    keys -- optional header cells (None for no header row)
    rows -- iterable of rows; iteration stops at the first None row.
            List/tuple cells are joined with <br/>.

    Bug fix: a None row used to `return` early, leaving the <table> tag
    unclosed and returning None (which write_report then printed as the
    row count).  It now breaks out of the loop so the table is closed
    and the count of rows written so far is returned.
    """
    count = 0
    f.write("<table border='1' cellpadding='3' cellspacing='3'>")
    if keys is not None:
        for key in keys:
            f.write("<th>{}</th>".format(key))
    for row in rows:
        if row is None:
            break
        count += 1
        f.write("<tr>")
        for cell in row:
            if isinstance(cell, (list, tuple)):
                f.write("<td>{}</td>".format('<br/>'.join(str(x) for x in cell)))
            else:
                f.write("<td>{}</td>".format(cell))
        f.write("</tr>")
    f.write("</table>")
    return count
+
def paper_path(key='papers', paper_id=''):
    """Return the cache path for a paper's JSON, sharded by the first two
    characters of the paper id."""
    shard = paper_id[0:2]
    return '/'.join(['./datasets/s2', key, shard, paper_id, 'paper.json'])
+
class DbPaper(object):
    """Read-only view over a cached Semantic Scholar 'db_papers' record."""

    def __init__(self, paper_id):
        self.paper_id = paper_id
        # Load the cached JSON blob for this paper id.
        self.data = read_json(paper_path('db_papers', paper_id))

    @property
    def title(self):
        return self.data['title']

    @property
    def journal(self):
        return self.data['journalName']

    @property
    def year(self):
        # A missing year becomes the empty string.
        return self.data.get('year', '')

    @property
    def authors(self):
        """List of (author_id, name) pairs; id is '' when none is recorded."""
        pairs = []
        for author in self.data['authors']:
            ids = author['ids']
            pairs.append((ids[0] if ids else '', author['name']))
        return pairs

    @property
    def pdf_link(self):
        """Best-available PDF URL, or None."""
        if self.data['s2PdfUrl']:
            return self.data['s2PdfUrl']
        urls = self.data['pdfUrls']
        if urls:
            return urls[0]
        return None

    def record(self):
        return [self.paper_id, self.title, self.journal, self.year]
+
class RawPaper(object):
    """Read-only view over a cached 'raw_papers' record (nested S2 scrape)."""

    def __init__(self, paper_id):
        self.paper_id = paper_id
        raw = read_json(paper_path('raw_papers', paper_id))
        if 'paper' not in raw:
            # Malformed cache entry: dump it and leave data unset.
            print(raw)
            self.data = None
            return
        self.data = raw['paper']

    @property
    def title(self):
        return self.data['title']['text']

    @property
    def year(self):
        return self.data['year']['text']

    @property
    def journal(self):
        if 'journal' in self.data and 'name' in self.data['journal']:
            return self.data['journal']['name']
        return 'Unknown'

    @property
    def authors(self):
        """List of (author_id, name) pairs from the nested author records."""
        result = []
        for entry in self.data['authors']:
            info = entry[0]
            ids = info['ids']
            result.append((ids[0] if ids else '', info['name']))
        return result

    @property
    def pdf_link(self):
        # Returns None when no primary link is recorded.
        return self.data.get('primaryPaperLink')

    def record(self):
        return [self.paper_id, self.title, self.journal, self.year]
+
def load_paper(paper_id):
    """Load a cached paper, preferring the db_papers cache over raw_papers.

    Prints 'no paper' and returns None when neither cache file exists.
    """
    for key, cls in (('db_papers', DbPaper), ('raw_papers', RawPaper)):
        if os.path.exists(paper_path(key, paper_id)):
            return cls(paper_id)
    print('no paper')
    return None
+
def dedupe(a):
    """Return the distinct values of *a* in sorted order.

    Idiom fix: the original built a dict keyed by each value just to get
    uniqueness, then sorted the keys; sorted(set(a)) is the standard
    one-step equivalent.
    """
    return sorted(set(a))
+
def read_headings(fn, paper):
    """Scan a text dump of a paper for heading-like lines.

    fn    -- path of a plain-text extraction of the paper
    paper -- object exposing .journal and .authors (DbPaper / RawPaper)

    Collects candidate heading lines appearing before the abstract,
    filtering out e-mail addresses, journal-name lines and short lines.
    Returns (headings, found_abstract), where found_abstract is True
    when a line containing 'abstract' was reached.
    """
    headings = []
    found_abstract = False
    found_authors = []  # author triples matched in the text (collected but not returned)
    journal = paper.journal.lower()
    # Triples of (author_id, name, lowercased name) for case-insensitive matching.
    authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
    with open(fn, 'r') as f:
        for line in f.readlines():
            # Strip anything that looks like an e-mail address.
            line = re.sub(r"\S*@\S*\s?", '', line)
            l = line.lower().strip()
            if len(l) < 5:
                # Too short to be a heading.
                continue
            # Drop a leading footnote/affiliation marker (a, b, c or 1-4).
            if line[0] == 'a' or line[0] == 'b' or line[0] == 'c' or line[0] == '1' or line[0] == '2' or line[0] == '3' or line[0] == '4':
                line = line[1:]
            # Remove footnote symbols from both ends.
            line = line.strip("∗†‡")
            # NOTE(review): these replace() calls appear to map typographic
            # ligature characters to plain letter pairs, but several argument
            # pairs render identically here -- possibly mangled by encoding in
            # transit; confirm against the original file.
            line = line.replace("fl", "fl").replace('ff', 'ff').replace('ffi', 'f‌f‌i').replace('ffl', 'f‌f‌l')
            line = line.strip()
            if 'abstract' in l:
                # Headings only come before the abstract; stop here.
                found_abstract = True
                break
            if journal and journal in l:
                # Skip lines that repeat the journal name.
                continue
            # Split the lowercased line into candidate person names.
            names = [s.strip() for s in re.split(',| and ', l)]
            was_found = False
            for name in names:
                found = find_authors(authors, name)
                if found:
                    was_found = True
                    # print("found {}".format(found[1]))
                    if found[0]:
                        found_authors.append(found)
                    # NOTE(review): this `continue` only advances the inner
                    # name loop, and `was_found` is never read afterwards --
                    # the line is still appended to headings below even when
                    # it matched an author.  Possibly an intended outer-loop
                    # skip; confirm before changing.
                    continue
            headings.append(line.strip())
    return headings, found_abstract
+
def find_authors(authors, line):
    """Return the first author triple whose lowercased name (index 2)
    occurs in *line*, or None when no author matches."""
    return next((author for author in authors if author[2] in line), None)
+
class AddressBook (object):
    """Lookup table mapping institution address strings to canonical rows.

    Built from the 'institutions' Google Sheet.  Per the code below,
    column 0 holds the canonical entity name and column 1 an
    address/alias string -- TODO confirm against the actual sheet.
    """
    def __init__(self):
        entities = {}  # canonical entity name -> index of its representative row
        lookup = {}    # lowercased alias (column 1) -> canonical entity name (column 0)
        keys, data = fetch_google_sheet('institutions')
        # keys, data = read_csv('reports/pdf_institutions_deduped.csv', keys=True)
        for index, line in enumerate(data):
            # A self-named row always wins; otherwise keep the first row seen.
            if line[0] == line[1] or line[0] not in entities:
                entities[line[0]] = index
            lookup[line[1].lower().strip()] = line[0]
        self.data = data
        self.lookup = lookup
        self.entities = entities

    def find(self, address):
        """Return the canonical sheet row for an address string, or None.

        Tries the whole normalized address first, then each
        comma-separated part.
        """
        # Normalize: lowercase, trim whitespace and trailing/leading digits.
        address = address.lower().strip().strip(string.digits)
        if address in self.lookup:
            entity = self.lookup[address]
            index = self.entities[entity]
            return self.data[index]
        for part in address.split(','):
            # NOTE(review): the two replace() arguments render identically
            # here (likely a non-breaking space lost in transit) -- confirm
            # against the original source before relying on this line.
            part = part.strip().replace(' ', ' ')
            if part in self.lookup:
                entity = self.lookup[part]
                index = self.entities[entity]
                return self.data[index]
        return None
+
def fetch_spreadsheet():
    """Authorize with the service-account credentials and open the
    project's Google spreadsheet."""
    docid = "1denb7TjYsN9igHyvYah7fQ0daABW32Z30lwV7QrDJQc"
    scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('./.creds/Megapixels-ef28f91112a9.json', scope)
    return gspread.authorize(credentials).open_by_key(docid)
+
def fetch_worksheet(name="institutions"):
    """Return the named worksheet from the project spreadsheet."""
    return fetch_spreadsheet().worksheet(name)
+
def fetch_google_sheet(name="institutions"):
    """Download all values of the named worksheet.

    Returns (header_row, data_rows).
    """
    values = fetch_worksheet(name).get_all_values()
    return values[0], values[1:]