summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- scraper/s2-extract-full-pdf-txt.py 4
-rw-r--r-- scraper/util.py 9
2 files changed, 11 insertions, 2 deletions
diff --git a/scraper/s2-extract-full-pdf-txt.py b/scraper/s2-extract-full-pdf-txt.py
index c748b6a1..32e7daec 100644
--- a/scraper/s2-extract-full-pdf-txt.py
+++ b/scraper/s2-extract-full-pdf-txt.py
@@ -18,8 +18,8 @@ s2 = SemanticScholarAPI()
def extract_full_pdf_txt():
fns = []
for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
- path = os.path.dirname(fn).replace('pdf', 'txt')
- out_fn = os.path.join(path, 'paper.txt')
+ out_path = os.path.dirname(fn).replace('pdf', 'txt')
+ out_fn = os.path.join(out_path, 'paper.txt')
if not os.path.exists(out_fn):
fns.append((fn, out_path, out_fn))
parallelize(extract_txt, fns)
diff --git a/scraper/util.py b/scraper/util.py
index 0c3e2169..fa9f6a22 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -400,6 +400,7 @@ def fetch_paper(s2, paper_id):
return paper
def fetch_spreadsheet():
+ """Open the Google Spreadsheet, which contains the individual worksheets"""
scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
path = os.path.dirname(os.path.abspath(__file__))
credentials = ServiceAccountCredentials.from_json_keyfile_name(os.path.join(path, '.creds/Megapixels-ef28f91112a9.json'), scope)
@@ -409,16 +410,22 @@ def fetch_spreadsheet():
return spreadsheet
def fetch_worksheet(name="institutions"):
+ """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
spreadsheet = fetch_spreadsheet()
return spreadsheet.worksheet(name)
def fetch_google_sheet(name="institutions"):
+ """Get all the values from a particular worksheet as a list of lists.
+ Returns:
+ :keys - the first row of the document
+ :lines - a list of lists with the rest of the rows"""
rows = fetch_worksheet(name).get_all_values()
keys = rows[0]
lines = rows[1:]
return keys, lines
def fetch_google_sheet_objects(name):
+ """Get all the values from a worksheet as a list of dictionaries"""
keys, rows = fetch_google_sheet(name)
recs = []
for row in rows:
@@ -429,6 +436,8 @@ def fetch_google_sheet_objects(name):
return recs
def fetch_google_lookup(name, item_key='key'):
+ """Get all the values from a worksheet as a dictionary of dictionaries.
+ Specify which field you want to use as the dictionary key."""
keys, rows = fetch_google_sheet(name)
lookup = {}
for row in rows: