diff options
| -rw-r--r-- | scraper/s2-extract-full-pdf-txt.py | 4 | ||||
| -rw-r--r-- | scraper/util.py | 9 |
2 files changed, 11 insertions, 2 deletions
diff --git a/scraper/s2-extract-full-pdf-txt.py b/scraper/s2-extract-full-pdf-txt.py
index c748b6a1..32e7daec 100644
--- a/scraper/s2-extract-full-pdf-txt.py
+++ b/scraper/s2-extract-full-pdf-txt.py
@@ -18,8 +18,8 @@ s2 = SemanticScholarAPI()
 def extract_full_pdf_txt():
   fns = []
   for fn in glob.iglob('datasets/s2/pdf/*/*/*.pdf'):
-    path = os.path.dirname(fn).replace('pdf', 'txt')
-    out_fn = os.path.join(path, 'paper.txt')
+    out_path = os.path.dirname(fn).replace('pdf', 'txt')
+    out_fn = os.path.join(out_path, 'paper.txt')
     if not os.path.exists(out_fn):
       fns.append((fn, out_path, out_fn))
   parallelize(extract_txt, fns)
diff --git a/scraper/util.py b/scraper/util.py
index 0c3e2169..fa9f6a22 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -400,6 +400,7 @@ def fetch_paper(s2, paper_id):
   return paper
 
 def fetch_spreadsheet():
+  """Open the Google Spreadsheet, which contains the individual worksheets"""
   scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
   path = os.path.dirname(os.path.abspath(__file__))
   credentials = ServiceAccountCredentials.from_json_keyfile_name(os.path.join(path, '.creds/Megapixels-ef28f91112a9.json'), scope)
@@ -409,16 +410,22 @@ def fetch_spreadsheet():
   return spreadsheet
 
 def fetch_worksheet(name="institutions"):
+  """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
   spreadsheet = fetch_spreadsheet()
   return spreadsheet.worksheet(name)
 
 def fetch_google_sheet(name="institutions"):
+  """Get all the values from a particular worksheet as a list of lists.
+  Returns:
+  :keys - the first row of the document
+  :lines - a list of lists with the rest of the rows"""
   rows = fetch_worksheet(name).get_all_values()
   keys = rows[0]
   lines = rows[1:]
   return keys, lines
 
 def fetch_google_sheet_objects(name):
+  """Get all the values from a worksheet as a list of dictionaries"""
   keys, rows = fetch_google_sheet(name)
   recs = []
   for row in rows:
@@ -429,6 +436,8 @@ def fetch_google_sheet_objects(name):
   return recs
 
 def fetch_google_lookup(name, item_key='key'):
+  """Get all the values from a worksheet as a dictionary of dictionaries.
+  Specify which field you want to use as the dictionary key."""
   keys, rows = fetch_google_sheet(name)
   lookup = {}
   for row in rows:
