diff options
| author | jules@lens <julescarbon@gmail.com> | 2019-02-20 16:21:53 +0100 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2019-02-20 16:21:53 +0100 |
| commit | e0038fbc4b891fe4393acfad8d9755fa1834278e (patch) | |
| tree | c35aa2d2b0a76c10e57904ed1f41f5a7dcdd4870 /scraper/util.py | |
| parent | 225b7936cd1b80effa4bf77b1ffc3c92a8f17526 (diff) | |
| parent | 9b97ddf7e1bc1febc4066cd5e083cee688d77027 (diff) | |
mergez
Diffstat (limited to 'scraper/util.py')
| -rw-r--r-- | scraper/util.py | 27 |
1 files changed, 25 insertions, 2 deletions
```diff
diff --git a/scraper/util.py b/scraper/util.py
index 0c3e2169..0401b342 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -331,10 +331,13 @@ class AddressBook (object):
     row = self.find(address)
     if row is not None:
       return {
-        'address': row[0],
+        'name': row[0],
+        'source_name': row[1],
+        'street_adddress': row[2],
         'lat': row[3],
         'lng': row[4],
         'type': row[5],
+        'country': row[7],
       }
     return None
 
@@ -372,7 +375,7 @@ def file_path(key, paper_id, fn):
   return os.path.join(data_path(key, paper_id), fn)
 
 def parallelize(func, rows):
-  print("Fetching {} items".format(len(rows)))
+  print("Processing {} items".format(len(rows)))
   if hasattr(os, 'sched_getaffinity'):
     processCount = len(os.sched_getaffinity(0))
   else:
@@ -400,6 +403,7 @@ def fetch_paper(s2, paper_id):
   return paper
 
 def fetch_spreadsheet():
+  """Open the Google Spreadsheet, which contains the individual worksheets"""
   scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
   path = os.path.dirname(os.path.abspath(__file__))
   credentials = ServiceAccountCredentials.from_json_keyfile_name(os.path.join(path, '.creds/Megapixels-ef28f91112a9.json'), scope)
@@ -409,16 +413,22 @@ def fetch_spreadsheet():
   return spreadsheet
 
 def fetch_worksheet(name="institutions"):
+  """Get a reference to a particular "worksheet" from the Google Spreadsheet"""
   spreadsheet = fetch_spreadsheet()
   return spreadsheet.worksheet(name)
 
 def fetch_google_sheet(name="institutions"):
+  """Get all the values from a particular worksheet as a list of lists.
+  Returns:
+  :keys - the first row of the document
+  :lines - a list of lists with the rest of the rows"""
   rows = fetch_worksheet(name).get_all_values()
   keys = rows[0]
   lines = rows[1:]
   return keys, lines
 
 def fetch_google_sheet_objects(name):
+  """Get all the values from a worksheet as a list of dictionaries"""
   keys, rows = fetch_google_sheet(name)
   recs = []
   for row in rows:
@@ -429,6 +439,8 @@ def fetch_google_sheet_objects(name):
   return recs
 
 def fetch_google_lookup(name, item_key='key'):
+  """Get all the values from a worksheet as a dictionary of dictionaries.
+  Specify which field you want to use as the dictionary key."""
   keys, rows = fetch_google_sheet(name)
   lookup = {}
   for row in rows:
@@ -438,3 +450,14 @@ def fetch_google_lookup(name, item_key='key'):
     lookup[rec[item_key]] = rec
   return lookup
 
+def load_countries():
+  countries = read_json('countries.json')
+  lookup = {}
+  for country in countries:
+    name = country['name']
+    lookup[name] = name
+    if 'alt' in country:
+      for alt_name in country['alt']:
+        lookup[alt_name] = name
+  return lookup
+
```
