diff options
Diffstat (limited to 'scraper/s2-geocode-spreadsheet.py')
| -rw-r--r-- | scraper/s2-geocode-spreadsheet.py | 160 |
1 files changed, 99 insertions, 61 deletions
diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py index 66607562..b21a8453 100644 --- a/scraper/s2-geocode-spreadsheet.py +++ b/scraper/s2-geocode-spreadsheet.py @@ -7,77 +7,115 @@ from dotenv import load_dotenv from util import * load_dotenv() -@click.command() -def s2_geocode_spreadsheet(): - geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY')) +geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY')) + +print('loading institutions...') +worksheet = fetch_worksheet("institutions") +keys, rows = fetch_google_sheet("institutions") - worksheet = fetch_worksheet() - keys, rows = fetch_google_sheet() - valid_count = 0 - invalid_count = 0 +print("got {} rows".format(len(rows))) - print("got {} rows".format(len(rows))) +countries = load_countries() +print('loaded countries...') - cname_lookup = {} +cname_lookup = {} + +@click.command() +def s2_geocode_spreadsheet(): + # row_tuples = [] for i, row in enumerate(rows): - if len(row) == 6: - cname, name, address, lat, lng, org_type = row - elif len(row) == 7: - cname, name, address, lat, lng, org_type, extra_address = row - else: - print("Weirdly formatted row {}".format(i)) - continue + # row_tuples.append((i, row,)) + cname, name, address, lat, lng, org_type, extra_address, country = row if cname == name or cname not in cname_lookup: cname_lookup[cname] = i - - # 0 cname 1 name 2 address 3 lat 4 lng 5 org_type + print("built lookup") + print("processing sheet...") for i, row in enumerate(rows): - if len(row) == 6: - cname, name, address, lat, lng, org_type = row - elif len(row) == 7: - cname, name, address, lat, lng, org_type, extra_address = row - else: - print("Weirdly formatted row {}: {} entries".format(i, len(row))) - continue - if lat and lng: - continue - c_row = rows[cname_lookup[cname]] - if c_row[3] and c_row[4]: - print("name {}, found cname: {}".format(name, cname)) - worksheet.update_cell(i+2, 3, c_row[2]) - worksheet.update_cell(i+2, 4, c_row[3]) - worksheet.update_cell(i+2, 5, c_row[4]) - continue - if address: - address_to_geocode = address - elif name: - address_to_geocode = name - elif cname: - address_to_geocode = cname + hit_api = s2_geocode_row(i, row) + if hit_api: + time.sleep(1) + # parallelize(s2_geocode_row, row_tuples) + +def s2_geocode_row(i, row): + # 0 cname 1 name 2 address 3 lat 4 lng 5 org_type 6 notes 7 country + cname, name, address, lat, lng, org_type, extra_address, country = row + if lat and lng: + if not country: + update_country_from_address(address, i, countries, worksheet) + return True + return False + + c_row = rows[cname_lookup[cname]] + if c_row[3] and c_row[4]: + print("name {}, found cname: {}".format(name, cname)) + worksheet.update_cell(i+2, 2+1, c_row[2]) + worksheet.update_cell(i+2, 3+1, c_row[3]) + worksheet.update_cell(i+2, 4+1, c_row[4]) + worksheet.update_cell(i+2, 7+1, c_row[7]) + return True + if address: + address_to_geocode = address + elif name: + address_to_geocode = name + elif cname: + address_to_geocode = cname + + if not address_to_geocode: + return False - if not address_to_geocode: - continue + print(address_to_geocode) + location = geolocator.geocode(address_to_geocode) + if location: + print("{} found: {}".format(i+1, name)) + print(location.raw) + worksheet.update_cell(i+2, 2+1, location.address) + worksheet.update_cell(i+2, 3+1, location.latitude) + worksheet.update_cell(i+2, 4+1, location.longitude) + if address and address != location.address: + worksheet.update_cell(i+2, 6+1, address) # store alt address in "notes" field + valid_count += 1 + country = update_country_from_address(location.address, i, countries, worksheet) + row[2] = location.address + row[3] = location.latitude + row[4] = location.longitude + row[7] = country + return True + else: + print("{} not found: {}".format(i+1, address_to_geocode)) + invalid_count += 1 + return False - print(address_to_geocode) - location = geolocator.geocode(address_to_geocode) - if location: - print("{} found: {}".format(i+1, name)) - print(location.raw) - worksheet.update_cell(i+2, 3, location.address) - worksheet.update_cell(i+2, 4, location.latitude) - worksheet.update_cell(i+2, 5, location.longitude) - if address and address != location.address: - worksheet.update_cell(i+2, 7, address) - valid_count += 1 - row[2] = location.address - row[3] = location.latitude - row[4] = location.longitude - else: - print("{} not found: {}".format(i+1, address_to_geocode)) - invalid_count += 1 - time.sleep(2) +def update_country_from_address(address, i, countries, worksheet): + address_partz = address.split(', ') + possible_country = address_partz[-1] + country = None + if possible_country in countries: + country = countries[possible_country] + elif "China" in address: + country = "China" + elif "Singapore" in address: + country = "Singapore" + elif "Taiwan" in address: + country = "Taiwan" + elif "Russia" in address: + country = "Russia" + elif "Japan" in address: + country = "Japan" + elif "Iran" in address: + country = "Iran" + elif "Egypt" in address: + country = "Egypt" + elif "پاکستان" in address: + country = "Pakistan" + elif "السعودية" in address: + country = "Saudi Arabia" + else: + print("unknown country: {}".format(possible_country)) + return "" - print("geocoded {} addresses, {} found, {} not found".format(len(rows), valid_count, invalid_count)) + worksheet.update_cell(i+2, 7+1, country) + # print(country) + return country if __name__ == '__main__': s2_geocode_spreadsheet() |
