summaryrefslogtreecommitdiff
path: root/scraper/s2-geocode-spreadsheet.py
diff options
context:
space:
mode:
authorJules Laplace <julescarbon@gmail.com>2019-02-20 16:05:25 +0100
committerJules Laplace <julescarbon@gmail.com>2019-02-20 16:05:25 +0100
commit7885a180e1b3ddc37ef2192c74a897b911e48a14 (patch)
treebac496d50008c035668d7e6c0143b7ecabd3193d /scraper/s2-geocode-spreadsheet.py
parent8f0d09f4c5bfb8b09757c3dbdb6d29061f0405d4 (diff)
adding countries to citation feed / geocode step
Diffstat (limited to 'scraper/s2-geocode-spreadsheet.py')
-rw-r--r--scraper/s2-geocode-spreadsheet.py160
1 files changed, 99 insertions, 61 deletions
diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py
index 66607562..b21a8453 100644
--- a/scraper/s2-geocode-spreadsheet.py
+++ b/scraper/s2-geocode-spreadsheet.py
@@ -7,77 +7,115 @@ from dotenv import load_dotenv
from util import *
load_dotenv()
-@click.command()
-def s2_geocode_spreadsheet():
- geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
+geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
+
+print('loading institutions...')
+worksheet = fetch_worksheet("institutions")
+keys, rows = fetch_google_sheet("institutions")
- worksheet = fetch_worksheet()
- keys, rows = fetch_google_sheet()
- valid_count = 0
- invalid_count = 0
+print("got {} rows".format(len(rows)))
- print("got {} rows".format(len(rows)))
+countries = load_countries()
+print('loaded countries...')
- cname_lookup = {}
+cname_lookup = {}
+
+@click.command()
+def s2_geocode_spreadsheet():
+ # row_tuples = []
for i, row in enumerate(rows):
- if len(row) == 6:
- cname, name, address, lat, lng, org_type = row
- elif len(row) == 7:
- cname, name, address, lat, lng, org_type, extra_address = row
- else:
- print("Weirdly formatted row {}".format(i))
- continue
+ # row_tuples.append((i, row,))
+ cname, name, address, lat, lng, org_type, extra_address, country = row
if cname == name or cname not in cname_lookup:
cname_lookup[cname] = i
-
- # 0 cname 1 name 2 address 3 lat 4 lng 5 org_type
+ print("built lookup")
+ print("processing sheet...")
for i, row in enumerate(rows):
- if len(row) == 6:
- cname, name, address, lat, lng, org_type = row
- elif len(row) == 7:
- cname, name, address, lat, lng, org_type, extra_address = row
- else:
- print("Weirdly formatted row {}: {} entries".format(i, len(row)))
- continue
- if lat and lng:
- continue
- c_row = rows[cname_lookup[cname]]
- if c_row[3] and c_row[4]:
- print("name {}, found cname: {}".format(name, cname))
- worksheet.update_cell(i+2, 3, c_row[2])
- worksheet.update_cell(i+2, 4, c_row[3])
- worksheet.update_cell(i+2, 5, c_row[4])
- continue
- if address:
- address_to_geocode = address
- elif name:
- address_to_geocode = name
- elif cname:
- address_to_geocode = cname
+ hit_api = s2_geocode_row(i, row)
+ if hit_api:
+ time.sleep(1)
+ # parallelize(s2_geocode_row, row_tuples)
+
+def s2_geocode_row(i, row):
+ # 0 cname 1 name 2 address 3 lat 4 lng 5 org_type 6 notes 7 country
+ cname, name, address, lat, lng, org_type, extra_address, country = row
+ if lat and lng:
+ if not country:
+ update_country_from_address(address, i, countries, worksheet)
+ return True
+ return False
+
+ c_row = rows[cname_lookup[cname]]
+ if c_row[3] and c_row[4]:
+ print("name {}, found cname: {}".format(name, cname))
+ worksheet.update_cell(i+2, 2+1, c_row[2])
+ worksheet.update_cell(i+2, 3+1, c_row[3])
+ worksheet.update_cell(i+2, 4+1, c_row[4])
+ worksheet.update_cell(i+2, 7+1, c_row[7])
+ return True
+ if address:
+ address_to_geocode = address
+ elif name:
+ address_to_geocode = name
+ elif cname:
+ address_to_geocode = cname
+
+ if not address_to_geocode:
+ return False
- if not address_to_geocode:
- continue
+ print(address_to_geocode)
+ location = geolocator.geocode(address_to_geocode)
+ if location:
+ print("{} found: {}".format(i+1, name))
+ print(location.raw)
+ worksheet.update_cell(i+2, 2+1, location.address)
+ worksheet.update_cell(i+2, 3+1, location.latitude)
+ worksheet.update_cell(i+2, 4+1, location.longitude)
+ if address and address != location.address:
+ worksheet.update_cell(i+2, 6+1, address) # store alt address in "notes" field
+ valid_count += 1
+ country = update_country_from_address(location.address, i, countries, worksheet)
+ row[2] = location.address
+ row[3] = location.latitude
+ row[4] = location.longitude
+ row[7] = country
+ return True
+ else:
+ print("{} not found: {}".format(i+1, address_to_geocode))
+ invalid_count += 1
+ return False
- print(address_to_geocode)
- location = geolocator.geocode(address_to_geocode)
- if location:
- print("{} found: {}".format(i+1, name))
- print(location.raw)
- worksheet.update_cell(i+2, 3, location.address)
- worksheet.update_cell(i+2, 4, location.latitude)
- worksheet.update_cell(i+2, 5, location.longitude)
- if address and address != location.address:
- worksheet.update_cell(i+2, 7, address)
- valid_count += 1
- row[2] = location.address
- row[3] = location.latitude
- row[4] = location.longitude
- else:
- print("{} not found: {}".format(i+1, address_to_geocode))
- invalid_count += 1
- time.sleep(2)
+def update_country_from_address(address, i, countries, worksheet):
+ address_partz = address.split(', ')
+ possible_country = address_partz[-1]
+ country = None
+ if possible_country in countries:
+ country = countries[possible_country]
+ elif "China" in address:
+ country = "China"
+ elif "Singapore" in address:
+ country = "Singapore"
+ elif "Taiwan" in address:
+ country = "Taiwan"
+ elif "Russia" in address:
+ country = "Russia"
+ elif "Japan" in address:
+ country = "Japan"
+ elif "Iran" in address:
+ country = "Iran"
+ elif "Egypt" in address:
+ country = "Egypt"
+ elif "پاکستان" in address:
+ country = "Pakistan"
+ elif "السعودية" in address:
+ country = "Saudi Arabia"
+ else:
+ print("unknown country: {}".format(possible_country))
+ return ""
- print("geocoded {} addresses, {} found, {} not found".format(len(rows), valid_count, invalid_count))
+ worksheet.update_cell(i+2, 7+1, country)
+ # print(country)
+ return country
if __name__ == '__main__':
s2_geocode_spreadsheet()