From 7885a180e1b3ddc37ef2192c74a897b911e48a14 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Wed, 20 Feb 2019 16:05:25 +0100 Subject: adding countries to citation feed / geocode step --- scraper/s2-geocode.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'scraper/s2-geocode.py') diff --git a/scraper/s2-geocode.py b/scraper/s2-geocode.py index 25eb6f8a..989c17bf 100644 --- a/scraper/s2-geocode.py +++ b/scraper/s2-geocode.py @@ -40,10 +40,10 @@ def s2_geocode(fn): print("found: {}".format(name)) cname = name for word in name.split(', '): - if "university" in word.lower(): + if "university" in word.lower() and 'california' not in word.lower(): cname = word worksheet.append_row([ - cname, name, location.address, location.latitude, location.longitude, 'edu' + cname, name, location.address, location.latitude, location.longitude, 'edu', '', ]) valid.append([ name, @@ -77,3 +77,4 @@ def remove_department_name(name): if __name__ == '__main__': s2_geocode() + -- cgit v1.2.3-70-g09d2 From 881d559cb0491c532264b151ed922c401f30db96 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Wed, 20 Feb 2019 16:19:08 +0100 Subject: avoid adding very short cnames --- scraper/s2-geocode-spreadsheet.py | 2 ++ scraper/s2-geocode.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'scraper/s2-geocode.py') diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py index b21a8453..98baf4b5 100644 --- a/scraper/s2-geocode-spreadsheet.py +++ b/scraper/s2-geocode-spreadsheet.py @@ -26,6 +26,8 @@ def s2_geocode_spreadsheet(): for i, row in enumerate(rows): # row_tuples.append((i, row,)) cname, name, address, lat, lng, org_type, extra_address, country = row + if len(cname) < 3: + print("very short cname: {}".format(cname)) if cname == name or cname not in cname_lookup: cname_lookup[cname] = i print("built lookup") diff --git a/scraper/s2-geocode.py b/scraper/s2-geocode.py index 989c17bf..705f3a17 100644 --- a/scraper/s2-geocode.py +++ b/scraper/s2-geocode.py @@ -30,7 +30,7 @@ def s2_geocode(fn): for i, row in enumerate(rows): name = row[2] name = remove_department_name(name) - if not name: + if not name or len(name) < 2: continue try: location = geolocator.geocode(name) -- cgit v1.2.3-70-g09d2 From 9b97ddf7e1bc1febc4066cd5e083cee688d77027 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Wed, 20 Feb 2019 16:20:20 +0100 Subject: also avoid adding countries --- scraper/countries.json | 2 +- scraper/s2-geocode.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'scraper/s2-geocode.py') diff --git a/scraper/countries.json b/scraper/countries.json index d3dd213d..87b3c997 100644 --- a/scraper/countries.json +++ b/scraper/countries.json @@ -229,7 +229,7 @@ {"name": "Ukraine", "code": "UA"}, {"name": "United Arab Emirates", "code": "AE", "alt": ["Abu Dhabi - United Arab Emirates"]}, {"name": "United Kingdom", "code": "GB", "alt": ["UK"]}, -{"name": "United States", "code": "US", "alt": ["USA"]}, +{"name": "United States", "code": "US", "alt": ["USA", "United States of America"]}, {"name": "United States Minor Outlying Islands", "code": "UM"}, {"name": "Uruguay", "code": "UY"}, {"name": "Uzbekistan", "code": "UZ"}, diff --git a/scraper/s2-geocode.py b/scraper/s2-geocode.py index 705f3a17..1fcc690d 100644 --- a/scraper/s2-geocode.py +++ b/scraper/s2-geocode.py @@ -18,6 +18,7 @@ def s2_geocode(fn): # geolocator = geocoders.Nominatim(user_agent="cool geocoding service") geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY')) worksheet = fetch_worksheet('institutions') + countries = load_countries() # print(fn) @@ -31,6 +32,8 @@ def s2_geocode(fn): name = row[2] name = remove_department_name(name) if not name or len(name) < 2: + if cname in countries: + print("cname is a country: {}".format(cname)) continue try: location = geolocator.geocode(name) -- cgit v1.2.3-70-g09d2