diff options
Diffstat (limited to 's2-geocode.py')
| -rw-r--r-- | s2-geocode.py | 72 |
1 files changed, 45 insertions, 27 deletions
diff --git a/s2-geocode.py b/s2-geocode.py index 0c5a3540..eee11c4d 100644 --- a/s2-geocode.py +++ b/s2-geocode.py @@ -1,51 +1,67 @@ +import random +import re import os import glob import time import simplejson as json +from geopy import geocoders import click from urllib.parse import urlparse +from dotenv import load_dotenv import operator from util import * -from geopy.geocoders import Nominatim -import random +load_dotenv() @click.command() -@click.option('--fn', '-f', default='reports/doi_institutions.csv', help='List of institution names, to be geocoded :)') +@click.option('--fn', '-f', default='reports/doi_institutions_unknown.csv', help='List of institution names, to be geocoded :)') def s2_geocode(fn): - geolocator = Nominatim(user_agent="cool geocoding service") + # geolocator = geocoders.Nominatim(user_agent="cool geocoding service") + geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY')) + worksheet = fetch_worksheet() + + ## DISABLED!! + return print(fn) + rows = read_csv(fn, keys=False) - valid = read_csv('./reports/doi_institutions_geocoded.csv', keys=False, create=True) - invalid = read_csv('./reports/doi_institutions_not_found.csv', keys=False, create=True) - valid_names = [row[0] for row in valid] - invalid_names = [row[0] for row in invalid] - random.shuffle(rows) + # valid = read_csv('./reports/doi_institutions_geocoded.csv', keys=False, create=True) + # invalid = read_csv('./reports/doi_institutions_not_found.csv', keys=False, create=True) + # valid_names = [row[0] for row in valid] + # invalid_names = [row[0] for row in invalid] + # random.shuffle(rows) for i, row in enumerate(rows): - name = remove_department_name(row[0]) + name = row[2] + name = remove_department_name(name) if not name: continue - if name in invalid_names: - continue - if name in valid_names: - continue - location = geolocator.geocode(name) + try: + location = geolocator.geocode(name) + except: + location = None if location: print("found: {}".format(name)) - valid.append([ - name, - location.latitude, - location.longitude, - location.address, + cname = name + for word in name.split(', '): + if "university" in word.lower(): + cname = word + worksheet.append_row([ + cname, name, location.address, location.latitude, location.longitude, 'edu' ]) - valid_names.append(name) + # valid.append([ + # name, + # location.latitude, + # location.longitude, + # location.address, + # ]) + # valid_names.append(name) else: print("not found: {}".format(name)) - invalid.append(row) - invalid_names.append(row[0]) - if i and (i % 20) == 0: - print("{}...".format(i)) - write_csv('./reports/doi_institutions_geocoded.csv', keys=None, rows=valid) - write_csv('./reports/doi_institutions_not_found.csv', keys=None, rows=invalid) + # invalid.append(row) + # invalid_names.append(row[0]) + # if i and (i % 20) == 0: + # print("{}...".format(i)) + # write_csv('./reports/doi_institutions_geocoded.csv', keys=None, rows=valid) + # write_csv('./reports/doi_institutions_not_found.csv', keys=None, rows=invalid) time.sleep(2) def remove_department_name(name): @@ -56,6 +72,8 @@ def remove_department_name(name): continue if 'department' in part.lower(): continue + if 'dept' in part.lower(): + continue valid_partz.append(part) return ', '.join(valid_partz) |
