import glob
import operator
import os
import random
import re
import time
from urllib.parse import urlparse

import click
import simplejson as json
from dotenv import load_dotenv
from geopy import geocoders

from util import *

load_dotenv()

# Report files that accumulate results across runs.
_GEOCODED_CSV = './reports/doi_institutions_geocoded.csv'
_NOT_FOUND_CSV = './reports/doi_institutions_not_found.csv'

# Number of input rows between progress checkpoints.
_CHECKPOINT_EVERY = 20


@click.command()
@click.option('--fn', '-f', default='reports/doi_institutions_unknown.csv', help='List of institution names, to be geocoded :)')
def s2_geocode(fn):
    """Geocode institution names from a CSV file and record the results.

    Names are read from the third column of FN. Names that resolve are
    appended to the 'institutions' worksheet and to the geocoded report;
    names that do not resolve are appended to the not-found report.
    """
    # geocoders.Nominatim(user_agent=...) was used here previously.
    geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
    # fetch_worksheet / load_countries / read_csv / write_csv are not defined
    # in this file; presumably they come from `util` -- verify.
    worksheet = fetch_worksheet('institutions')
    countries = load_countries()
    rows = read_csv(fn, keys=False)
    valid = read_csv(_GEOCODED_CSV, keys=False, create=True)
    invalid = read_csv(_NOT_FOUND_CSV, keys=False, create=True)
    # NOTE(review): these two lists are maintained but never consulted.
    # They look like the start of a "skip already processed names" feature;
    # confirm the intent before relying on them or removing them.
    valid_names = [row[0] for row in valid]
    invalid_names = [row[0] for row in invalid]

    random.shuffle(rows)
    try:
        for i, row in enumerate(rows):
            if len(row) < 3:
                # The name lives in the third column; a shorter row would
                # otherwise abort the whole run with an IndexError.
                print("malformed row: {}".format(row))
                continue
            name = remove_department_name(row[2])
            if not name or len(name) < 2 or name in countries:
                print("weird name: {}".format(name))
                continue

            try:
                location = geolocator.geocode(name)
            except Exception as exc:
                # Best effort: a failed lookup is treated as "not found".
                # Catching Exception (not a bare except) lets Ctrl-C and
                # SystemExit stop the run instead of being swallowed.
                print("geocoding error for {}: {}".format(name, exc))
                location = None

            if location:
                print("found: {}".format(name))
                worksheet.append_row([
                    _canonical_name(name),
                    name,
                    location.address,
                    location.latitude,
                    location.longitude,
                    'edu',
                    '',
                ])
                valid.append([
                    name,
                    location.latitude,
                    location.longitude,
                    location.address,
                ])
                valid_names.append(name)
            else:
                print("not found: {}".format(name))
                invalid.append(row)
                invalid_names.append(row[0])

            if i and (i % _CHECKPOINT_EVERY) == 0:
                print("{}...".format(i))
                _save_progress(valid, invalid)
                # NOTE(review): the pause is applied once per checkpoint.
                # The original layout was lost in this file, so confirm it
                # was not meant to throttle every single request.
                time.sleep(2)
    finally:
        # Persist everything gathered since the last checkpoint. Without
        # this, the tail of the input (and any run of 20 rows or fewer) was
        # never written to the report files.
        _save_progress(valid, invalid)


def _canonical_name(name):
    """Return the last ', '-separated part naming a university, else name.

    Parts containing 'california' are ignored when looking for the
    university part.
    """
    cname = name
    for word in name.split(', '):
        lowered = word.lower()
        if 'university' in lowered and 'california' not in lowered:
            cname = word
    return cname


def _save_progress(valid, invalid):
    """Write the geocoded and not-found row lists to their report files."""
    write_csv(_GEOCODED_CSV, keys=None, rows=valid)
    write_csv(_NOT_FOUND_CSV, keys=None, rows=invalid)


def remove_department_name(name):
    """Drop department-like parts from a ', '-separated institution name.

    Any part containing 'school of', 'department' or 'dept' (case
    insensitive) is removed; the remaining parts are re-joined with ', '.
    """
    markers = ('school of', 'department', 'dept')
    kept = [
        part
        for part in name.split(', ')
        if not any(marker in part.lower() for marker in markers)
    ]
    return ', '.join(kept)


if __name__ == '__main__':
    s2_geocode()