import os
import glob
import time
import simplejson as json
import click
from urllib.parse import urlparse
import operator
from util import *
from geopy.geocoders import Nominatim
import random

# Output files. They are read back at start-up, so they also act as caches
# that let an interrupted run resume without repeating lookups.
_GEOCODED_CSV = './reports/doi_institutions_geocoded.csv'
_NOT_FOUND_CSV = './reports/doi_institutions_not_found.csv'

# Rows between two intermediate saves, and the pause after each request.
_CHECKPOINT_EVERY = 20
_REQUEST_DELAY_SECONDS = 2


def _save_progress(valid, invalid):
    """Write both result lists back to their CSV files."""
    write_csv(_GEOCODED_CSV, keys=None, rows=valid)
    write_csv(_NOT_FOUND_CSV, keys=None, rows=invalid)


@click.command()
@click.option('--fn', '-f', default='reports/doi_institutions.csv', help='List of institution names, to be geocoded :)')
def s2_geocode(fn):
    """Geocode the institution names listed in the CSV file FN.

    The first column of every row is taken as the institution name. Names are
    stripped of department / school parts, looked up with Nominatim, and the
    outcome is appended to either the "geocoded" or the "not found" CSV.
    Names already present in one of those files are skipped.
    """
    geolocator = Nominatim(user_agent="cool geocoding service")
    print(fn)

    rows = read_csv(fn, keys=False)
    valid = read_csv(_GEOCODED_CSV, keys=False, create=True)
    invalid = read_csv(_NOT_FOUND_CSV, keys=False, create=True)

    # One set for every name that already has an answer. The not-found file
    # stores the *raw* input row, while lookups below use the *cleaned* name,
    # so both spellings are registered; otherwise a name that had a
    # department stripped would be queried again on every run.
    seen = {row[0] for row in valid}
    for row in invalid:
        seen.add(row[0])
        seen.add(remove_department_name(row[0]))

    random.shuffle(rows)
    try:
        for i, row in enumerate(rows):
            name = remove_department_name(row[0])
            if not name or name in seen:
                continue

            location = geolocator.geocode(name)
            if location:
                print("found: {}".format(name))
                valid.append([
                    name,
                    location.latitude,
                    location.longitude,
                    location.address,
                ])
            else:
                print("not found: {}".format(name))
                invalid.append(row)
                seen.add(row[0])
            seen.add(name)

            if i and (i % _CHECKPOINT_EVERY) == 0:
                print("{}...".format(i))
                _save_progress(valid, invalid)

            # Pause after every real request to stay polite to the service.
            time.sleep(_REQUEST_DELAY_SECONDS)
    finally:
        # Always flush: covers the rows after the last checkpoint as well as
        # a geocoder error or Ctrl-C in the middle of a run.
        _save_progress(valid, invalid)


def remove_department_name(name):
    """Return *name* without its department / school components.

    The name is split on ', '; every part containing 'school of' or
    'department' (case-insensitive) is dropped and the rest is re-joined
    with ', '. The result may be an empty string.
    """
    kept = [
        part
        for part in name.split(', ')
        if 'school of' not in part.lower() and 'department' not in part.lower()
    ]
    return ', '.join(kept)


if __name__ == '__main__':
    s2_geocode()