summaryrefslogtreecommitdiff
path: root/s2-geocode.py
blob: 76b6b1e2e4519cd8e3dafb9506bef7e3db5dbcfc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import glob
import time
import simplejson as json
import click
from urllib.parse import urlparse
import operator
from util import *
from geopy.geocoders import Nominatim
import random

@click.command()
@click.option('--fn', '-f', default='reports/institution_names.csv', help='List of institution names, to be geocoded :)')
def s2_geocode(fn):
  """Geocode institution names with Nominatim, recording hits and misses.

  Rows are read from ``fn``; the first column of each row is used as the
  query string. Names that resolve are appended to
  ``./reports/institutions_geocoded.csv`` as
  ``[name, latitude, longitude, address]``; names that do not resolve have
  their original row appended to ``./reports/institutions_not_found.csv``.

  The run is resumable: names already present in either report file are
  skipped, so re-running the command only queries names that have not been
  looked up yet and never appends duplicate rows.

  Both report files are rewritten after every 20 lookups and once more when
  the loop exits for any reason (normal completion, an exception raised by
  the geocoder, or Ctrl-C), so no completed lookup is lost.

  Args:
    fn: Path of the CSV file holding the institution names.
  """
  geocoded_fn = './reports/institutions_geocoded.csv'
  not_found_fn = './reports/institutions_not_found.csv'
  save_every = 20  # number of lookups between checkpoints

  geolocator = Nominatim(user_agent="cool geocoding service")
  print(fn)
  # NOTE(review): read_csv / write_csv are not defined in this file; they
  # presumably come from `from util import *`. The code relies on read_csv
  # returning a mutable list of row lists -- confirm against util.
  rows = read_csv(fn, keys=False)
  valid = read_csv(geocoded_fn, keys=False, create=True)
  invalid = read_csv(not_found_fn, keys=False, create=True)

  # Seed the "already handled" sets from the previous run's reports; the name
  # is the first column in both files. Sets give O(1) membership tests.
  valid_names = {done[0] for done in valid if done}
  invalid_names = {done[0] for done in invalid if done}

  def save():
    # Persist the full current state of both reports.
    write_csv(geocoded_fn, keys=None, rows=valid)
    write_csv(not_found_fn, keys=None, rows=invalid)

  random.shuffle(rows)
  unsaved = 0  # lookups performed since the last checkpoint
  try:
    for i, row in enumerate(rows):
      if not row:
        # Blank row: there is no name to look up.
        continue
      name = row[0]
      if name in invalid_names or name in valid_names:
        continue
      location = geolocator.geocode(name)
      if location:
        print("found: {}".format(name))
        valid.append([
          name,
          location.latitude,
          location.longitude,
          location.address,
        ])
        valid_names.add(name)
      else:
        print("not found: {}".format(name))
        invalid.append(row)
        invalid_names.add(name)
      unsaved += 1
      # Checkpoint on lookups performed rather than on the row index, so
      # skipped rows can neither trigger nor suppress a save.
      if unsaved >= save_every:
        print("{}...".format(i))
        save()
        unsaved = 0
      # Pause between requests to stay well within the service's rate limit.
      time.sleep(5)
  finally:
    # Flush whatever was gathered since the last checkpoint, including when
    # the loop is aborted by an error or a keyboard interrupt.
    if unsaved:
      save()

# Script entry point: invoke the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
  s2_geocode()