summaryrefslogtreecommitdiff
path: root/scraper/s2-geocode.py
blob: b9c31d64783e2de55a72140cf1d776d5ba9bdf3c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import random
import re
import os
import glob
import time
import simplejson as json
from geopy import geocoders
import click
from urllib.parse import urlparse
from dotenv import load_dotenv
import operator
from util import *
load_dotenv()

@click.command()
@click.option('--fn', '-f', default='reports/doi_institutions_unknown.csv', help='List of institution names, to be geocoded :)')
def s2_geocode(fn):
  """Geocode institution names from a CSV and append hits to the institutions sheet.

  Names are read from column 2 of each row of FN, stripped of department-like
  parts, and skipped when they are country names, already present in the
  "institutions" sheet, or already listed in the geocoded report. Hits are
  appended to the worksheet and to reports/doi_institutions_geocoded.csv;
  misses are appended to reports/doi_institutions_not_found.csv.
  """
  # geolocator = geocoders.Nominatim(user_agent="cool geocoding service")
  geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
  worksheet = fetch_worksheet('institutions')
  countries = load_countries()

  # Index the institutions that are already in the sheet so they are not
  # geocoded a second time.
  cname_lookup = {}
  name_lookup = {}
  institution_keys, institution_rows = fetch_google_sheet("institutions")
  for i, row in enumerate(institution_rows):
    cname, name, address, lat, lng, org_type, extra_address, country = row
    if len(cname) < 3:
      print("very short cname: {}".format(cname))
    # Prefer the row whose canonical name equals its own name; otherwise keep
    # the first row seen for that canonical name.
    if cname == name or cname not in cname_lookup:
      cname_lookup[cname] = i
    name_lookup[name] = True
  print("built lookup")

  geocoded_fn = './reports/doi_institutions_geocoded.csv'
  not_found_fn = './reports/doi_institutions_not_found.csv'

  rows = read_csv(fn, keys=False)
  valid = read_csv(geocoded_fn, keys=False, create=True)
  invalid = read_csv(not_found_fn, keys=False, create=True)
  # Names geocoded in an earlier run or earlier in this run. Names that were
  # previously *not* found are not tracked, so they are retried on every run.
  valid_names = {row[0] for row in valid}

  def save_progress():
    # Persist both report files with everything collected so far.
    write_csv(geocoded_fn, keys=None, rows=valid)
    write_csv(not_found_fn, keys=None, rows=invalid)

  random.shuffle(rows)
  try:
    for i, row in enumerate(rows):
      name = row[2]
      name = remove_department_name(name)
      if not name or len(name) < 2 or name in countries:
        print("weird name: {}".format(name))
        continue
      if name in cname_lookup or name in name_lookup:
        continue
      if name in valid_names:
        # Already geocoded; avoids a duplicate API call and a duplicate
        # worksheet row when the input repeats a name.
        continue
      try:
        location = geolocator.geocode(name)
      except Exception as e:
        # Best effort: a failed request is treated as "not found", but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        print("geocode error for {}: {}".format(name, e))
        location = None
      if location:
        print("found: {}".format(name))
        # Use the last comma-separated part mentioning "university" (but not
        # "california") as the canonical name; fall back to the full name.
        cname = name
        for word in name.split(', '):
          if "university" in word.lower() and 'california' not in word.lower():
            cname = word
        # NOTE(review): sheet rows are unpacked into 8 columns above (the last
        # being country) but only 7 values are appended here - confirm intended.
        worksheet.append_row([
          cname, name, location.address, location.latitude, location.longitude, 'edu', '',
        ])
        valid.append([
          name,
          location.latitude,
          location.longitude,
          location.address,
        ])
        valid_names.add(name)
      else:
        print("not found: {}".format(name))
        invalid.append(row)
      # Periodic checkpoint so a long run does not lose much work.
      if i and (i % 20) == 0:
        print("{}...".format(i))
        save_progress()
      # Throttle requests to the geocoding service.
      time.sleep(2)
  finally:
    # Always flush: covers the rows after the last checkpoint as well as an
    # interrupted or crashed run.
    save_progress()

def remove_department_name(name):
  """Drop department-like parts from a comma-separated institution name.

  The name is split on ', '; any part whose lowercase form contains
  'school of', 'department' or 'dept' is discarded, and the remaining parts
  are joined back together with ', '.
  """
  unwanted_markers = ('school of', 'department', 'dept')
  kept = [
    chunk for chunk in name.split(', ')
    if not any(marker in chunk.lower() for marker in unwanted_markers)
  ]
  return ', '.join(kept)

# Script entry point: s2_geocode is wrapped by @click.command(), so calling it
# with no arguments lets click read the --fn/-f option from the command line.
if __name__ == '__main__':
  s2_geocode()