1 files changed, 45 insertions, 27 deletions
diff --git a/s2-geocode.py b/s2-geocode.py
index 0c5a3540..eee11c4d 100644
--- a/s2-geocode.py
+++ b/s2-geocode.py
@@ -1,51 +1,67 @@
+import random
+import re
 import os
 import glob
 import time
 import simplejson as json
+from geopy import geocoders
 import click
 from urllib.parse import urlparse
+from dotenv import load_dotenv
 import operator
 from util import *
-from geopy.geocoders import Nominatim
-import random
+load_dotenv()
 
 @click.command()
-@click.option('--fn', '-f', default='reports/doi_institutions.csv', help='List of institution names, to be geocoded :)')
+@click.option('--fn', '-f', default='reports/doi_institutions_unknown.csv', help='List of institution names, to be geocoded :)')
 def s2_geocode(fn):
-  geolocator = Nominatim(user_agent="cool geocoding service")
+  # geolocator = geocoders.Nominatim(user_agent="cool geocoding service")
+  geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
+  worksheet = fetch_worksheet()
+
+  ## DISABLED!! The early return below skips all geocoding; remove it to re-enable.
+  return
   print(fn)
+
   rows = read_csv(fn, keys=False)
-  valid = read_csv('./reports/doi_institutions_geocoded.csv', keys=False, create=True)
-  invalid = read_csv('./reports/doi_institutions_not_found.csv', keys=False, create=True)
-  valid_names = [row[0] for row in valid]
-  invalid_names = [row[0] for row in invalid]
-  random.shuffle(rows)
+  # valid = read_csv('./reports/doi_institutions_geocoded.csv', keys=False, create=True)
+  # invalid = read_csv('./reports/doi_institutions_not_found.csv', keys=False, create=True)
+  # valid_names = [row[0] for row in valid]
+  # invalid_names = [row[0] for row in invalid]
+  # random.shuffle(rows)
   for i, row in enumerate(rows):
-    name = remove_department_name(row[0])
+    name = row[2]
+    name = remove_department_name(name)
     if not name:
       continue
-    if name in invalid_names:
-      continue
-    if name in valid_names:
-      continue
-    location = geolocator.geocode(name)
+    try:
+      location = geolocator.geocode(name)
+    except:
+      location = None
     if location:
       print("found: {}".format(name))
-      valid.append([
-        name,
-        location.latitude,
-        location.longitude,
-        location.address,
+      cname = name
+      for word in name.split(', '):
+        if "university" in word.lower():
+          cname = word
+      worksheet.append_row([
+        cname, name, location.address, location.latitude, location.longitude, 'edu'
       ])
-      valid_names.append(name)
+      # valid.append([
+      #   name,
+      #   location.latitude,
+      #   location.longitude,
+      #   location.address,
+      # ])
+      # valid_names.append(name)
     else:
       print("not found: {}".format(name))
-      invalid.append(row)
-      invalid_names.append(row[0])
-    if i and (i % 20) == 0:
-      print("{}...".format(i))
-      write_csv('./reports/doi_institutions_geocoded.csv', keys=None, rows=valid)
-      write_csv('./reports/doi_institutions_not_found.csv', keys=None, rows=invalid)
+      # invalid.append(row)
+      # invalid_names.append(row[0])
+    # if i and (i % 20) == 0:
+      # print("{}...".format(i))
+      # write_csv('./reports/doi_institutions_geocoded.csv', keys=None, rows=valid)
+      # write_csv('./reports/doi_institutions_not_found.csv', keys=None, rows=invalid)
     time.sleep(2)
 
 def remove_department_name(name):
@@ -56,6 +72,8 @@ def remove_department_name(name):
       continue
     if 'department' in part.lower():
       continue
+    if 'dept' in part.lower():
+      continue
     valid_partz.append(part)
   return ', '.join(valid_partz)