summary | refs | log | tree | commit | diff
path: root/s2-geocode.py
diff options
context:
space:
mode:
Diffstat (limited to 's2-geocode.py')
-rw-r--r-- s2-geocode.py 72
1 files changed, 45 insertions, 27 deletions
diff --git a/s2-geocode.py b/s2-geocode.py
index 0c5a3540..eee11c4d 100644
--- a/s2-geocode.py
+++ b/s2-geocode.py
@@ -1,51 +1,67 @@
+import random
+import re
import os
import glob
import time
import simplejson as json
+from geopy import geocoders
import click
from urllib.parse import urlparse
+from dotenv import load_dotenv
import operator
from util import *
-from geopy.geocoders import Nominatim
-import random
+load_dotenv()
@click.command()
-@click.option('--fn', '-f', default='reports/doi_institutions.csv', help='List of institution names, to be geocoded :)')
+@click.option('--fn', '-f', default='reports/doi_institutions_unknown.csv', help='List of institution names, to be geocoded :)')
def s2_geocode(fn):
- geolocator = Nominatim(user_agent="cool geocoding service")
+ # geolocator = geocoders.Nominatim(user_agent="cool geocoding service")
+ geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
+ worksheet = fetch_worksheet()
+
+ ## DISABLED!!
+ return
print(fn)
+
rows = read_csv(fn, keys=False)
- valid = read_csv('./reports/doi_institutions_geocoded.csv', keys=False, create=True)
- invalid = read_csv('./reports/doi_institutions_not_found.csv', keys=False, create=True)
- valid_names = [row[0] for row in valid]
- invalid_names = [row[0] for row in invalid]
- random.shuffle(rows)
+ # valid = read_csv('./reports/doi_institutions_geocoded.csv', keys=False, create=True)
+ # invalid = read_csv('./reports/doi_institutions_not_found.csv', keys=False, create=True)
+ # valid_names = [row[0] for row in valid]
+ # invalid_names = [row[0] for row in invalid]
+ # random.shuffle(rows)
for i, row in enumerate(rows):
- name = remove_department_name(row[0])
+ name = row[2]
+ name = remove_department_name(name)
if not name:
continue
- if name in invalid_names:
- continue
- if name in valid_names:
- continue
- location = geolocator.geocode(name)
+ try:
+ location = geolocator.geocode(name)
+ except:
+ location = None
if location:
print("found: {}".format(name))
- valid.append([
- name,
- location.latitude,
- location.longitude,
- location.address,
+ cname = name
+ for word in name.split(', '):
+ if "university" in word.lower():
+ cname = word
+ worksheet.append_row([
+ cname, name, location.address, location.latitude, location.longitude, 'edu'
])
- valid_names.append(name)
+ # valid.append([
+ # name,
+ # location.latitude,
+ # location.longitude,
+ # location.address,
+ # ])
+ # valid_names.append(name)
else:
print("not found: {}".format(name))
- invalid.append(row)
- invalid_names.append(row[0])
- if i and (i % 20) == 0:
- print("{}...".format(i))
- write_csv('./reports/doi_institutions_geocoded.csv', keys=None, rows=valid)
- write_csv('./reports/doi_institutions_not_found.csv', keys=None, rows=invalid)
+ # invalid.append(row)
+ # invalid_names.append(row[0])
+ # if i and (i % 20) == 0:
+ # print("{}...".format(i))
+ # write_csv('./reports/doi_institutions_geocoded.csv', keys=None, rows=valid)
+ # write_csv('./reports/doi_institutions_not_found.csv', keys=None, rows=invalid)
time.sleep(2)
def remove_department_name(name):
@@ -56,6 +72,8 @@ def remove_department_name(name):
continue
if 'department' in part.lower():
continue
+ if 'dept' in part.lower():
+ continue
valid_partz.append(part)
return ', '.join(valid_partz)