summaryrefslogtreecommitdiff
path: root/scraper/s2-geocode.py
diff options
context:
space:
mode:
authorAdam Harvey <adam@ahprojects.com>2018-12-23 01:37:03 +0100
committerAdam Harvey <adam@ahprojects.com>2018-12-23 01:37:03 +0100
commit4452e02e8b04f3476273574a875bb60cfbb4568b (patch)
tree3ffa44f9621b736250a8b94da14a187dc785c2fe /scraper/s2-geocode.py
parent2a65f7a157bd4bace970cef73529867b0e0a374d (diff)
parent5340bee951c18910fd764241945f1f136b5a22b4 (diff)
.
Diffstat (limited to 'scraper/s2-geocode.py')
-rw-r--r--scraper/s2-geocode.py81
1 files changed, 81 insertions, 0 deletions
diff --git a/scraper/s2-geocode.py b/scraper/s2-geocode.py
new file mode 100644
index 00000000..eee11c4d
--- /dev/null
+++ b/scraper/s2-geocode.py
@@ -0,0 +1,81 @@
+import random
+import re
+import os
+import glob
+import time
+import simplejson as json
+from geopy import geocoders
+import click
+from urllib.parse import urlparse
+from dotenv import load_dotenv
+import operator
+from util import *
+load_dotenv()
+
# Safety switch. The script was originally hard-disabled with an unconditional
# ``return`` that left the geocoding loop unreachable; the same default is kept
# here, but as an explicit flag. Set to True to geocode and append worksheet rows.
_GEOCODING_ENABLED = False


@click.command()
@click.option('--fn', '-f', default='reports/doi_institutions_unknown.csv', help='List of institution names, to be geocoded :)')
def s2_geocode(fn):
    """Geocode institution names from a CSV and append the hits to a worksheet.

    Each row's third column (``row[2]``) is taken as the institution name and
    passed through ``remove_department_name``. Names that survive are looked
    up with the Google geocoder; every hit is appended to the worksheet as
    ``[short name, full name, address, latitude, longitude, 'edu']``.

    While ``_GEOCODING_ENABLED`` is False (the default) the function returns
    right after creating the geocoder and fetching the worksheet, and no rows
    are read or written.

    Args:
        fn: Path of the CSV file to read (the ``--fn`` / ``-f`` option).
    """
    # geocoders.Nominatim(user_agent=...) was the alternative backend tried here.
    geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
    # NOTE(review): fetch_worksheet / read_csv presumably come from
    # ``from util import *`` -- confirm.
    worksheet = fetch_worksheet()

    # Checked after the two setup calls so that a disabled run performs
    # exactly the same calls as the original early ``return`` did.
    if not _GEOCODING_ENABLED:
        return
    print(fn)

    rows = read_csv(fn, keys=False)
    for row in rows:
        name = remove_department_name(row[2])
        if not name:
            # Nothing left once department-like parts are stripped.
            continue
        try:
            location = geolocator.geocode(name)
        except Exception as err:
            # Best effort: one failed lookup must not abort the batch, but
            # Ctrl-C / SystemExit still propagate and the cause is reported.
            print("geocode error: {} ({})".format(name, err))
            location = None
        if location:
            print("found: {}".format(name))
            # Use the last comma-separated part mentioning "university" as
            # the short name; fall back to the full cleaned name.
            cname = name
            for word in name.split(', '):
                if "university" in word.lower():
                    cname = word
            worksheet.append_row([
                cname, name, location.address, location.latitude, location.longitude, 'edu'
            ])
        else:
            print("not found: {}".format(name))
        # Pause between lookups (presumably to stay under API rate limits).
        time.sleep(2)
+
def remove_department_name(name):
    """Strip department-like segments from a comma-separated affiliation.

    The string is split on ``', '``; any segment whose lowercase form
    contains ``'school of'``, ``'department'`` or ``'dept'`` is discarded,
    and the remaining segments are re-joined with ``', '``. The result is
    an empty string when every segment is discarded.
    """
    unwanted_markers = ('school of', 'department', 'dept')
    kept_segments = [
        segment
        for segment in name.split(', ')
        if not any(marker in segment.lower() for marker in unwanted_markers)
    ]
    return ', '.join(kept_segments)
+
# Script entry point: invoke the click command only when this file is run directly.
+if __name__ == '__main__':
+ s2_geocode()