diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| commit | ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch) | |
| tree | 41372528e78d4328bc2a47bbbabac7e809c58894 /scraper/s2-geocode.py | |
| parent | 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff) | |
moving stuff
Diffstat (limited to 'scraper/s2-geocode.py')
| -rw-r--r-- | scraper/s2-geocode.py | 81 |
1 files changed, 81 insertions, 0 deletions
# scraper/s2-geocode.py (added as a new file in this commit)
"""Geocode institution names and append the matches to a worksheet.

Institution names are read from a CSV report, stripped of department-level
address parts, looked up with the Google geocoder, and each hit is appended
as a row to the worksheet returned by ``fetch_worksheet()``.
"""
import glob
import operator
import os
import random
import re
import time
from urllib.parse import urlparse

import click
import simplejson as json
from dotenv import load_dotenv
from geopy import geocoders

# Provides fetch_worksheet() and read_csv() (among others) used below.
from util import *

load_dotenv()

# Lower-case substrings that mark a comma-separated address part as a
# sub-unit of an institution rather than the institution itself.
_DEPARTMENT_MARKERS = ('school of', 'department', 'dept')

# Seconds to wait between geocoding requests.
_REQUEST_DELAY = 2


@click.command()
@click.option('--fn', '-f', default='reports/doi_institutions_unknown.csv', help='List of institution names, to be geocoded :)')
def s2_geocode(fn):
    """Geocode the institution names listed in a CSV report."""
    # Alternative backend:
    # geolocator = geocoders.Nominatim(user_agent="cool geocoding service")
    geolocator = geocoders.GoogleV3(os.getenv('MAPS_API_KEY'))
    worksheet = fetch_worksheet()

    ## DISABLED!!
    # The command is deliberately switched off: everything below this
    # return is kept for when it is re-enabled, but does not run today.
    return

    print(fn)

    # NOTE: an earlier variant collected results into
    # ./reports/doi_institutions_geocoded.csv and
    # ./reports/doi_institutions_not_found.csv instead of the worksheet.
    rows = read_csv(fn, keys=False)
    for row in rows:
        # The institution name lives in the third column; skip short or
        # blank rows instead of crashing with an IndexError.
        if len(row) < 3:
            continue
        name = remove_department_name(row[2])
        if not name:
            continue

        try:
            location = geolocator.geocode(name)
        except Exception as err:
            # Best effort: one failed lookup must not abort the whole run,
            # but Ctrl-C (KeyboardInterrupt) is no longer swallowed here.
            print("geocode error for {}: {}".format(name, err))
            location = None

        if location:
            print("found: {}".format(name))
            worksheet.append_row([
                _pick_display_name(name),
                name,
                location.address,
                location.latitude,
                location.longitude,
                'edu',
            ])
        else:
            print("not found: {}".format(name))

        # Throttle requests to the geocoding service.
        # NOTE(review): placed inside the loop on the assumption that it
        # paces each lookup; the original indentation was not recoverable.
        time.sleep(_REQUEST_DELAY)


def _pick_display_name(name):
    """Return the short name to show for an institution.

    This is the last comma-separated part of *name* that mentions
    "university" (case-insensitive), or *name* itself when no part does.
    """
    display_name = name
    for part in name.split(', '):
        if 'university' in part.lower():
            display_name = part
    return display_name


def remove_department_name(name):
    """Drop department-level parts from a comma-separated institution name.

    Any ``', '``-separated part containing "school of", "department" or
    "dept" (case-insensitive) is removed; the remaining parts are re-joined
    in their original order.  Empty or ``None`` input yields ``''``.
    """
    if not name:
        return ''
    kept_parts = [
        part
        for part in name.split(', ')
        if not any(marker in part.lower() for marker in _DEPARTMENT_MARKERS)
    ]
    return ', '.join(kept_parts)


if __name__ == '__main__':
    s2_geocode()
