1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
import os
import glob
import time
import simplejson as json
import click
from urllib.parse import urlparse
import operator
from util import *
from geopy.geocoders import Nominatim
import random
@click.command()
@click.option('--fn', '-f', default='reports/doi_institutions.csv', help='List of institution names, to be geocoded :)')
def s2_geocode(fn):
    """Geocode institution names from a CSV file with Nominatim.

    The first column of each input row is cleaned with
    ``remove_department_name`` and sent to the geocoder. Hits are appended to
    ``reports/doi_institutions_geocoded.csv`` as
    ``[name, latitude, longitude, address]``; misses are appended, as the
    unmodified input row, to ``reports/doi_institutions_not_found.csv``.
    Names already present in either report are skipped, so the command can be
    re-run to resume an interrupted pass.

    Args:
        fn: Path to the CSV of institution names (only column 0 is used).
    """
    geocoded_path = './reports/doi_institutions_geocoded.csv'
    not_found_path = './reports/doi_institutions_not_found.csv'

    geolocator = Nominatim(user_agent="cool geocoding service")
    print(fn)
    rows = read_csv(fn, keys=False)
    valid = read_csv(geocoded_path, keys=False, create=True)
    invalid = read_csv(not_found_path, keys=False, create=True)

    # Sets give O(1) membership tests; the row lists are kept for writing.
    valid_names = {row[0] for row in valid if row}
    # The not-found report stores the raw input row, but the string that was
    # actually queried (and is compared below) is the cleaned name, so the
    # lookup set is keyed on the cleaned form.
    invalid_names = {remove_department_name(row[0]) for row in invalid if row}

    def save_reports():
        # Persist both reports in full; safe to call repeatedly.
        write_csv(geocoded_path, keys=None, rows=valid)
        write_csv(not_found_path, keys=None, rows=invalid)

    random.shuffle(rows)
    try:
        for i, row in enumerate(rows):
            if not row:
                # Blank CSV line: nothing to geocode.
                continue
            name = remove_department_name(row[0])
            if not name:
                continue
            if name in invalid_names or name in valid_names:
                continue
            location = geolocator.geocode(name)
            if location:
                print("found: {}".format(name))
                valid.append([
                    name,
                    location.latitude,
                    location.longitude,
                    location.address,
                ])
                valid_names.add(name)
            else:
                print("not found: {}".format(name))
                invalid.append(row)
                invalid_names.add(name)
            if i and (i % 20) == 0:
                # Periodic checkpoint so a hard kill loses little work.
                print("{}...".format(i))
                save_reports()
            # Throttle requests to the geocoding service.
            time.sleep(2)
    finally:
        # Always flush: covers the tail after the last checkpoint as well as
        # an exception or Ctrl-C raised mid-run.
        save_reports()
def remove_department_name(name):
    """Strip department-level components from an institution name.

    The name is split on ``', '``; any component containing ``'school of'``
    or ``'department'`` (case-insensitive) is dropped, and the remaining
    components are re-joined with ``', '``.

    Args:
        name: Comma-separated institution string.

    Returns:
        The name without its department/school components; an empty string
        if every component was dropped.
    """
    markers = ('school of', 'department')
    kept = [
        piece
        for piece in name.split(', ')
        if not any(marker in piece.lower() for marker in markers)
    ]
    return ', '.join(kept)
# Script entry point: click parses the command-line options and invokes the
# command when this file is executed directly.
if __name__ == '__main__':
    s2_geocode()
|