diff options
| author | jules@lens <julescarbon@gmail.com> | 2019-03-04 22:28:05 +0100 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2019-03-04 22:28:05 +0100 |
| commit | 754f147d559e44c9830512b7b2a3790577fe7b38 (patch) | |
| tree | 7d5cee6105c6b505eeb2b5130efa95450a9f7443 /scraper | |
| parent | d675f85d0844dd4a25407cea1dc6cf52b795157d (diff) | |
| parent | 312fb050ef76e0d48a89ca2c46a801cede4cb6d7 (diff) | |
Merge branch 'master' of asdf.us:megapixels_dev
Diffstat (limited to 'scraper')
| -rw-r--r-- | scraper/s2-geocode-spreadsheet.py | 4 |
| -rw-r--r-- | scraper/s2-geocode.py | 14 |
2 files changed, 18 insertions, 0 deletions
```diff
diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py
index b9f148a3..375c8fde 100644
--- a/scraper/s2-geocode-spreadsheet.py
+++ b/scraper/s2-geocode-spreadsheet.py
@@ -111,6 +111,8 @@ def update_country_from_address(address, i, countries, worksheet):
     country = countries[possible_country]
   elif "China" in address:
     country = "China"
+  elif "Hong Kong" in address:
+    country = "China"
   elif "Singapore" in address:
     country = "Singapore"
   elif "Taiwan" in address:
@@ -119,6 +121,8 @@ def update_country_from_address(address, i, countries, worksheet):
     country = "Russia"
   elif "Ukraine" in address:
     country = "Ukraine"
+  elif "Hungary" in address:
+    country = "Hungary"
   elif "Japan" in address:
     country = "Japan"
   elif "Iran" in address:
diff --git a/scraper/s2-geocode.py b/scraper/s2-geocode.py
index e1f012c9..b9c31d64 100644
--- a/scraper/s2-geocode.py
+++ b/scraper/s2-geocode.py
@@ -21,6 +21,18 @@ def s2_geocode(fn):
   countries = load_countries()
   # print(fn)
+  cname_lookup = {}
+  name_lookup = {}
+  institution_keys, institution_rows = fetch_google_sheet("institutions")
+  for i, row in enumerate(institution_rows):
+    # row_tuples.append((i, row,))
+    cname, name, address, lat, lng, org_type, extra_address, country = row
+    if len(cname) < 3:
+      print("very short cname: {}".format(cname))
+    if cname == name or cname not in cname_lookup:
+      cname_lookup[cname] = i
+    name_lookup[name] = True
+  print("built lookup")
   rows = read_csv(fn, keys=False)
   valid = read_csv('./reports/doi_institutions_geocoded.csv', keys=False, create=True)
@@ -34,6 +46,8 @@ def s2_geocode(fn):
     if not name or len(name) < 2 or name in countries:
       print("weird name: {}".format(name))
       continue
+    if name in cname_lookup or name in name_lookup:
+      continue
     try:
       location = geolocator.geocode(name)
     except:
```
