diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-02-20 17:19:03 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-02-20 17:19:03 +0100 |
| commit | 16f7583dbdb2928c557d0ee3766f809779ae9b39 (patch) | |
| tree | 78711119681f7dbd21c7523c77ea0a4f31e56cbe | |
| parent | 9b97ddf7e1bc1febc4066cd5e083cee688d77027 (diff) | |
smoother geocode process, fix html entities
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | scraper/README.md | 2 |
| -rw-r--r-- | scraper/countries.json | 5 |
| -rw-r--r-- | scraper/s2-geocode-spreadsheet.py | 15 |
| -rw-r--r-- | scraper/util.py | 5 |

4 files changed, 19 insertions, 8 deletions
diff --git a/scraper/README.md b/scraper/README.md index e19a6920..ca6a2883 100644 --- a/scraper/README.md +++ b/scraper/README.md @@ -15,7 +15,7 @@ npm install ## simplified workflow -If you are just updating the scrape, run `s2-scrape.sh` to run just the scripts you need. +To rescrape everything, starting with the initial list of papers, please run `s2-scrape.sh`. ## workflow diff --git a/scraper/countries.json b/scraper/countries.json index 87b3c997..7de952f2 100644 --- a/scraper/countries.json +++ b/scraper/countries.json @@ -31,7 +31,7 @@ {"name": "Bouvet Island", "code": "BV"}, {"name": "Brazil", "code": "BR", "alt": ["Brasil"]}, {"name": "British Indian Ocean Territory", "code": "IO"}, -{"name": "Brunei Darussalam", "code": "BN"}, +{"name": "Brunei", "code": "BN", "alt": ["Brunei Darussalam"]}, {"name": "Bulgaria", "code": "BG"}, {"name": "Burkina Faso", "code": "BF"}, {"name": "Burundi", "code": "BI"}, @@ -43,7 +43,7 @@ {"name": "Central African Republic", "code": "CF"}, {"name": "Chad", "code": "TD"}, {"name": "Chile", "code": "CL"}, -{"name": "China", "code": "CN", "alt": ["中国"]}, +{"name": "China", "code": "CN", "alt": ["中国", "Hong Kong"]}, {"name": "Christmas Island", "code": "CX"}, {"name": "Cocos (Keeling) Islands", "code": "CC"}, {"name": "Colombia", "code": "CO"}, @@ -96,7 +96,6 @@ {"name": "Heard Island and Mcdonald Islands", "code": "HM"}, {"name": "Holy See (Vatican City State)", "code": "VA"}, {"name": "Honduras", "code": "HN"}, -{"name": "Hong Kong", "code": "HK"}, {"name": "Hungary", "code": "HU"}, {"name": "Iceland", "code": "IS", "alt": ["Ísland"]}, {"name": "India", "code": "IN"}, diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py index 98baf4b5..c48685f4 100644 --- a/scraper/s2-geocode-spreadsheet.py +++ b/scraper/s2-geocode-spreadsheet.py @@ -2,6 +2,7 @@ import os import csv import click import time +import html from geopy import geocoders from dotenv import load_dotenv from util import * @@ -32,7 +33,11 @@ 
def s2_geocode_spreadsheet(): cname_lookup[cname] = i print("built lookup") print("processing sheet...") + seen = {} for i, row in enumerate(rows): + if row[1] in seen: + continue + seen[row[1]] = True hit_api = s2_geocode_row(i, row) if hit_api: time.sleep(1) @@ -41,6 +46,12 @@ def s2_geocode_spreadsheet(): def s2_geocode_row(i, row): # 0 cname 1 name 2 address 3 lat 4 lng 5 org_type 6 notes 7 country cname, name, address, lat, lng, org_type, extra_address, country = row + decoded_cname = html.unescape(cname) + # print(decoded_cname) + if cname != decoded_cname: + worksheet.update_cell(i+2, 0+1, decoded_cname) + cname = decoded_cname + if lat and lng: if not country: update_country_from_address(address, i, countries, worksheet) @@ -75,7 +86,7 @@ def s2_geocode_row(i, row): worksheet.update_cell(i+2, 4+1, location.longitude) if address and address != location.address: worksheet.update_cell(i+2, 6+1, address) # store alt address in "notes" field - valid_count += 1 + #valid_count += 1 country = update_country_from_address(location.address, i, countries, worksheet) row[2] = location.address row[3] = location.latitude @@ -84,7 +95,7 @@ def s2_geocode_row(i, row): return True else: print("{} not found: {}".format(i+1, address_to_geocode)) - invalid_count += 1 + #invalid_count += 1 return False def update_country_from_address(address, i, countries, worksheet): diff --git a/scraper/util.py b/scraper/util.py index 0401b342..fdbc0534 100644 --- a/scraper/util.py +++ b/scraper/util.py @@ -308,7 +308,9 @@ class AddressBook (object): for index, line in enumerate(data): if line[0] == line[1] or line[0] not in entities: entities[line[0]] = index - lookup[line[1].lower().strip()] = line[0] + name = line[1].lower().strip() + if name not in lookup: + lookup[name] = line[0] self.data = data self.lookup = lookup self.entities = entities @@ -460,4 +462,3 @@ def load_countries(): for alt_name in country['alt']: lookup[alt_name] = name return lookup - |
