summary refs log tree commit diff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/README.md 2
-rw-r--r-- scraper/countries.json 5
-rw-r--r-- scraper/s2-geocode-spreadsheet.py 15
-rw-r--r-- scraper/util.py 5
4 files changed, 19 insertions, 8 deletions
diff --git a/scraper/README.md b/scraper/README.md
index e19a6920..ca6a2883 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -15,7 +15,7 @@ npm install
## simplified workflow
-If you are just updating the scrape, run `s2-scrape.sh` to run just the scripts you need.
+To rescrape everything, starting with the initial list of papers, please run `s2-scrape.sh`.
## workflow
diff --git a/scraper/countries.json b/scraper/countries.json
index 87b3c997..7de952f2 100644
--- a/scraper/countries.json
+++ b/scraper/countries.json
@@ -31,7 +31,7 @@
{"name": "Bouvet Island", "code": "BV"},
{"name": "Brazil", "code": "BR", "alt": ["Brasil"]},
{"name": "British Indian Ocean Territory", "code": "IO"},
-{"name": "Brunei Darussalam", "code": "BN"},
+{"name": "Brunei", "code": "BN", "alt": ["Brunei Darussalam"]},
{"name": "Bulgaria", "code": "BG"},
{"name": "Burkina Faso", "code": "BF"},
{"name": "Burundi", "code": "BI"},
@@ -43,7 +43,7 @@
{"name": "Central African Republic", "code": "CF"},
{"name": "Chad", "code": "TD"},
{"name": "Chile", "code": "CL"},
-{"name": "China", "code": "CN", "alt": ["中国"]},
+{"name": "China", "code": "CN", "alt": ["中国", "Hong Kong"]},
{"name": "Christmas Island", "code": "CX"},
{"name": "Cocos (Keeling) Islands", "code": "CC"},
{"name": "Colombia", "code": "CO"},
@@ -96,7 +96,6 @@
{"name": "Heard Island and Mcdonald Islands", "code": "HM"},
{"name": "Holy See (Vatican City State)", "code": "VA"},
{"name": "Honduras", "code": "HN"},
-{"name": "Hong Kong", "code": "HK"},
{"name": "Hungary", "code": "HU"},
{"name": "Iceland", "code": "IS", "alt": ["Ísland"]},
{"name": "India", "code": "IN"},
diff --git a/scraper/s2-geocode-spreadsheet.py b/scraper/s2-geocode-spreadsheet.py
index 98baf4b5..c48685f4 100644
--- a/scraper/s2-geocode-spreadsheet.py
+++ b/scraper/s2-geocode-spreadsheet.py
@@ -2,6 +2,7 @@ import os
import csv
import click
import time
+import html
from geopy import geocoders
from dotenv import load_dotenv
from util import *
@@ -32,7 +33,11 @@ def s2_geocode_spreadsheet():
cname_lookup[cname] = i
print("built lookup")
print("processing sheet...")
+ seen = {}
for i, row in enumerate(rows):
+ if row[1] in seen:
+ continue
+ seen[row[1]] = True
hit_api = s2_geocode_row(i, row)
if hit_api:
time.sleep(1)
@@ -41,6 +46,12 @@ def s2_geocode_spreadsheet():
def s2_geocode_row(i, row):
# 0 cname 1 name 2 address 3 lat 4 lng 5 org_type 6 notes 7 country
cname, name, address, lat, lng, org_type, extra_address, country = row
+ decoded_cname = html.unescape(cname)
+ # print(decoded_cname)
+ if cname != decoded_cname:
+ worksheet.update_cell(i+2, 0+1, decoded_cname)
+ cname = decoded_cname
+
if lat and lng:
if not country:
update_country_from_address(address, i, countries, worksheet)
@@ -75,7 +86,7 @@ def s2_geocode_row(i, row):
worksheet.update_cell(i+2, 4+1, location.longitude)
if address and address != location.address:
worksheet.update_cell(i+2, 6+1, address) # store alt address in "notes" field
- valid_count += 1
+ #valid_count += 1
country = update_country_from_address(location.address, i, countries, worksheet)
row[2] = location.address
row[3] = location.latitude
@@ -84,7 +95,7 @@ def s2_geocode_row(i, row):
return True
else:
print("{} not found: {}".format(i+1, address_to_geocode))
- invalid_count += 1
+ #invalid_count += 1
return False
def update_country_from_address(address, i, countries, worksheet):
diff --git a/scraper/util.py b/scraper/util.py
index 0401b342..fdbc0534 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -308,7 +308,9 @@ class AddressBook (object):
for index, line in enumerate(data):
if line[0] == line[1] or line[0] not in entities:
entities[line[0]] = index
- lookup[line[1].lower().strip()] = line[0]
+ name = line[1].lower().strip()
+ if name not in lookup:
+ lookup[name] = line[0]
self.data = data
self.lookup = lookup
self.entities = entities
@@ -460,4 +462,3 @@ def load_countries():
for alt_name in country['alt']:
lookup[alt_name] = name
return lookup
-