summary | refs | log | tree | commit | diff
path: root/scraper/expand-uni-lookup.py
diff options
context:
space:
mode:
author Adam Harvey <adam@ahprojects.com> 2018-12-23 01:37:03 +0100
committer Adam Harvey <adam@ahprojects.com> 2018-12-23 01:37:03 +0100
commit 4452e02e8b04f3476273574a875bb60cfbb4568b (patch)
tree 3ffa44f9621b736250a8b94da14a187dc785c2fe /scraper/expand-uni-lookup.py
parent 2a65f7a157bd4bace970cef73529867b0e0a374d (diff)
parent 5340bee951c18910fd764241945f1f136b5a22b4 (diff)
.
Diffstat (limited to 'scraper/expand-uni-lookup.py')
-rw-r--r-- scraper/expand-uni-lookup.py 42
1 files changed, 42 insertions, 0 deletions
diff --git a/scraper/expand-uni-lookup.py b/scraper/expand-uni-lookup.py
new file mode 100644
index 00000000..4ba531fd
--- /dev/null
+++ b/scraper/expand-uni-lookup.py
@@ -0,0 +1,42 @@
+import os
+import gzip
+import glob
+import json
+import math
+import operator
+import click
+from util import *
+
+# Command-line entry point (a click command with no options or arguments).
+# Loads the expanded institution lookup and writes its rows, sorted by their
+# first column, to reports/all_institutions_sorted.csv.
+# NOTE(review): write_csv is not defined in this file; presumably it comes
+# from `from util import *` -- confirm its signature there.
+@click.command()
+def expand_uni_lookup():
+ addresses = load_unexpanded_addresses()
+ # Only the lookup's values (the rows) are written; the dict keys are not.
+ write_csv('reports/all_institutions_sorted.csv', keys=None, rows=sorted(addresses.values(), key=lambda x: x[0]))
+
+# Reads reports/all_institutions.csv and returns a dict that maps institution
+# names to rows reshaped by convert_row(). When a comma-separated part of a
+# name contains 'universit', that part is also registered as its own key.
+# NOTE(review): this listing has lost the script's indentation, so the nesting
+# implied by the comments below is inferred from how the variables are used --
+# confirm against the real file before relying on it.
+# NOTE(review): read_csv is not defined in this file; presumably it comes
+# from `from util import *`.
+def load_unexpanded_addresses():
+ data = read_csv('reports/all_institutions.csv', keys=None)
+ lookup = {}
+ for row in data:
+ name = row[0]
+ # Names of 10 characters or fewer (after stripping whitespace) fail this test.
+ if len(name.strip()) > 10:
+ uni_name = name
+ # Case-insensitive search of the ', '-separated parts; there is no break,
+ # so the last matching part is the one left in uni_name.
+ for part in name.split(', '):
+ if 'universit' in part.lower():
+ uni_name = part
+ new_row = convert_row(row)
+ # uni_name differs from name only when a matching part was picked out above.
+ if uni_name != name:
+ print(uni_name)
+ new_row[0] = uni_name
+ # Separate list for the extracted name, with column 1 set to that name too.
+ uni_row = new_row.copy()
+ uni_row[1] = uni_name
+ # The first row seen for an extracted name is kept; later ones do not replace it.
+ if uni_name not in lookup:
+ lookup[uni_name] = uni_row
+ # The full name maps to the converted row; a later row with the same full
+ # name overwrites an earlier one.
+ lookup[name] = new_row
+ return lookup
+
+# Builds a new 5-element list from an input row: column 0 twice, followed by
+# columns 3, 1 and 2 (in that order). The input row is not modified.
+# Assuming row is a list-like sequence, fewer than 4 columns raises IndexError.
+def convert_row(row):
+ return [
+ row[0], row[0], row[3], row[1], row[2],
+ ]
+
+# Invoke the click command only when this file is run as a script.
+if __name__ == '__main__':
+ expand_uni_lookup()