summaryrefslogtreecommitdiff
path: root/scraper/expand-uni-lookup.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/expand-uni-lookup.py')
-rw-r--r--scraper/expand-uni-lookup.py42
1 files changed, 42 insertions, 0 deletions
diff --git a/scraper/expand-uni-lookup.py b/scraper/expand-uni-lookup.py
new file mode 100644
index 00000000..4ba531fd
--- /dev/null
+++ b/scraper/expand-uni-lookup.py
@@ -0,0 +1,42 @@
+import os
+import gzip
+import glob
+import json
+import math
+import operator
+import click
+from util import *
+
# Command-line entry point: loads the expanded institution lookup and writes
# its rows, ordered by their first column, to
# reports/all_institutions_sorted.csv.
# NOTE: documented with comments rather than a docstring on purpose -- click
# uses a command's docstring as its --help text, which would change output.
@click.command()
def expand_uni_lookup():
    lookup = load_unexpanded_addresses()
    # Only the rows are written; the lookup keys are discarded here.
    ordered_rows = sorted(lookup.values(), key=operator.itemgetter(0))
    write_csv(
        'reports/all_institutions_sorted.csv',
        keys=None,
        rows=ordered_rows,
    )
+
def load_unexpanded_addresses():
    """Build a name -> row lookup from reports/all_institutions.csv.

    Each input row whose first column is longer than 10 characters (after
    stripping whitespace) is converted with ``convert_row`` and stored under
    its full name. When the name is a ', '-separated list and one of its
    parts contains 'universit' (case-insensitive), that part is treated as
    the short institution name:

    * the converted row's first column is replaced by the short name, and
    * a copy of the row, with its second column also set to the short name,
      is stored under the short name unless that key already exists.

    Returns:
        dict mapping names to converted row lists.
    """
    lookup = {}
    for record in read_csv('reports/all_institutions.csv', keys=None):
        full_name = record[0]

        # Names of 10 characters or fewer (ignoring surrounding whitespace)
        # are skipped entirely.
        if len(full_name.strip()) <= 10:
            continue

        # If several parts match, the last one wins; with no match the full
        # name is kept. The 'universit' stem presumably covers spellings
        # beyond "university" -- TODO confirm intent.
        matching_parts = [
            piece for piece in full_name.split(', ')
            if 'universit' in piece.lower()
        ]
        short_name = matching_parts[-1] if matching_parts else full_name

        entry = convert_row(record)
        if short_name != full_name:
            print(short_name)
            entry[0] = short_name
            # Register the short name once; the first row seen for it is
            # kept and later ones do not overwrite it.
            if short_name not in lookup:
                short_entry = entry.copy()
                short_entry[1] = short_name
                lookup[short_name] = short_entry

        # The full name always maps to the most recent row carrying it.
        lookup[full_name] = entry
    return lookup
+
def convert_row(row):
    """Return a new five-element list built from ``row``.

    The result holds, in order, the values at indices 0, 0, 3, 1 and 2 of
    ``row``: the first column is duplicated and the fourth column is moved
    ahead of the second and third. ``row`` itself is not modified.

    Raises:
        IndexError: if ``row`` has fewer than four elements.
    """
    source_indices = (0, 0, 3, 1, 2)
    return [row[index] for index in source_indices]
+
# Run the click command only when this file is executed as a script, so that
# importing the module does not trigger the CSV read/write.
if __name__ == "__main__":
    expand_uni_lookup()