summary | refs | log | tree | commit | diff
path: root/scraper/expand-uni-lookup.py
diff options
context:
space:
mode:
author adamhrv <adam@ahprojects.com> 2018-12-15 19:57:49 +0100
committer adamhrv <adam@ahprojects.com> 2018-12-15 19:57:49 +0100
commit 82b2c0b5d6d7baccbe4d574d96e18fe2078047d7 (patch)
tree a8784b7ec2bc5a0451c252f66a6b786f3a2504f5 /scraper/expand-uni-lookup.py
parent 8e978af21c2b29f678a09701afb3ec7d65d0a6ab (diff)
parent c5b02ffab8d388e8a2925e51736b902a48a95e71 (diff)
Merge branch 'master' of github.com:adamhrv/megapixels_dev
Diffstat (limited to 'scraper/expand-uni-lookup.py')
-rw-r--r-- scraper/expand-uni-lookup.py 42
1 files changed, 42 insertions, 0 deletions
diff --git a/scraper/expand-uni-lookup.py b/scraper/expand-uni-lookup.py
new file mode 100644
index 00000000..4ba531fd
--- /dev/null
+++ b/scraper/expand-uni-lookup.py
@@ -0,0 +1,42 @@
+import os
+import gzip
+import glob
+import json
+import math
+import operator
+import click
+from util import *
+
+@click.command()  # expose the function below as a click command
+def expand_uni_lookup():  # load the name -> row lookup, then write its rows out as a sorted CSV
+ addresses = load_unexpanded_addresses()  # dict whose values are the converted rows
+ write_csv('reports/all_institutions_sorted.csv', keys=None, rows=sorted(addresses.values(), key=lambda x: x[0]))  # rows ordered by their first column; write_csv is not defined in this file (presumably from util -- confirm)
+
+def load_unexpanded_addresses():  # build and return a dict mapping names to converted rows; NOTE(review): this rendering dropped the indentation, so the nesting of the lines below is not visible here -- confirm against the real file
+ data = read_csv('reports/all_institutions.csv', keys=None)  # read_csv is not defined in this file (presumably from util -- confirm)
+ lookup = {}  # name string -> converted row
+ for row in data:
+ name = row[0]  # the first column is used as the name
+ if len(name.strip()) > 10:  # length test is on the stripped name; the unstripped name is what gets stored
+ uni_name = name  # default when no piece matches
+ for part in name.split(', '):  # examine each ', '-separated piece of the name
+ if 'universit' in part.lower():  # case-insensitive substring test
+ uni_name = part  # no break follows, so the last matching piece wins
+ new_row = convert_row(row)  # fresh 5-item list; columns 0 and 1 both start as the full name
+ if uni_name != name:  # a piece other than the full name was selected
+ print(uni_name)
+ new_row[0] = uni_name  # column 0 becomes the selected piece; column 1 still holds the full name
+ uni_row = new_row.copy()  # separate list, so the next assignment does not alter new_row
+ uni_row[1] = uni_name  # in uni_row both leading columns are the selected piece
+ if uni_name not in lookup:  # keep an entry already stored under this piece
+ lookup[uni_name] = uni_row
+ lookup[name] = new_row  # stored under the full name, replacing any earlier row with that name
+ return lookup
+
+def convert_row(row):  # return a new 5-item list built from the first four columns of row
+ return [
+ row[0], row[0], row[3], row[1], row[2],  # column 0 twice, then columns 3, 1 and 2
+ ]
+
+if __name__ == '__main__':  # only when the file is run as a script, not when imported
+ expand_uni_lookup()  # call the click-decorated command defined in this file