# summary | refs | log | tree | commit | diff
# path: root/scraper/expand-uni-lookup.py
# blob: 4ba531fda9f123643cec58848e5290fadb06d56d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import os
import gzip
import glob
import json
import math
import operator
import click
from util import *

@click.command()
def expand_uni_lookup():
  addresses = load_unexpanded_addresses()
  write_csv('reports/all_institutions_sorted.csv', keys=None, rows=sorted(addresses.values(), key=lambda x: x[0]))

def load_unexpanded_addresses():
  data = read_csv('reports/all_institutions.csv', keys=None)
  lookup = {}
  for row in data:
    name = row[0]
    if len(name.strip()) > 10:
      uni_name = name
      for part in name.split(', '):
        if 'universit' in part.lower():
          uni_name = part
      new_row = convert_row(row)
      if uni_name != name:
        print(uni_name)
        new_row[0] = uni_name
        uni_row = new_row.copy()
        uni_row[1] = uni_name
        if uni_name not in lookup:
          lookup[uni_name] = uni_row
      lookup[name] = new_row
  return lookup

def convert_row(row):
  return [
    row[0], row[0], row[3], row[1], row[2],
  ]

# Run the click command only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
  expand_uni_lookup()