summaryrefslogtreecommitdiff
path: root/scraper/institution-dataset-counts.py
blob: 75692777f8c391740fed38213090451a31720863 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess
from util import *
DIR_FINAL_CITATIONS = "../site/datasets/final"
DIR_VERIFIED_CITATIONS = "../site/datasets/verified"

@click.command()
def s2_dataset_country_report():
  """Build HTML reports counting which datasets each institution/country cites.

  Reads the final citation JSON for every verified dataset, aggregates the
  citing institutions and countries, and writes two HTML report tables
  (sorted by descending dataset count) into reports/.
  """
  citation_dir = DIR_FINAL_CITATIONS
  megapixels = load_megapixels_lookup()
  institution_lookup = {}
  country_lookup = {}
  for key in megapixels:  # only the dataset key is needed, not the record
    fn = os.path.join(citation_dir, key + '.json')
    # Not every verified dataset has a final citation file yet; skip quietly.
    if not os.path.exists(fn):
      continue
    data = read_json(fn)
    parse_institutions(data, institution_lookup, country_lookup)

  # Sort by number of datasets associated with each institution/country.
  sorted_institutions = sorted(institution_lookup.keys(), reverse=True, key=lambda x: len(institution_lookup[x]))
  sorted_countries = sorted(country_lookup.keys(), reverse=True, key=lambda x: len(country_lookup[x]))

  institution_report = [
    (key, len(institution_lookup[key]), ', '.join(institution_lookup[key]),)
      for key in sorted_institutions
  ]
  country_report = [
    (key, len(country_lookup[key]), ', '.join(country_lookup[key]),)
      for key in sorted_countries
  ]
  # NOTE(review): fixed typo in output filename ("instutition" -> "institution");
  # confirm nothing links to the old misspelled report path.
  write_report('reports/institution_dataset_report.html', 'Institution Dataset Report', keys=['Institution', 'Count', 'Datasets'], rows=institution_report)
  write_report('reports/country_dataset_report.html', 'Country Dataset Report', keys=['Country', 'Count', 'Datasets'], rows=country_report)

def parse_institutions(data, institution_lookup, country_lookup):
  key = data['paper']['key']
  for citation in data['citations']:
    for address in citation['addresses']:
      name = address['name']
      country = address['country']
      if name not in institution_lookup:
        institution_lookup[name] = []
      if key not in institution_lookup[name]:
        institution_lookup[name].append(key)
      if country not in country_lookup:
        country_lookup[country] = []
      if key not in country_lookup[country]:
        country_lookup[country].append(key)

def load_megapixels_lookup():
  """Fetch verified citation rows from the 'citation_lookup' Google sheet.

  Returns a dict keyed by dataset key. Each value is the first verified row
  for that key, augmented with:
    - 'paper_ids': list of every paper_id seen for the key
    - 'dataset': the matching record from the 'datasets' sheet ({} if absent)
  Rows with an empty paper_id or not marked verified (1/'1') are skipped.
  """
  keys, rows = fetch_google_sheet('citation_lookup')
  dataset_lookup = fetch_google_lookup('datasets')
  lookup = {}
  for row in rows:
    rec = {col: row[i] for i, col in enumerate(keys)}
    # Keep only rows that have a paper id and were manually verified.
    if rec['paper_id'] == "" or rec['verified'] not in (1, '1'):
      continue
    paper_key = rec['key']
    entry = lookup.get(paper_key)
    if entry is None:
      rec['paper_ids'] = []
      entry = lookup[paper_key] = rec
    entry['paper_ids'].append(rec['paper_id'])
    if paper_key in dataset_lookup:
      entry['dataset'] = dataset_lookup[paper_key]
    else:
      print("not in datasets lookup:", paper_key)
      entry['dataset'] = {}
  return lookup


if __name__ == '__main__':
  s2_dataset_country_report()