summaryrefslogtreecommitdiff
path: root/scraper/institution-dataset-counts.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/institution-dataset-counts.py')
-rw-r--r--scraper/institution-dataset-counts.py80
1 file changed, 80 insertions, 0 deletions
diff --git a/scraper/institution-dataset-counts.py b/scraper/institution-dataset-counts.py
new file mode 100644
index 00000000..75692777
--- /dev/null
+++ b/scraper/institution-dataset-counts.py
@@ -0,0 +1,80 @@
+import os
+import re
+import glob
+import simplejson as json
+import math
+import operator
+import click
+import subprocess
+from util import *
+DIR_FINAL_CITATIONS = "../site/datasets/final"
+DIR_VERIFIED_CITATIONS = "../site/datasets/verified"
+
@click.command()
def s2_dataset_country_report():
    """Generate HTML reports counting dataset citations per institution and per country.

    Reads each verified dataset's citation JSON from DIR_FINAL_CITATIONS,
    aggregates institution -> [dataset keys] and country -> [dataset keys]
    mappings, and writes two HTML report tables sorted by citation count
    (most-cited first).
    """
    citation_dir = DIR_FINAL_CITATIONS
    megapixels = load_megapixels_lookup()
    institution_lookup = {}
    country_lookup = {}
    for key, item in megapixels.items():
        fn = os.path.join(citation_dir, key + '.json')
        if not os.path.exists(fn):
            # Skip datasets that have no final citation file yet.
            continue
        data = read_json(fn)
        parse_institutions(data, institution_lookup, country_lookup)

    # Order keys by how many datasets cite them, descending.
    sorted_institutions = sorted(institution_lookup.keys(), reverse=True,
                                 key=lambda x: len(institution_lookup[x]))
    sorted_countries = sorted(country_lookup.keys(), reverse=True,
                              key=lambda x: len(country_lookup[x]))

    institution_report = [
        (key, len(institution_lookup[key]), ', '.join(institution_lookup[key]))
        for key in sorted_institutions
    ]
    country_report = [
        (key, len(country_lookup[key]), ', '.join(country_lookup[key]))
        for key in sorted_countries
    ]
    # BUG FIX: output filename was misspelled 'instutition_dataset_report.html'.
    write_report('reports/institution_dataset_report.html', 'Institution Dataset Report', keys=['Institution', 'Count', 'Datasets'], rows=institution_report)
    write_report('reports/country_dataset_report.html', 'Country Dataset Report', keys=['Country', 'Count', 'Datasets'], rows=country_report)
+
def parse_institutions(data, institution_lookup, country_lookup):
    """Accumulate one dataset's citations into the institution and country lookups.

    Args:
        data: citation JSON for a single dataset; reads data['paper']['key']
            and, for every citation, each address's 'name' and 'country'.
        institution_lookup: dict mapping institution name -> list of dataset
            keys; mutated in place.
        country_lookup: dict mapping country -> list of dataset keys;
            mutated in place.

    The dataset key is appended at most once per institution and per country,
    even when it appears in several citations/addresses.
    """
    key = data['paper']['key']
    for citation in data['citations']:
        for address in citation['addresses']:
            name = address['name']
            country = address['country']
            # setdefault replaces the manual "if missing, create list" checks.
            institution_keys = institution_lookup.setdefault(name, [])
            if key not in institution_keys:
                institution_keys.append(key)
            country_keys = country_lookup.setdefault(country, [])
            if key not in country_keys:
                country_keys.append(key)
+
def load_megapixels_lookup():
    """Return {paper_key: record} for verified rows of the citation sheet.

    Fetches the 'citation_lookup' Google sheet and the 'datasets' lookup,
    keeps only rows that have a paper_id and are flagged verified, gathers
    every paper_id under its paper key in 'paper_ids', and attaches the
    matching dataset record under 'dataset' (an empty dict when missing).
    """
    keys, rows = fetch_google_sheet('citation_lookup')
    dataset_lookup = fetch_google_lookup('datasets')
    lookup = {}
    for row in rows:
        rec = {column: row[position] for position, column in enumerate(keys)}
        # Keep only rows with a paper id that are marked verified; the sheet
        # may deliver the flag as int 1 or string '1'.
        if rec['paper_id'] == "" or rec['verified'] not in (1, '1'):
            continue
        paper_key = rec['key']
        if paper_key not in lookup:
            # First row for this key becomes the record; later rows only
            # contribute their paper_id.
            rec['paper_ids'] = []
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
        if paper_key in dataset_lookup:
            lookup[paper_key]['dataset'] = dataset_lookup[paper_key]
        else:
            print("not in datasets lookup:", paper_key)
            lookup[paper_key]['dataset'] = {}
    return lookup
+
+
# Script entry point: click parses CLI arguments and runs the report command.
if __name__ == '__main__':
    s2_dataset_country_report()