summaryrefslogtreecommitdiff
path: root/scraper/institution-dataset-counts.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/institution-dataset-counts.py')
-rw-r--r--scraper/institution-dataset-counts.py80
1 file changed, 80 insertions, 0 deletions
diff --git a/scraper/institution-dataset-counts.py b/scraper/institution-dataset-counts.py
new file mode 100644
index 00000000..75692777
--- /dev/null
+++ b/scraper/institution-dataset-counts.py
@@ -0,0 +1,80 @@
+import os
+import re
+import glob
+import simplejson as json
+import math
+import operator
+import click
+import subprocess
+from util import *
+DIR_FINAL_CITATIONS = "../site/datasets/final"
+DIR_VERIFIED_CITATIONS = "../site/datasets/verified"
+
@click.command()
def s2_dataset_country_report():
    """Generate HTML reports counting dataset citations per institution and per country.

    Reads each verified dataset's citation JSON from DIR_FINAL_CITATIONS,
    aggregates institution -> [dataset keys] and country -> [dataset keys]
    mappings, and writes two HTML report tables sorted by citation count
    (most-cited first).
    """
    citation_dir = DIR_FINAL_CITATIONS
    megapixels = load_megapixels_lookup()
    institution_lookup = {}
    country_lookup = {}
    for key, item in megapixels.items():
        fn = os.path.join(citation_dir, key + '.json')
        if not os.path.exists(fn):
            # Skip datasets that have no final citation file yet.
            continue
        data = read_json(fn)
        parse_institutions(data, institution_lookup, country_lookup)

    # Order keys by how many datasets cite them, descending.
    sorted_institutions = sorted(institution_lookup.keys(), reverse=True,
                                 key=lambda x: len(institution_lookup[x]))
    sorted_countries = sorted(country_lookup.keys(), reverse=True,
                              key=lambda x: len(country_lookup[x]))

    institution_report = [
        (key, len(institution_lookup[key]), ', '.join(institution_lookup[key]))
        for key in sorted_institutions
    ]
    country_report = [
        (key, len(country_lookup[key]), ', '.join(country_lookup[key]))
        for key in sorted_countries
    ]
    # BUG FIX: output filename was misspelled 'instutition_dataset_report.html'.
    write_report('reports/institution_dataset_report.html', 'Institution Dataset Report', keys=['Institution', 'Count', 'Datasets'], rows=institution_report)
    write_report('reports/country_dataset_report.html', 'Country Dataset Report', keys=['Country', 'Count', 'Datasets'], rows=country_report)
+
def parse_institutions(data, institution_lookup, country_lookup):
    """Accumulate one dataset's citations into the institution and country lookups.

    Args:
        data: citation JSON for a single dataset; reads data['paper']['key']
            and, for every citation, each address's 'name' and 'country'.
        institution_lookup: dict mapping institution name -> list of dataset
            keys; mutated in place.
        country_lookup: dict mapping country -> list of dataset keys;
            mutated in place.

    The dataset key is appended at most once per institution and per country,
    even when it appears in several citations/addresses.
    """
    key = data['paper']['key']
    for citation in data['citations']:
        for address in citation['addresses']:
            name = address['name']
            country = address['country']
            # setdefault replaces the manual "if missing, create list" checks.
            institution_keys = institution_lookup.setdefault(name, [])
            if key not in institution_keys:
                institution_keys.append(key)
            country_keys = country_lookup.setdefault(country, [])
            if key not in country_keys:
                country_keys.append(key)
+
def load_megapixels_lookup():
    """Return {paper_key: record} for verified rows of the citation sheet.

    Fetches the 'citation_lookup' Google sheet and the 'datasets' lookup,
    keeps only rows that have a paper_id and are flagged verified, gathers
    every paper_id under its paper key in 'paper_ids', and attaches the
    matching dataset record under 'dataset' (an empty dict when missing).
    """
    keys, rows = fetch_google_sheet('citation_lookup')
    dataset_lookup = fetch_google_lookup('datasets')
    lookup = {}
    for row in rows:
        rec = {column: row[position] for position, column in enumerate(keys)}
        # Keep only rows with a paper id that are marked verified; the sheet
        # may deliver the flag as int 1 or string '1'.
        if rec['paper_id'] == "" or rec['verified'] not in (1, '1'):
            continue
        paper_key = rec['key']
        if paper_key not in lookup:
            # First row for this key becomes the record; later rows only
            # contribute their paper_id.
            rec['paper_ids'] = []
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
        if paper_key in dataset_lookup:
            lookup[paper_key]['dataset'] = dataset_lookup[paper_key]
        else:
            print("not in datasets lookup:", paper_key)
            lookup[paper_key]['dataset'] = {}
    return lookup
+
+
# Script entry point: click parses CLI arguments and runs the report command.
if __name__ == '__main__':
    s2_dataset_country_report()