# Provenance: added as scraper/institution-dataset-counts.py in commit
# 494dfd56be7e5e3b2d0c661947870b130f6774d0 (Jules Laplace,
# Mon, 6 May 2019 18:02:46 +0200), subject "institution-dataset-counts.py".
# Recovered from a cgit v1.2.3-70-g09d2 patch view.
"""Report which paper keys are cited from each institution and country.

For every verified entry in the ``citation_lookup`` sheet, the matching
citation file in ``DIR_FINAL_CITATIONS`` is read and the ``name`` and
``country`` of every citation address are grouped by paper key.  Two
reports are then written under ``reports/``.
"""
import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess
from util import *

DIR_FINAL_CITATIONS = "../site/datasets/final"
DIR_VERIFIED_CITATIONS = "../site/datasets/verified"


@click.command()
def s2_dataset_country_report():
    """Build and write the institution and country dataset reports.

    Iterates over the keys returned by ``load_megapixels_lookup()``, reads
    ``<DIR_FINAL_CITATIONS>/<key>.json`` when that file exists (keys without
    a file are skipped), and aggregates the addresses found in it.  Both
    report tables are built before either one is written.
    """
    citation_dir = DIR_FINAL_CITATIONS
    megapixels = load_megapixels_lookup()
    institution_lookup = {}
    country_lookup = {}
    for key in megapixels:
        fn = os.path.join(citation_dir, key + '.json')
        if not os.path.exists(fn):
            continue
        data = read_json(fn)
        parse_institutions(data, institution_lookup, country_lookup)

    institution_report = _build_report_rows(institution_lookup)
    country_report = _build_report_rows(country_lookup)

    # NOTE(review): "instutition" is misspelled, but the path is kept exactly
    # as it was because renaming the output file may break whatever reads it.
    write_report(
        'reports/instutition_dataset_report.html',
        'Institution Dataset Report',
        keys=['Institution', 'Count', 'Datasets'],
        rows=institution_report,
    )
    write_report(
        'reports/country_dataset_report.html',
        'Country Dataset Report',
        keys=['Country', 'Count', 'Datasets'],
        rows=country_report,
    )


def _build_report_rows(lookup):
    """Turn a ``{group: [paper_key, ...]}`` dict into sorted report rows.

    Returns a list of ``(group, count, "key1, key2, ...")`` tuples ordered by
    descending count.  The sort is stable, so groups with equal counts keep
    the order in which they were first inserted into ``lookup``.
    """
    ranked = sorted(lookup, key=lambda group: len(lookup[group]), reverse=True)
    return [
        (group, len(lookup[group]), ', '.join(lookup[group]))
        for group in ranked
    ]


def _add_unique(lookup, group, paper_key):
    """Append ``paper_key`` to ``lookup[group]`` unless it is already there."""
    members = lookup.setdefault(group, [])
    if paper_key not in members:
        members.append(paper_key)


def parse_institutions(data, institution_lookup, country_lookup):
    """Record the paper key of ``data`` under each address name and country.

    ``data`` must provide ``data['paper']['key']`` and a ``data['citations']``
    sequence whose items each carry an ``addresses`` list of dicts with
    ``name`` and ``country`` entries; a missing key raises ``KeyError``.
    Both lookup dicts are updated in place and map a name (or country) to the
    list of distinct paper keys seen for it, in first-seen order.
    """
    key = data['paper']['key']
    for citation in data['citations']:
        for address in citation['addresses']:
            # Read both fields before mutating anything so a malformed
            # address never leaves the two lookups out of step.
            name = address['name']
            country = address['country']
            _add_unique(institution_lookup, name, key)
            _add_unique(country_lookup, country, key)


def load_megapixels_lookup():
    """Return verified ``citation_lookup`` rows grouped by their ``key``.

    Rows with an empty ``paper_id`` or whose ``verified`` value is neither
    ``1`` nor ``'1'`` are ignored.  The first accepted row for a key becomes
    that key's record; it additionally gets a ``paper_ids`` list holding the
    ``paper_id`` of every accepted row with the same key, and a ``dataset``
    entry taken from the ``datasets`` lookup (an empty dict, plus a printed
    notice, when the key is absent there).

    ``fetch_google_sheet`` and ``fetch_google_lookup`` are not defined in
    this file; they are expected to come from ``util``.
    """
    keys, rows = fetch_google_sheet('citation_lookup')
    dataset_lookup = fetch_google_lookup('datasets')
    lookup = {}
    for row in rows:
        rec = {column: row[index] for index, column in enumerate(keys)}
        if rec['paper_id'] == "" or rec['verified'] not in (1, '1'):
            continue
        paper_key = rec['key']
        if paper_key not in lookup:
            rec['paper_ids'] = []
            # Resolve the dataset once per key; repeating it for every row
            # only re-assigned the same value and repeated the notice.
            if paper_key in dataset_lookup:
                rec['dataset'] = dataset_lookup[paper_key]
            else:
                print("not in datasets lookup:", paper_key)
                rec['dataset'] = {}
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
    return lookup


if __name__ == '__main__':
    s2_dataset_country_report()