"""Report which institutions and countries appear in each dataset's citations.

For every verified entry of the ``citation_lookup`` sheet, the matching
citation JSON file under ``DIR_FINAL_CITATIONS`` is read and the addresses
of its citations are grouped by institution name and by country. The two
groupings are then handed to ``write_report``.
"""

import glob
import math
import operator
import os
import re
import subprocess

import click
import simplejson as json

# Kept last so that names exported by util keep overriding earlier imports.
from util import *

DIR_FINAL_CITATIONS = "../site/datasets/final"
DIR_VERIFIED_CITATIONS = "../site/datasets/verified"


# Command entry point: collects institution/country -> dataset-key groupings
# from every citation file that exists on disk and writes both reports.
# Documented with comments rather than a docstring because click would
# surface a docstring as the command's --help text.
@click.command()
def s2_dataset_country_report():
    institution_lookup = {}
    country_lookup = {}

    for dataset_key in load_megapixels_lookup():
        path = os.path.join(DIR_FINAL_CITATIONS, dataset_key + '.json')
        # Datasets without a citation file are silently skipped.
        if os.path.exists(path):
            parse_institutions(read_json(path), institution_lookup, country_lookup)

    institution_report = _report_rows(institution_lookup)
    country_report = _report_rows(country_lookup)

    # NOTE(review): 'instutition' looks like a typo, but the path is kept
    # as-is since other files may link to it -- confirm before renaming.
    write_report(
        'reports/instutition_dataset_report.html',
        'Institution Dataset Report',
        keys=['Institution', 'Count', 'Datasets'],
        rows=institution_report,
    )
    write_report(
        'reports/country_dataset_report.html',
        'Country Dataset Report',
        keys=['Country', 'Count', 'Datasets'],
        rows=country_report,
    )


def _report_rows(lookup):
    """Turn a ``{label: [dataset_key, ...]}`` mapping into report rows.

    Rows are ``(label, count, comma-joined dataset keys)`` tuples ordered by
    descending count; labels with equal counts keep their insertion order.
    """
    ranked = sorted(lookup.items(), key=lambda pair: len(pair[1]), reverse=True)
    return [
        (label, len(dataset_keys), ', '.join(dataset_keys))
        for label, dataset_keys in ranked
    ]


def parse_institutions(data, institution_lookup, country_lookup):
    """Record the paper key of ``data`` under each cited institution/country.

    Args:
        data: parsed citation file; ``data['paper']['key']`` and
            ``data['citations'][i]['addresses'][j]`` with ``'name'`` and
            ``'country'`` entries are read.
        institution_lookup: dict mutated in place, institution name -> list
            of paper keys (each key listed at most once).
        country_lookup: dict mutated in place, country -> list of paper keys
            (each key listed at most once).
    """
    paper_key = data['paper']['key']
    for citation in data['citations']:
        for address in citation['addresses']:
            # Both fields are read before either lookup is touched.
            targets = (
                (institution_lookup, address['name']),
                (country_lookup, address['country']),
            )
            for lookup, label in targets:
                seen = lookup.setdefault(label, [])
                if paper_key not in seen:
                    seen.append(paper_key)


def load_megapixels_lookup():
    """Build a lookup of verified papers from the ``citation_lookup`` sheet.

    Rows with an empty ``paper_id`` or whose ``verified`` value is neither
    ``1`` nor ``'1'`` are ignored. The first accepted row for a given ``key``
    becomes that key's record; every accepted row appends its ``paper_id`` to
    the record's ``paper_ids`` list. Each record also gets a ``dataset`` entry
    taken from the ``datasets`` lookup, or ``{}`` (with a printed notice) when
    the key is absent there.

    Returns:
        dict mapping the sheet's ``key`` column to its record dict.
    """
    keys, rows = fetch_google_sheet('citation_lookup')
    dataset_lookup = fetch_google_lookup('datasets')

    lookup = {}
    for row in rows:
        rec = {column: row[position] for position, column in enumerate(keys)}

        if rec['paper_id'] == "":
            continue
        if rec['verified'] != 1 and rec['verified'] != '1':
            continue

        paper_key = rec['key']
        if paper_key not in lookup:
            rec['paper_ids'] = []
            lookup[paper_key] = rec

        entry = lookup[paper_key]
        entry['paper_ids'].append(rec['paper_id'])

        if paper_key in dataset_lookup:
            entry['dataset'] = dataset_lookup[paper_key]
        else:
            print("not in datasets lookup:", paper_key)
            entry['dataset'] = {}

    return lookup


if __name__ == '__main__':
    s2_dataset_country_report()