import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess
from util import *
# Directory of finalized per-dataset citation JSON files (one file per dataset key).
DIR_FINAL_CITATIONS = "../site/datasets/final"
# Directory of human-verified citations.
# NOTE(review): not referenced anywhere in this file — confirm it is used elsewhere.
DIR_VERIFIED_CITATIONS = "../site/datasets/verified"
@click.command()
def s2_dataset_country_report():
    """Generate HTML reports of dataset counts per institution and per country.

    Reads each final citation JSON file named by the megapixels lookup,
    accumulates institution -> dataset-keys and country -> dataset-keys
    mappings, and writes two HTML report files under ``reports/``.
    """
    dataset_meta = load_megapixels_lookup()
    institution_lookup = {}
    country_lookup = {}
    for dataset_key in dataset_meta:
        path = os.path.join(DIR_FINAL_CITATIONS, dataset_key + '.json')
        if os.path.exists(path):
            parse_institutions(read_json(path), institution_lookup, country_lookup)

    def build_rows(lookup):
        # Names sorted by descending dataset count, flattened to
        # (name, count, comma-joined dataset keys) rows.
        ordered = sorted(lookup, key=lambda name: len(lookup[name]), reverse=True)
        return [(name, len(lookup[name]), ', '.join(lookup[name])) for name in ordered]

    # NOTE(review): "instutition" typo is preserved in the output path —
    # confirm no downstream link depends on it before renaming.
    write_report('reports/instutition_dataset_report.html',
                 'Institution Dataset Report',
                 keys=['Institution', 'Count', 'Datasets'],
                 rows=build_rows(institution_lookup))
    write_report('reports/country_dataset_report.html',
                 'Country Dataset Report',
                 keys=['Country', 'Count', 'Datasets'],
                 rows=build_rows(country_lookup))
def parse_institutions(data, institution_lookup, country_lookup):
    """Accumulate this paper's dataset key under each citing institution/country.

    Args:
        data: Parsed citation JSON. Must contain ``data['paper']['key']`` and
            ``data['citations']``, where each citation carries an ``addresses``
            list of dicts with ``name`` and ``country`` entries.
        institution_lookup: Mutated in place; maps institution name to a
            deduplicated, insertion-ordered list of dataset keys.
        country_lookup: Mutated in place; maps country to a deduplicated,
            insertion-ordered list of dataset keys.

    Returns:
        None. Results are accumulated in the two lookup dicts.
    """
    key = data['paper']['key']
    for citation in data['citations']:
        for address in citation['addresses']:
            # setdefault replaces the manual "if missing, insert empty list" dance.
            datasets = institution_lookup.setdefault(address['name'], [])
            if key not in datasets:
                datasets.append(key)
            datasets = country_lookup.setdefault(address['country'], [])
            if key not in datasets:
                datasets.append(key)
def load_megapixels_lookup():
    """Build a lookup of verified papers keyed by dataset key.

    Pulls the ``citation_lookup`` Google sheet and the ``datasets`` lookup,
    keeps only rows that have a paper id and are marked verified, and groups
    their paper ids under each dataset key. Each entry also gets a
    ``dataset`` field from the datasets lookup (empty dict when missing,
    with a console warning).

    Returns:
        dict mapping dataset key -> record dict with ``paper_ids`` list
        and ``dataset`` metadata.
    """
    sheet_keys, sheet_rows = fetch_google_sheet('citation_lookup')
    dataset_lookup = fetch_google_lookup('datasets')
    lookup = {}
    for sheet_row in sheet_rows:
        record = {col: sheet_row[pos] for pos, col in enumerate(sheet_keys)}
        if record['paper_id'] == "":
            continue
        # The sheet may deliver the verified flag as int or string.
        if record['verified'] not in (1, '1'):
            continue
        paper_key = record['key']
        entry = lookup.get(paper_key)
        if entry is None:
            record['paper_ids'] = []
            entry = lookup[paper_key] = record
        entry['paper_ids'].append(record['paper_id'])
        if paper_key in dataset_lookup:
            entry['dataset'] = dataset_lookup[paper_key]
        else:
            print("not in datasets lookup:", paper_key)
            entry['dataset'] = {}
    return lookup
if __name__ == '__main__':
    # Click parses the (empty) CLI arguments and invokes the command.
    s2_dataset_country_report()