| field | value | date |
|---|---|---|
| author | Adam Harvey <adam@ahprojects.com> | 2019-05-23 18:37:06 +0200 |
| committer | Adam Harvey <adam@ahprojects.com> | 2019-05-23 18:37:06 +0200 |
| commit | b2b2c7d7816baa7d6de36c1de3576a31aa92a209 | |
| tree | 9105ef39a3bfcd78e9cf4b8c183ee21e7149bf66 | |
| parent | 4559cf6cccfb6f6d8b8e59e95984044fdf5a5610 | |
| parent | 84b286e1bd85feba12174a2a480d2be404e7b9c5 | |
merge
Diffstat (limited to 'scraper/institution-dataset-counts.py')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | scraper/institution-dataset-counts.py | 80 |
1 file changed, 80 insertions, 0 deletions
diff --git a/scraper/institution-dataset-counts.py b/scraper/institution-dataset-counts.py
new file mode 100644
index 00000000..75692777
--- /dev/null
+++ b/scraper/institution-dataset-counts.py
@@ -0,0 +1,80 @@
+import os
+import re
+import glob
+import simplejson as json
+import math
+import operator
+import click
+import subprocess
+from util import *
+DIR_FINAL_CITATIONS = "../site/datasets/final"
+DIR_VERIFIED_CITATIONS = "../site/datasets/verified"
+
+@click.command()
+def s2_dataset_country_report():
+    citation_dir = DIR_FINAL_CITATIONS
+    megapixels = load_megapixels_lookup()
+    items = []
+    institution_lookup = {}
+    country_lookup = {}
+    for key, item in megapixels.items():
+        fn = os.path.join(citation_dir, key + '.json')
+        if not os.path.exists(fn):
+            continue
+        data = read_json(fn)
+        parse_institutions(data, institution_lookup, country_lookup)
+    sorted_institutions = sorted(institution_lookup.keys(), reverse=True, key=lambda x: len(institution_lookup[x]))
+    sorted_countries = sorted(country_lookup.keys(), reverse=True, key=lambda x: len(country_lookup[x]))
+
+    institution_report = [
+        (key, len(institution_lookup[key]), ', '.join(institution_lookup[key]),)
+        for key in sorted_institutions
+    ]
+    country_report = [
+        (key, len(country_lookup[key]), ', '.join(country_lookup[key]),)
+        for key in sorted_countries
+    ]
+    write_report('reports/instutition_dataset_report.html', 'Institution Dataset Report', keys=['Institution', 'Count', 'Datasets'], rows=institution_report)
+    write_report('reports/country_dataset_report.html', 'Country Dataset Report', keys=['Country', 'Count', 'Datasets'], rows=country_report)
+
+def parse_institutions(data, institution_lookup, country_lookup):
+    key = data['paper']['key']
+    for citation in data['citations']:
+        for address in citation['addresses']:
+            name = address['name']
+            country = address['country']
+            if name not in institution_lookup:
+                institution_lookup[name] = []
+            if key not in institution_lookup[name]:
+                institution_lookup[name].append(key)
+            if country not in country_lookup:
+                country_lookup[country] = []
+            if key not in country_lookup[country]:
+                country_lookup[country].append(key)
+
+def load_megapixels_lookup():
+    keys, rows = fetch_google_sheet('citation_lookup')
+    dataset_lookup = fetch_google_lookup('datasets')
+    lookup = {}
+    for row in rows:
+        rec = {}
+        for index, key in enumerate(keys):
+            rec[key] = row[index]
+        if rec['paper_id'] == "" or (rec['verified'] != 1 and rec['verified'] != '1'):
+            continue
+        paper_key = rec['key']
+        if paper_key not in lookup:
+            rec['paper_ids'] = []
+            lookup[paper_key] = rec
+        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
+        if paper_key in dataset_lookup:
+            lookup[paper_key]['dataset'] = dataset_lookup[paper_key]
+        else:
+            print("not in datasets lookup:", paper_key)
+            lookup[paper_key]['dataset'] = {}
+    return lookup
+
+
+if __name__ == '__main__':
+    s2_dataset_country_report()
+
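For readers skimming the new script: `parse_institutions()` walks each dataset's citation JSON and groups dataset keys by citing institution and by country, and the two `write_report()` calls render those groupings as HTML tables. The sketch below is not part of the commit; it only illustrates the input shape the function appears to assume. The field names are inferred from the code above, the sample values are placeholders, and `read_json`, `write_report`, `fetch_google_sheet`, and `fetch_google_lookup` are presumably supplied by the scraper's `util` module via the wildcard import.

```python
# Hypothetical example of one file under ../site/datasets/final/<key>.json,
# with field names inferred from parse_institutions() above. The real files
# may carry additional keys; "example-dataset", "Example University", and
# "Exampleland" are placeholder values, not data from the repository.
example = {
    "paper": {"key": "example-dataset"},
    "citations": [
        {
            "addresses": [
                {"name": "Example University", "country": "Exampleland"},
            ]
        }
    ],
}

institution_lookup = {}
country_lookup = {}
# parse_institutions(example, institution_lookup, country_lookup) would yield:
#   institution_lookup == {"Example University": ["example-dataset"]}
#   country_lookup     == {"Exampleland": ["example-dataset"]}
```

Because the report function is registered with `@click.command()` and wired to `__main__`, running `python institution-dataset-counts.py` from the scraper directory should generate both reports, assuming whatever Google Sheets access the `util` fetch helpers rely on is configured.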
