diff options
Diffstat (limited to 'scraper/s2-final-report.py')
| -rw-r--r-- | scraper/s2-final-report.py | 44 |
1 files changed, 21 insertions, 23 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py index 58ac481f..283ca4fc 100644 --- a/scraper/s2-final-report.py +++ b/scraper/s2-final-report.py @@ -11,18 +11,18 @@ from util import * DIR_PUBLIC_CITATIONS = "../site/datasets/final" DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown" +addresses = AddressBook() + @click.command() def s2_final_report(): - addresses = AddressBook() megapixels = load_megapixels_lookup() - ft_lookup = load_ft_lookup() - for key, row in megapixels.items(): - print(key) - ft_share = ft_lookup[key] - if ft_share: - paper_data = process_paper(row, addresses) + items = [] + for key, item in megapixels.items(): + if item['dataset']['ft_share'] == '1': + items.append((item,)) + parallelize(process_paper, items) -def process_paper(row, addresses): +def process_paper(row): aggregate_citations = {} unknown_citations = {} address = None @@ -39,6 +39,8 @@ def process_paper(row, addresses): with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f: json.dump({ 'id': papers[0]['paper_id'], + 'dataset': row['dataset'], + 'statistics': row['statistics'], 'paper': papers[0], 'address': address, 'additional_papers': papers[1:], @@ -47,7 +49,6 @@ def process_paper(row, addresses): with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f: json.dump({ 'id': papers[0]['paper_id'], - 'paper': papers[0], 'citations': [unknown_citations[key] for key in unknown_citations.keys()], }, f) @@ -161,33 +162,30 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_ } return res -def load_ft_lookup(): - keys, rows = fetch_google_sheet('datasets') - lookup = {} - for row in rows: - rec = {} - for index, key in enumerate(keys): - rec[key] = row[index] - if rec['ft_share'] == '1' or rec['ft_share'] == 1: - lookup[rec['key']] = True - else: - lookup[rec['key']] = False - return lookup - def load_megapixels_lookup(): keys, rows = fetch_google_sheet('citation_lookup') + dataset_lookup = fetch_google_lookup('datasets') + statistics_lookup = fetch_google_lookup('statistics') lookup = {} for row in rows: rec = {} for index, key in enumerate(keys): rec[key] = row[index] - if rec['paper_id'] == "": + if rec['paper_id'] == "" or (rec['verified'] != 1 and rec['verified'] != '1'): continue paper_key = rec['key'] if paper_key not in lookup: rec['paper_ids'] = [] lookup[paper_key] = rec lookup[paper_key]['paper_ids'].append(rec['paper_id']) + if paper_key in dataset_lookup: + lookup[paper_key]['dataset'] = dataset_lookup[paper_key] + else: + print("not in datasets lookup:", paper_key) + if paper_key in statistics_lookup: + lookup[paper_key]['statistics'] = statistics_lookup[paper_key] + else: + print("not in statistics lookup:", paper_key) # recs.append(rec) return lookup |
