| field | value | date |
|---|---|---|
| author | Jules Laplace <julescarbon@gmail.com> | 2019-03-28 17:25:28 +0100 |
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-03-28 17:25:28 +0100 |
| commit | fd4faf7fb94e7b5cbcf5e232d1fd08822e8825bb (patch) | |
| tree | 060e7ff9b25402e90eb3cab078193155e60b1fcf /scraper/s2-final-report.py | |
| parent | 7347fb5a2a8b966b9dce79d97a5d2bdf3c6557d1 (diff) | |
build verified citations report
Diffstat (limited to 'scraper/s2-final-report.py')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | scraper/s2-final-report.py | 22 |

1 file changed, 20 insertions, 2 deletions
```diff
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 3673f516..febbbafd 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -11,6 +11,7 @@ from util import *
 DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
 DIR_FINAL_CITATIONS = "../site/datasets/final"
 DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
+DIR_VERIFIED_CITATIONS = "../site/datasets/verified"
 
 addresses = AddressBook()
 paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_id')
@@ -18,10 +19,15 @@ paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_i
 @click.command()
 def s2_final_report():
     megapixels = load_megapixels_lookup()
+    verified_lookup = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
         if 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y':
-            items.append((item,))
+            if key in verified_lookup:
+                lookup = verified_lookup[key]
+            else:
+                lookup = {}
+            items.append((item, lookup,))
     parallelize(process_paper, items)
 # key  name_short  name_full  purpose  url
 # wild  indoor  outdoor  campus  cyberspace  parent
@@ -36,7 +42,7 @@ def s2_final_report():
         "s3://megapixels/v1/citations/",
     ])
 
-def process_paper(row):
+def process_paper(row, verified_lookup):
     aggregate_citations = {}
     unknown_citations = {}
     address = None
@@ -78,6 +84,18 @@ def process_paper(row):
             'address': address_list[0] if len(address_list) else {},
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+    with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
+        json.dump({
+            'id': paper['paper_id'],
+            'paper': {
+                'key': row['key'],
+                'name': row['name'],
+                'title': paper['title'],
+                'year': paper['year'],
+            },
+            'address': address_list[0] if len(address_list) else {},
+            'citations': [aggregate_citations[key] for key in verified_citations.keys()],
+        }, f)
 
 def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
     res = {
```
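The new `with open(...)` block mirrors the final-citations dump just above it, but is meant to restrict the citation list to verified entries. As committed, the comprehension iterates `verified_citations.keys()`, a name that is not defined anywhere in `process_paper`; the parameter the function actually receives is `verified_lookup`. A minimal sketch of the presumably intended filter, where `build_verified_citations` is a hypothetical helper name invented here for illustration:

```python
# Hypothetical helper, not part of the commit: sketches the filter the
# verified-citations json.dump block appears to intend. Assumes
# verified_lookup maps citation keys to verified metadata, as built by
# fetch_verified_paper_lookup() in s2_final_report.
def build_verified_citations(aggregate_citations, verified_lookup):
    # Keep only aggregated citations whose key is marked verified, and
    # guard against verified keys that never made it into the aggregate.
    return [
        aggregate_citations[key]
        for key in verified_lookup
        if key in aggregate_citations
    ]
```

Under that reading, the report's `'citations'` entry would be `build_verified_citations(aggregate_citations, verified_lookup)` rather than a comprehension over an undefined name.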
