diff options
Diffstat (limited to 'scraper/s2-final-report.py')
| -rw-r--r-- | scraper/s2-final-report.py | 32 |
1 files changed, 25 insertions, 7 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 4b74750a..ec4ad25b 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -8,7 +8,8 @@ import click
 import subprocess
 from util import *
 
-DIR_PUBLIC_CITATIONS = "../site/datasets/final"
+DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
+DIR_FINAL_CITATIONS = "../site/datasets/final"
 DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
 
 addresses = AddressBook()
@@ -19,9 +20,14 @@ def s2_final_report():
   megapixels = load_megapixels_lookup()
   items = []
   for key, item in megapixels.items():
-    if item['dataset']['ft_share'] == '1':
+    if 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y':
       items.append((item,))
   parallelize(process_paper, items)
+  subprocess.call([
+    "s3cmd", "put", "-P", "--recursive",
+    DIR_PUBLIC_CITATIONS + '/',
+    "s3://megapixels/v1/citations/",
+  ])
 
 def process_paper(row):
   aggregate_citations = {}
@@ -38,12 +44,12 @@ def process_paper(row):
       address_list.append(res['address'])
   if not len(papers):
     return
-  with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+  paper = papers[0]
+  with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
     json.dump({
-      'id': papers[0]['paper_id'],
+      'id': paper['paper_id'],
       'dataset': row['dataset'],
-      'statistics': row['statistics'],
-      'paper': papers[0],
+      'paper': paper,
       'addresses': address_list,
       'additional_papers': papers[1:],
       'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
@@ -53,6 +59,18 @@ def process_paper(row):
       'id': papers[0]['paper_id'],
       'citations': [unknown_citations[key] for key in unknown_citations.keys()],
     }, f)
+  with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+    json.dump({
+      'id': paper['paper_id'],
+      'paper': {
+        'key': row['key'],
+        'name': row['name'],
+        'title': paper['title'],
+        'year': paper['year'],
+      },
+      'address': address_list[0] if len(address_list) else {},
+      'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
+    }, f)
 
 def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
   res = {
@@ -184,7 +202,7 @@ def load_megapixels_lookup():
       lookup[paper_key]['dataset'] = dataset_lookup[paper_key]
     else:
       print("not in datasets lookup:", paper_key)
-      # recs.append(rec)
+      lookup[paper_key]['dataset'] = {}
   return lookup
 
 if __name__ == '__main__':
