diff options
Diffstat (limited to 'scraper/s2-final-report.py')
| -rw-r--r-- | scraper/s2-final-report.py | 27 |
1 files changed, 23 insertions, 4 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py index 197d5642..878640ac 100644 --- a/scraper/s2-final-report.py +++ b/scraper/s2-final-report.py @@ -8,7 +8,8 @@ import click import subprocess from util import * -DIR_PUBLIC_CITATIONS = "../site/datasets/final" +DIR_PUBLIC_CITATIONS = "../site/datasets/public" +DIR_FINAL_CITATIONS = "../site/datasets/final" DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown" addresses = AddressBook() @@ -22,6 +23,11 @@ def s2_final_report(): if 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y': items.append((item,)) parallelize(process_paper, items) + subprocess.call([ + "s3cmd", "put", "-P", "--recursive", + DIR_PUBLIC_CITATIONS + '/', + "s3://megapixels/v1/citations/", + ]) def process_paper(row): aggregate_citations = {} @@ -38,11 +44,12 @@ def process_paper(row): address_list.append(res['address']) if not len(papers): return - with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f: + paper = papers[0] + with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f: json.dump({ - 'id': papers[0]['paper_id'], + 'id': paper['paper_id'], 'dataset': row['dataset'], - 'paper': papers[0], + 'paper': paper, 'addresses': address_list, 'additional_papers': papers[1:], 'citations': [aggregate_citations[key] for key in aggregate_citations.keys()], @@ -52,6 +59,18 @@ def process_paper(row): 'id': papers[0]['paper_id'], 'citations': [unknown_citations[key] for key in unknown_citations.keys()], }, f) + with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f: + json.dump({ + 'id': paper['paper_id'], + 'paper': { + 'key': row['key'], + 'name': row['name'], + 'title': paper['title'], + 'year': paper['year'], + }, + 'address': address_list[0], + 'citations': [aggregate_citations[key] for key in aggregate_citations.keys()], + }, f) def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations): res = { |
