author    Jules Laplace <julescarbon@gmail.com>    2019-02-25 17:07:47 +0100
committer Jules Laplace <julescarbon@gmail.com>    2019-02-25 17:07:47 +0100
commit    bc240ba2a4b5e30710d37af88eccd905209fc263 (patch)
tree      536281157e7a394edff02343926ae80c0d7daf90 /scraper/s2-final-report.py
parent    cd624bdcc5307713dca541f1be130450e86d62ea (diff)
update final report
Diffstat (limited to 'scraper/s2-final-report.py')
-rw-r--r--  scraper/s2-final-report.py  27
1 file changed, 23 insertions(+), 4 deletions(-)
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 197d5642..878640ac 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -8,7 +8,8 @@ import click
import subprocess
from util import *
-DIR_PUBLIC_CITATIONS = "../site/datasets/final"
+DIR_PUBLIC_CITATIONS = "../site/datasets/public"
+DIR_FINAL_CITATIONS = "../site/datasets/final"
DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
addresses = AddressBook()
@@ -22,6 +23,11 @@ def s2_final_report():
        if 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y':
            items.append((item,))
    parallelize(process_paper, items)
+    subprocess.call([
+        "s3cmd", "put", "-P", "--recursive",
+        DIR_PUBLIC_CITATIONS + '/',
+        "s3://megapixels/v1/citations/",
+    ])
def process_paper(row):
    aggregate_citations = {}
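The new block shells out to s3cmd once all papers have been processed, pushing the public citations directory to S3. As a minimal standalone sketch of the same step, assuming s3cmd is installed and configured with credentials (upload_public_citations is a hypothetical wrapper name; the diff itself uses subprocess.call, which ignores a non-zero exit status, whereas check_call raises):

import subprocess

DIR_PUBLIC_CITATIONS = "../site/datasets/public"

def upload_public_citations():
    # -P (--acl-public) makes the uploaded objects world-readable;
    # --recursive walks the directory tree.
    # check_call raises CalledProcessError on a non-zero exit code,
    # so a failed upload is not silently ignored.
    subprocess.check_call([
        "s3cmd", "put", "-P", "--recursive",
        DIR_PUBLIC_CITATIONS + '/',
        "s3://megapixels/v1/citations/",
    ])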
@@ -38,11 +44,12 @@ def process_paper(row):
            address_list.append(res['address'])
    if not len(papers):
        return
-    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+    paper = papers[0]
+    with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
        json.dump({
-            'id': papers[0]['paper_id'],
+            'id': paper['paper_id'],
            'dataset': row['dataset'],
-            'paper': papers[0],
+            'paper': paper,
            'addresses': address_list,
            'additional_papers': papers[1:],
            'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
@@ -52,6 +59,18 @@ def process_paper(row):
            'id': papers[0]['paper_id'],
            'citations': [unknown_citations[key] for key in unknown_citations.keys()],
        }, f)
+    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+        json.dump({
+            'id': paper['paper_id'],
+            'paper': {
+                'key': row['key'],
+                'name': row['name'],
+                'title': paper['title'],
+                'year': paper['year'],
+            },
+            'address': address_list[0],
+            'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
+        }, f)
def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
    res = {
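Net effect of the last two hunks: the full record (paper, additional_papers, addresses, dataset, citations) now lands in DIR_FINAL_CITATIONS, while DIR_PUBLIC_CITATIONS gets a trimmed payload carrying only the id, a four-field paper summary, the first address, and the aggregated citations. A minimal sketch of reading one of the trimmed public files back, assuming only the keys written by the json.dump call above (load_public_citation is a hypothetical helper, not part of the scraper):

import json

def load_public_citation(path):
    # Reads one trimmed public citation file as written by process_paper.
    with open(path) as f:
        data = json.load(f)
    # Keys mirror the public json.dump call in the diff:
    # 'id', 'paper' (key/name/title/year), 'address', 'citations'.
    print(data['paper']['title'], data['paper']['year'])
    print('citations:', len(data['citations']))
    return data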