summaryrefslogtreecommitdiff
path: root/scraper/s2-final-report.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/s2-final-report.py')
-rw-r--r--  scraper/s2-final-report.py  32
1 file changed, 25 insertions, 7 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 4b74750a..ec4ad25b 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -8,7 +8,8 @@ import click
import subprocess
from util import *
-DIR_PUBLIC_CITATIONS = "../site/datasets/final"
+DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
+DIR_FINAL_CITATIONS = "../site/datasets/final"
DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
addresses = AddressBook()
@@ -19,9 +20,14 @@ def s2_final_report():
megapixels = load_megapixels_lookup()
items = []
for key, item in megapixels.items():
- if item['dataset']['ft_share'] == '1':
+ if 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y':
items.append((item,))
parallelize(process_paper, items)
+ subprocess.call([
+ "s3cmd", "put", "-P", "--recursive",
+ DIR_PUBLIC_CITATIONS + '/',
+ "s3://megapixels/v1/citations/",
+ ])
def process_paper(row):
aggregate_citations = {}
@@ -38,12 +44,12 @@ def process_paper(row):
address_list.append(res['address'])
if not len(papers):
return
- with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+ paper = papers[0]
+ with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
json.dump({
- 'id': papers[0]['paper_id'],
+ 'id': paper['paper_id'],
'dataset': row['dataset'],
- 'statistics': row['statistics'],
- 'paper': papers[0],
+ 'paper': paper,
'addresses': address_list,
'additional_papers': papers[1:],
'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
@@ -53,6 +59,18 @@ def process_paper(row):
'id': papers[0]['paper_id'],
'citations': [unknown_citations[key] for key in unknown_citations.keys()],
}, f)
+ with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+ json.dump({
+ 'id': paper['paper_id'],
+ 'paper': {
+ 'key': row['key'],
+ 'name': row['name'],
+ 'title': paper['title'],
+ 'year': paper['year'],
+ },
+ 'address': address_list[0] if len(address_list) else {},
+ 'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
+ }, f)
def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
res = {
@@ -184,7 +202,7 @@ def load_megapixels_lookup():
lookup[paper_key]['dataset'] = dataset_lookup[paper_key]
else:
print("not in datasets lookup:", paper_key)
- # recs.append(rec)
+ lookup[paper_key]['dataset'] = {}
return lookup
if __name__ == '__main__':