author     jules@lens <julescarbon@gmail.com>  2019-05-28 14:01:49 +0200
committer  jules@lens <julescarbon@gmail.com>  2019-05-28 14:01:49 +0200
commit     ea5dd6e066b5a8faf7a6e5e766452001dad44514 (patch)
tree       8b3e8e2ffadd5c0ec0f711512f8a5f47ec2701cf /scraper/s2-final-report.py
parent     afbfc3f6f527ffabc6515a72c6142cdb59d9a588 (diff)
show all verified papers even if s2 deletes the connection
Diffstat (limited to 'scraper/s2-final-report.py')
-rw-r--r--  scraper/s2-final-report.py | 19
1 file changed, 12 insertions(+), 7 deletions(-)
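
In short, this commit adds a second, synthetic pass over each paper: besides the 'search' results fetched from the Semantic Scholar (S2) API, process_paper now also feeds process_single_paper a 'verified' payload built directly from the keys of verified_lookup, so a manually verified citation still appears in the final report even if S2 later drops the connection. (The "if key != 'brainwash'" guard in the first hunk limits the run to a single dataset, presumably a temporary debug filter.) A minimal sketch of the synthetic payload, assuming verified_lookup is a dict keyed by S2 paper ids (which is how the diff uses it):

def build_verified_payload(verified_lookup):
    # Mirror the shape of an S2 citations response,
    # {'citations': [{'paperId': ...}, ...]}, but build it from the
    # locally verified ids instead of the (possibly stale) S2 API.
    return {'citations': [{'paperId': paper_id} for paper_id in verified_lookup.keys()]}

The helper name build_verified_payload is illustrative only; in the diff the equivalent dict is built inline inside process_single_paper.
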
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index dc65a3a3..ddee18c7 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -22,6 +22,8 @@ def s2_final_report():
     verified_lookup = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
+        if key != 'brainwash':
+            continue
         ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
         nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
         if ft_share or nyt_share:
@@ -43,11 +45,11 @@ def s2_final_report():
     #   DIR_PUBLIC_CITATIONS + '/',
     #   "s3://megapixels/v1/citations/",
     # ])
-    subprocess.call([
-      "s3cmd", "put", "-P", "--recursive",
-      DIR_VERIFIED_CITATIONS + '/',
-      "s3://megapixels/v1/citations/verified/",
-    ])
+    #subprocess.call([
+    #  "s3cmd", "put", "-P", "--recursive",
+    #  DIR_VERIFIED_CITATIONS + '/',
+    #  "s3://megapixels/v1/citations/verified/",
+    #])
 
 def process_paper(row, verified_lookup):
     aggregate_citations = {}
@@ -63,6 +65,7 @@ def process_paper(row, verified_lookup):
         if res['address']:
             address_list.append(res['address'])
     process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+    process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
     if not len(papers):
         return
     paper = papers[0]
@@ -114,7 +117,7 @@ def process_paper(row, verified_lookup):
             'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
         }, f)
 
-def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
+def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations, verified_citations=[]):
     res = {
         'paper_id': '',
         'key': '',
@@ -131,7 +134,9 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
         # 'citations_doi': 0,
     }
 
-    if paper_id == 'search':
+    if paper_id == 'verified':
+        data = { 'citations': [ { 'paperId': paperId } for paperId in verified_citations ] }
+    elif paper_id == 'search':
         dataset = row['key']
         fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
         if not os.path.exists(fn):
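
One side note on the new signature: verified_citations=[] is a mutable default argument, a classic Python pitfall if the list is ever mutated inside the function (the default list is created once and shared across calls). The diff never mutates it, so the code is correct as written, but a defensive sketch of the same signature would be:

def process_single_paper(row, paper_id, addresses, aggregate_citations,
                         unknown_citations, verified_citations=None):
    # None sentinel: each call gets its own fresh list, even if a
    # later change starts mutating verified_citations.
    if verified_citations is None:
        verified_citations = []
    if paper_id == 'verified':
        # Same synthetic payload the last hunk builds.
        data = {'citations': [{'paperId': pid} for pid in verified_citations]}
    # ... remaining branches ('search', loading cached JSON, etc.) as in the real file

This is a sketch against the signature shown above, not part of the commit.
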