| field | value | date |
|---|---|---|
| author | jules@lens <julescarbon@gmail.com> | 2019-05-28 14:01:49 +0200 |
| committer | jules@lens <julescarbon@gmail.com> | 2019-05-28 14:01:49 +0200 |
| commit | ea5dd6e066b5a8faf7a6e5e766452001dad44514 | |
| tree | 8b3e8e2ffadd5c0ec0f711512f8a5f47ec2701cf /scraper/s2-final-report.py | |
| parent | afbfc3f6f527ffabc6515a72c6142cdb59d9a588 | |
show all verified papers even if s2 deletes the connection
Diffstat (limited to 'scraper/s2-final-report.py')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | scraper/s2-final-report.py | 19 |

1 file changed, 12 insertions, 7 deletions
```diff
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index dc65a3a3..ddee18c7 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -22,6 +22,8 @@ def s2_final_report():
     verified_lookup = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
+        if key != 'brainwash':
+            continue
         ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
         nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
         if ft_share or nyt_share:
@@ -43,11 +45,11 @@ def s2_final_report():
     #     DIR_PUBLIC_CITATIONS + '/',
     #     "s3://megapixels/v1/citations/",
     # ])
-    subprocess.call([
-        "s3cmd", "put", "-P", "--recursive",
-        DIR_VERIFIED_CITATIONS + '/',
-        "s3://megapixels/v1/citations/verified/",
-    ])
+    #subprocess.call([
+    #    "s3cmd", "put", "-P", "--recursive",
+    #    DIR_VERIFIED_CITATIONS + '/',
+    #    "s3://megapixels/v1/citations/verified/",
+    #])
 
 def process_paper(row, verified_lookup):
     aggregate_citations = {}
@@ -63,6 +65,7 @@ def process_paper(row, verified_lookup):
         if res['address']:
             address_list.append(res['address'])
         process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+        process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
     if not len(papers):
         return
     paper = papers[0]
@@ -114,7 +117,7 @@ def process_paper(row, verified_lookup):
             'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
         }, f)
 
-def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
+def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations, verified_citations=[]):
     res = {
         'paper_id': '',
         'key': '',
@@ -131,7 +134,9 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
     #     'citations_doi': 0,
     # }
 
-    if paper_id == 'search':
+    if paper_id == 'verified':
+        data = { 'citations': [ { 'paperId': paperId } for paperId in verified_citations ] }
+    elif paper_id == 'search':
         dataset = row['key']
         fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
         if not os.path.exists(fn):
```
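The net effect of the new `verified` branch is easier to see in isolation. Below is a minimal, hypothetical sketch (the stand-in IDs and the `s2_citations` sample are invented; the real script reads Semantic Scholar JSON from disk and aggregates many more fields): the verified lookup keys are turned into citation stubs with the same shape as s2's own records, so a verified paper still appears in the final report even when s2 has deleted the connection.

```python
# Hypothetical stand-ins: in the real script these come from
# fetch_verified_paper_lookup() and the s2 search/paper JSON on disk.
verified_lookup = {
    'a1b2c3': {'title': 'Verified paper A'},
    'd4e5f6': {'title': 'Verified paper B'},
}

# What the s2 API currently returns; note it has dropped 'd4e5f6'.
s2_citations = [{'paperId': 'a1b2c3'}, {'paperId': 'zz9y8x'}]

# The 'verified' pass fabricates citation stubs from the verified IDs,
# mirroring the shape of s2's citation records ({'paperId': ...}).
data = {'citations': [{'paperId': pid} for pid in verified_lookup.keys()]}

# Merging both sources keeps every verified paper present even if s2
# no longer reports the citation link.
aggregate_citations = {}
for cite in s2_citations + data['citations']:
    aggregate_citations.setdefault(cite['paperId'], cite)

# Final report lists verified papers in lookup order, as in the diff's
# 'citations' list comprehension.
report = [aggregate_citations[key] for key in verified_lookup.keys()
          if key in aggregate_citations]
print([c['paperId'] for c in report])  # ['a1b2c3', 'd4e5f6']
```

Two details in the diff worth flagging: the new `verified_citations=[]` parameter is a mutable default argument, which is harmless here since the function only iterates over it, though `verified_citations=None` with an in-body fallback is the more defensive idiom; and the `if key != 'brainwash': continue` guard looks like a temporary filter restricting the run to a single dataset while testing.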
