author    adamhrv <adam@ahprojects.com>  2019-05-29 15:25:02 +0200
committer adamhrv <adam@ahprojects.com>  2019-05-29 15:25:02 +0200
commit    ca0d3ed1a451ce65960ff2e0f44fd5a9008eeaf4 (patch)
tree      345f0ee4a2fd7a917d7d604290fa2bda51225c68 /scraper/s2-final-report.py
parent    5c21bdb664649c62ebbed29448a7c653ab32ddb0 (diff)
parent    2963cd2ec73860e3bf3a5e4d469b4e573ce4817c (diff)
Merge branch 'master' of github.com:adamhrv/megapixels_dev
Diffstat (limited to 'scraper/s2-final-report.py')
-rw-r--r--  scraper/s2-final-report.py  32
1 file changed, 24 insertions(+), 8 deletions(-)
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index dc65a3a3..16d70f12 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -19,17 +19,21 @@ paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_i
 @click.command()
 def s2_final_report():
     megapixels = load_megapixels_lookup()
-    verified_lookup = fetch_verified_paper_lookup()
+    verified_lookup, verified_totals = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
+        #if key != 'brainwash':
+        #    continue
         ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
         nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
         if ft_share or nyt_share:
             if key in verified_lookup:
                 lookup = verified_lookup[key]
+                totals = verified_totals[key]
             else:
                 lookup = {}
-            items.append((item, lookup,))
+                totals = {}
+            items.append((item, lookup, totals,))
     parallelize(process_paper, items)
     # key name_short name_full purpose url
     # wild indoor outdoor campus cyberspace parent
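
Note: the first hunk changes the contract of fetch_verified_paper_lookup() from one return value to two. As a point of reference, a minimal sketch of the shape the caller above now expects; the function name, field names, and example ids here are hypothetical, inferred from how lookup and totals are consumed in this diff, not taken from the project:

    def fetch_verified_paper_lookup_sketch():
        # Per-dataset map of verified citations, keyed by dataset key, then S2 paper id.
        verified_lookup = {
            'brainwash': {
                'aaaa0000aaaa0000aaaa0000aaaa0000aaaa0000': {'status': 'verified'},
            },
        }
        # Per-dataset vetting tallies, keyed by the same dataset key.
        verified_totals = {
            'brainwash': {'verified': 1, 'unverified': 0},
        }
        return verified_lookup, verified_totals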
@@ -49,7 +53,7 @@ def s2_final_report():
         "s3://megapixels/v1/citations/verified/",
     ])
 
-def process_paper(row, verified_lookup):
+def process_paper(row, verified_lookup, verified_totals):
     aggregate_citations = {}
     unknown_citations = {}
     address = None
@@ -62,10 +66,19 @@ def process_paper(row, verified_lookup):
         papers.append(res)
         if res['address']:
             address_list.append(res['address'])
-    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+
     if not len(papers):
         return
     paper = papers[0]
+    print('>> {} {}'.format(paper['paper_id'], row['key']))
+
+    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+
+    for paper_id in verified_lookup.keys():
+        if paper_id not in aggregate_citations:
+            print('S2 API missing verified citation: {}'.format(paper_id))
+
+    process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
 
     # final citations - a report of all geocoded citations
     with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
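
Note: parallelize(process_paper, items) has to fan each (item, lookup, totals) tuple out to the three parameters of process_paper. The project's helper isn't shown in this diff; below is a minimal stand-in that would satisfy that calling convention, assuming tuples are star-unpacked across a process pool:

    from multiprocessing import Pool

    def parallelize_sketch(fn, items, processes=4):
        # Star-unpack each (item, lookup, totals) tuple into
        # fn(row, verified_lookup, verified_totals).
        with Pool(processes=processes) as pool:
            pool.starmap(fn, items)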
@@ -110,11 +123,12 @@ def process_paper(row, verified_lookup):
                 'title': paper['title'],
                 'year': paper['year'],
                 'addresses': address_list,
+                'vetting': verified_totals,
             },
             'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
         }, f)
 
-def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
+def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations, verified_citations=[]):
     res = {
         'paper_id': '',
         'key': '',
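
Note: with the 'vetting' field added above, each final report JSON bundles the paper metadata, the vetting totals for that dataset, and only the citations that were verified. A hypothetical example of the resulting structure (all values are placeholders, not project data):

    report = {
        'paper': {
            'title': 'Example Dataset Paper',
            'year': 2015,
            'addresses': ['Example University, Example City'],
            'vetting': {'verified': 1, 'unverified': 0},  # verified_totals for this dataset
        },
        'citations': [
            {'paperId': 'aaaa0000aaaa0000aaaa0000aaaa0000aaaa0000'},
        ],
    }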
@@ -131,7 +145,9 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # 'citations_doi': 0,
     }
 
-    if paper_id == 'search':
+    if paper_id == 'verified':
+        data = { 'citations': [ { 'paperId': paperId } for paperId in verified_citations ] }
+    elif paper_id == 'search':
         dataset = row['key']
         fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
         if not os.path.exists(fn):
@@ -143,10 +159,10 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         fn = file_path('papers', paper_id, 'paper.json')
         with open(fn, 'r') as f:
             data = json.load(f)
 
-    print('>> {} {}'.format(data['paperId'], row['key']))
+    # print('>> {} {}'.format(data['paperId'], row['key']))
     paper = load_paper(data['paperId'])
     if paper is None:
-        print("Paper missing! {}".format(data['paperId']))
+        print(">> Paper missing! {}".format(data['paperId']))
         return
     res['key'] = row['key']
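
Note: the net effect of the new 'verified' branch is that hand-verified paper ids are first checked against what the S2 search pass aggregated, then replayed through the same pipeline as a synthetic citations payload. A self-contained illustration of that flow, with placeholder ids and record shapes assumed for the example:

    verified_lookup = {
        'aaaa0000aaaa0000aaaa0000aaaa0000aaaa0000': {'status': 'verified'},
        'bbbb1111bbbb1111bbbb1111bbbb1111bbbb1111': {'status': 'verified'},
    }
    aggregate_citations = {'aaaa0000aaaa0000aaaa0000aaaa0000aaaa0000': {'source': 'search'}}

    # Flag verified ids the S2 search pass never surfaced.
    for paper_id in verified_lookup.keys():
        if paper_id not in aggregate_citations:
            print('S2 API missing verified citation: {}'.format(paper_id))

    # Synthesize the payload that process_single_paper consumes in 'verified' mode,
    # so verified ids flow through the same aggregation path as real S2 responses.
    data = {'citations': [{'paperId': pid} for pid in verified_lookup.keys()]}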