-rw-r--r--  scraper/s2-final-report.py | 39
-rw-r--r--  scraper/util.py            |  8
2 files changed, 32 insertions, 15 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index ddee18c7..16d70f12 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -19,19 +19,21 @@ paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_i
 @click.command()
 def s2_final_report():
     megapixels = load_megapixels_lookup()
-    verified_lookup = fetch_verified_paper_lookup()
+    verified_lookup, verified_totals = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
-        if key != 'brainwash':
-            continue
+        #if key != 'brainwash':
+        #    continue
         ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
         nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
         if ft_share or nyt_share:
             if key in verified_lookup:
                 lookup = verified_lookup[key]
+                totals = verified_totals[key]
             else:
                 lookup = {}
-            items.append((item, lookup,))
+                totals = {}
+            items.append((item, lookup, totals,))
     parallelize(process_paper, items)
     # key name_short name_full purpose url
     # wild indoor outdoor campus cyberspace parent
@@ -45,13 +47,13 @@ def s2_final_report():
     #    DIR_PUBLIC_CITATIONS + '/',
     #    "s3://megapixels/v1/citations/",
     #])
-    #subprocess.call([
-    #    "s3cmd", "put", "-P", "--recursive",
-    #    DIR_VERIFIED_CITATIONS + '/',
-    #    "s3://megapixels/v1/citations/verified/",
-    #])
+    subprocess.call([
+        "s3cmd", "put", "-P", "--recursive",
+        DIR_VERIFIED_CITATIONS + '/',
+        "s3://megapixels/v1/citations/verified/",
+    ])

-def process_paper(row, verified_lookup):
+def process_paper(row, verified_lookup, verified_totals):
     aggregate_citations = {}
     unknown_citations = {}
     address = None
@@ -64,11 +66,19 @@ def process_paper(row, verified_lookup):
         papers.append(res)
         if res['address']:
             address_list.append(res['address'])
-    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
-    process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
+
     if not len(papers):
         return
     paper = papers[0]
+    print('>> {} {}'.format(paper['paper_id'], row['key']))
+
+    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+
+    for paper_id in verified_lookup.keys():
+        if paper_id not in aggregate_citations:
+            print('S2 API missing verified citation: {}'.format(paper_id))
+
+    process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())

     # final citations - a report of all geocoded citations
     with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
@@ -113,6 +123,7 @@ def process_paper(row, verified_lookup):
                 'title': paper['title'],
                 'year': paper['year'],
                 'addresses': address_list,
+                'vetting': verified_totals,
             },
             'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
         }, f)
@@ -148,10 +159,10 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
     fn = file_path('papers', paper_id, 'paper.json')
     with open(fn, 'r') as f:
         data = json.load(f)
-    print('>> {} {}'.format(data['paperId'], row['key']))
+    # print('>> {} {}'.format(data['paperId'], row['key']))
     paper = load_paper(data['paperId'])
     if paper is None:
-        print("Paper missing! {}".format(data['paperId']))
+        print(">> Paper missing! {}".format(data['paperId']))
         return

     res['key'] = row['key']
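
Note on the s2-final-report.py changes: each work item is now a 3-tuple, so process_paper gains a matching third parameter. A minimal sketch of the assumed contract, where parallelize (a project helper whose definition is not part of this diff) is taken to unpack each tuple into positional arguments; the sequential loop below is only a hypothetical stand-in:

    def parallelize(fn, items):
        # hypothetical stand-in: the real helper presumably fans the jobs
        # out across workers; here each tuple is unpacked and run in order
        for args in items:
            fn(*args)

    # built in s2_final_report:  items.append((item, lookup, totals,))
    # consumed as:               process_paper(row, verified_lookup, verified_totals)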
diff --git a/scraper/util.py b/scraper/util.py
index c7e18b44..7febf86f 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -467,15 +467,21 @@ def fetch_verified_paper_lookup():
     """Fetch a lookup keyed by dataset, where each dataset points to a hash of valid or invalid papers..."""
     keys, rows = fetch_google_sheet('verifications')
     verified_lookup = {}
+    verified_totals = {}
     for row in rows:
         rec = {}
         for index, key in enumerate(keys):
             rec[key] = row[index]
         if rec['dataset'] not in verified_lookup:
             verified_lookup[rec['dataset']] = {}
+            verified_totals[rec['dataset']] = { 'yes': 0, 'no': 0, 'total': 0 }
         if str(rec['uses_dataset']) == '1':
             verified_lookup[rec['dataset']][rec['paper_id']] = rec
-    return verified_lookup
+            verified_totals[rec['dataset']]['yes'] += 1
+        else:
+            verified_totals[rec['dataset']]['no'] += 1
+        verified_totals[rec['dataset']]['total'] += 1
+    return verified_lookup, verified_totals

 def update_or_append_worksheet(name, form):
     worksheet = fetch_worksheet(name)
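
Note on the util.py change: fetch_verified_paper_lookup now returns a pair, so every caller must unpack two values. A minimal sketch of the new return shape, with made-up counts for illustration:

    verified_lookup, verified_totals = fetch_verified_paper_lookup()
    # verified_lookup: dataset -> {paper_id: row} for rows where uses_dataset == '1'
    # verified_totals: dataset -> {'yes': n, 'no': n, 'total': n}, e.g.
    #   {'brainwash': {'yes': 12, 'no': 3, 'total': 15}}  (illustrative numbers)
    for dataset, totals in verified_totals.items():
        print('{}: {} of {} verified rows use the dataset'.format(
            dataset, totals['yes'], totals['total']))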