| author | jules@lens <julescarbon@gmail.com> | 2019-05-28 18:04:25 +0200 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2019-05-28 18:04:25 +0200 |
| commit | f9a1c5f6a631c24afee76f9449508622d57992fb | |
| tree | 8a65a15e057cde71a360f68d0204f522e2128c90 | |
| parent | ea5dd6e066b5a8faf7a6e5e766452001dad44514 | |
export totals and look for censorship
| -rw-r--r-- | scraper/s2-final-report.py | 39 |
| -rw-r--r-- | scraper/util.py | 8 |
2 files changed, 32 insertions, 15 deletions
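
In short: `fetch_verified_paper_lookup()` in `scraper/util.py` now returns a second value with per-dataset verification tallies, and `scraper/s2-final-report.py` exports those tallies as a `vetting` field in the final report while logging any hand-verified citation the Semantic Scholar API no longer returns. A minimal sketch of the new tally logic, using hypothetical rows in place of the real `verifications` Google Sheet:

```python
# Sketch of the per-dataset tally introduced in this commit; the rows here
# are hypothetical stand-ins for the 'verifications' Google Sheet.
rows = [
    {'dataset': 'brainwash', 'paper_id': 'p1', 'uses_dataset': '1'},
    {'dataset': 'brainwash', 'paper_id': 'p2', 'uses_dataset': '0'},
]

verified_lookup = {}
verified_totals = {}
for rec in rows:
    ds = rec['dataset']
    if ds not in verified_lookup:
        verified_lookup[ds] = {}
        verified_totals[ds] = {'yes': 0, 'no': 0, 'total': 0}
    if str(rec['uses_dataset']) == '1':
        verified_lookup[ds][rec['paper_id']] = rec  # paper confirmed to use the dataset
        verified_totals[ds]['yes'] += 1
    else:
        verified_totals[ds]['no'] += 1
    verified_totals[ds]['total'] += 1

print(verified_totals)  # {'brainwash': {'yes': 1, 'no': 1, 'total': 2}}
```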
```diff
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index ddee18c7..16d70f12 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -19,19 +19,21 @@ paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_i
 @click.command()
 def s2_final_report():
   megapixels = load_megapixels_lookup()
-  verified_lookup = fetch_verified_paper_lookup()
+  verified_lookup, verified_totals = fetch_verified_paper_lookup()
   items = []
   for key, item in megapixels.items():
-    if key != 'brainwash':
-      continue
+    #if key != 'brainwash':
+    #  continue
     ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
     nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
     if ft_share or nyt_share:
       if key in verified_lookup:
         lookup = verified_lookup[key]
+        totals = verified_totals[key]
       else:
         lookup = {}
-      items.append((item, lookup,))
+        totals = {}
+      items.append((item, lookup, totals,))
   parallelize(process_paper, items)
 # key name_short name_full purpose url
 # wild indoor outdoor campus cyberspace parent
@@ -45,13 +47,13 @@ def s2_final_report():
   # DIR_PUBLIC_CITATIONS + '/',
   # "s3://megapixels/v1/citations/",
   #])
-  #subprocess.call([
-  # "s3cmd", "put", "-P", "--recursive",
-  # DIR_VERIFIED_CITATIONS + '/',
-  # "s3://megapixels/v1/citations/verified/",
-  #])
+  subprocess.call([
+    "s3cmd", "put", "-P", "--recursive",
+    DIR_VERIFIED_CITATIONS + '/',
+    "s3://megapixels/v1/citations/verified/",
+  ])
 
-def process_paper(row, verified_lookup):
+def process_paper(row, verified_lookup, verified_totals):
   aggregate_citations = {}
   unknown_citations = {}
   address = None
@@ -64,11 +66,19 @@ def process_paper(row, verified_lookup):
     papers.append(res)
     if res['address']:
       address_list.append(res['address'])
-  process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
-  process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
+
   if not len(papers):
     return
   paper = papers[0]
+  print('>> {} {}'.format(paper['paper_id'], row['key']))
+
+  process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+
+  for paper_id in verified_lookup.keys():
+    if paper_id not in aggregate_citations:
+      print('S2 API missing verified citation: {}'.format(paper_id))
+
+  process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
 
   # final citations - a report of all geocoded citations
   with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
@@ -113,6 +123,7 @@ def process_paper(row, verified_lookup):
       'title': paper['title'],
       'year': paper['year'],
       'addresses': address_list,
+      'vetting': verified_totals,
     },
     'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
   }, f)
@@ -148,10 +159,10 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
   fn = file_path('papers', paper_id, 'paper.json')
   with open(fn, 'r') as f:
     data = json.load(f)
-  print('>> {} {}'.format(data['paperId'], row['key']))
+  # print('>> {} {}'.format(data['paperId'], row['key']))
   paper = load_paper(data['paperId'])
   if paper is None:
-    print("Paper missing! {}".format(data['paperId']))
+    print(">> Paper missing! {}".format(data['paperId']))
     return
 
   res['key'] = row['key']
diff --git a/scraper/util.py b/scraper/util.py
index c7e18b44..7febf86f 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -467,15 +467,21 @@ def fetch_verified_paper_lookup():
   """Fetch a lookup keyed by dataset, where each dataset points to a hash of valid or invalid papers..."""
   keys, rows = fetch_google_sheet('verifications')
   verified_lookup = {}
+  verified_totals = {}
   for row in rows:
     rec = {}
     for index, key in enumerate(keys):
       rec[key] = row[index]
     if rec['dataset'] not in verified_lookup:
       verified_lookup[rec['dataset']] = {}
+      verified_totals[rec['dataset']] = { 'yes': 0, 'no': 0, 'total': 0 }
     if str(rec['uses_dataset']) == '1':
       verified_lookup[rec['dataset']][rec['paper_id']] = rec
-  return verified_lookup
+      verified_totals[rec['dataset']]['yes'] += 1
+    else:
+      verified_totals[rec['dataset']]['no'] += 1
+    verified_totals[rec['dataset']]['total'] += 1
+  return verified_lookup, verified_totals
 
 def update_or_append_worksheet(name, form):
   worksheet = fetch_worksheet(name)
```
