| author | adamhrv <adam@ahprojects.com> | 2019-05-29 15:25:02 +0200 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2019-05-29 15:25:02 +0200 |
| commit | ca0d3ed1a451ce65960ff2e0f44fd5a9008eeaf4 | |
| tree | 345f0ee4a2fd7a917d7d604290fa2bda51225c68 /scraper/s2-final-report.py | |
| parent | 5c21bdb664649c62ebbed29448a7c653ab32ddb0 | |
| parent | 2963cd2ec73860e3bf3a5e4d469b4e573ce4817c | |
Merge branch 'master' of github.com:adamhrv/megapixels_dev
Diffstat (limited to 'scraper/s2-final-report.py')
| -rw-r--r-- | scraper/s2-final-report.py | 32 |
1 file changed, 24 insertions, 8 deletions
```diff
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index dc65a3a3..16d70f12 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -19,17 +19,21 @@ paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_i
 @click.command()
 def s2_final_report():
     megapixels = load_megapixels_lookup()
-    verified_lookup = fetch_verified_paper_lookup()
+    verified_lookup, verified_totals = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
+        #if key != 'brainwash':
+        #    continue
         ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
         nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
         if ft_share or nyt_share:
             if key in verified_lookup:
                 lookup = verified_lookup[key]
+                totals = verified_totals[key]
             else:
                 lookup = {}
-            items.append((item, lookup,))
+                totals = {}
+            items.append((item, lookup, totals,))
     parallelize(process_paper, items)
     # key name_short name_full purpose url
     # wild indoor outdoor campus cyberspace parent
@@ -49,7 +53,7 @@ def s2_final_report():
         "s3://megapixels/v1/citations/verified/",
     ])
 
-def process_paper(row, verified_lookup):
+def process_paper(row, verified_lookup, verified_totals):
     aggregate_citations = {}
     unknown_citations = {}
     address = None
@@ -62,10 +66,19 @@ def process_paper(row, verified_lookup):
         papers.append(res)
         if res['address']:
             address_list.append(res['address'])
-    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+
     if not len(papers):
         return
     paper = papers[0]
+    print('>> {} {}'.format(paper['paper_id'], row['key']))
+
+    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
+
+    for paper_id in verified_lookup.keys():
+        if paper_id not in aggregate_citations:
+            print('S2 API missing verified citation: {}'.format(paper_id))
+
+    process_single_paper(row, 'verified', addresses, aggregate_citations, unknown_citations, verified_lookup.keys())
 
     # final citations - a report of all geocoded citations
     with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
@@ -110,11 +123,12 @@ def process_paper(row, verified_lookup):
             'title': paper['title'],
             'year': paper['year'],
             'addresses': address_list,
+            'vetting': verified_totals,
         },
         'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
     }, f)
 
-def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
+def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations, verified_citations=[]):
     res = {
         'paper_id': '',
         'key': '',
@@ -131,7 +145,9 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # 'citations_doi': 0,
     }
 
-    if paper_id == 'search':
+    if paper_id == 'verified':
+        data = { 'citations': [ { 'paperId': paperId } for paperId in verified_citations ] }
+    elif paper_id == 'search':
         dataset = row['key']
         fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
         if not os.path.exists(fn):
@@ -143,10 +159,10 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         fn = file_path('papers', paper_id, 'paper.json')
         with open(fn, 'r') as f:
             data = json.load(f)
-    print('>> {} {}'.format(data['paperId'], row['key']))
+    # print('>> {} {}'.format(data['paperId'], row['key']))
     paper = load_paper(data['paperId'])
     if paper is None:
-        print("Paper missing! {}".format(data['paperId']))
+        print(">> Paper missing! {}".format(data['paperId']))
         return
 
     res['key'] = row['key']
```
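The core mechanism in this merge is the new 'verified' pass: `process_single_paper()` now accepts a list of manually verified citation IDs and, when called with the pseudo paper_id `'verified'`, fabricates a payload shaped like a Semantic Scholar API response so those citations flow through the same aggregation path as the API search results. A minimal, self-contained sketch of that shape; the helper name and sample IDs below are hypothetical, not from the repo:

```python
# Hedged sketch of the new 'verified' branch: build an S2-style payload from
# verified citation IDs. build_verified_payload and the sample IDs are made up.
def build_verified_payload(verified_citations):
    # Mirrors the diff's: data = { 'citations': [ { 'paperId': paperId } ... ] }
    return {'citations': [{'paperId': paper_id} for paper_id in verified_citations]}

print(build_verified_payload(['0a1b2c3d4e', '5f6a7b8c9d']))
# {'citations': [{'paperId': '0a1b2c3d4e'}, {'paperId': '5f6a7b8c9d'}]}
```

One side note on the new signature: `verified_citations=[]` is a mutable default argument. It is harmless here because the list is only iterated, never mutated, but `verified_citations=None` with a fallback inside the function is the safer Python idiom.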
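The change also widens the work items from `(item, lookup)` to `(item, lookup, totals)` to match `process_paper()`'s new three-argument signature, which implies `parallelize()` star-unpacks each tuple into the worker. The repo's actual `parallelize()` is not shown on this page, so the stand-in below is an assumption:

```python
# Hypothetical stand-in for the repo's parallelize(), assuming it unpacks each
# work-item tuple into the worker's positional arguments.
from multiprocessing import Pool

def process_paper(row, verified_lookup, verified_totals):
    print(row['key'], sorted(verified_lookup), verified_totals)

def parallelize(fn, items, workers=4):
    with Pool(workers) as pool:
        pool.starmap(fn, items)  # (item, lookup, totals) -> fn(item, lookup, totals)

if __name__ == '__main__':
    # Made-up data mirroring the shapes built in s2_final_report()
    items = [({'key': 'brainwash'}, {'0a1b2c': {}}, {'verified': 1})]
    parallelize(process_paper, items)
```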

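Finally, the per-dataset report JSON gains a `'vetting'` field carrying `verified_totals`. A toy write showing only the keys visible in the report hunk; the enclosing key name and the contents of `verified_totals` are not on this page, so the placeholders below are guesses:

```python
import json

# Toy report write mirroring the keys visible in the diff. All values are
# placeholders; 'paper' as the enclosing key is a guess (not shown in the hunk).
record = {
    'title': 'Example Paper Title',  # stands in for paper['title']
    'year': 2015,                    # stands in for paper['year']
    'addresses': [],                 # stands in for address_list
    'vetting': {},                   # stands in for verified_totals (new in this commit)
}

with open('example-report.json', 'w') as f:
    json.dump({'paper': record, 'citations': []}, f, indent=2)
```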