Diffstat (limited to 'scraper/s2-final-report.py')
-rw-r--r--  scraper/s2-final-report.py | 76
1 file changed, 46 insertions, 30 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index c369fa6f..854aa940 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -62,9 +62,12 @@ def process_paper(row, verified_lookup):
         papers.append(res)
         if res['address']:
             address_list.append(res['address'])
+    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
     if not len(papers):
         return
     paper = papers[0]
+
+    # final citations - a report of all geocoded citations
     with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -74,11 +77,16 @@ def process_paper(row, verified_lookup):
             'additional_papers': papers[1:],
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+
+    # unknown citations - a report of all non-geocoded citations
     with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': papers[0]['paper_id'],
             'citations': [unknown_citations[key] for key in unknown_citations.keys()],
         }, f)
+
+    # "public" citations - initial citation reports digested by the geocoding frontend -bad name i know
+    # this might not need to get built...
     with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -91,6 +99,8 @@ def process_paper(row, verified_lookup):
             },
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+
+    # verified citations - the final public reports
     with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -121,38 +131,44 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # 'citations_doi': 0,
     }
 
-    fn = file_path('papers', paper_id, 'paper.json')
-
-    with open(fn, 'r') as f:
-        data = json.load(f)
-        print('>> {} {}'.format(data['paperId'], row['key']))
-        paper = load_paper(data['paperId'])
-        if paper is None:
-            print("Paper missing! {}".format(data['paperId']))
-            return
+    if paper_id == 'search':
+        dataset = row['key']
+        fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
+        with open(fn, 'r') as f:
+            citations = json.load(f)
+        data = { 'citations': [ { 'paperId': paperId } for paperId in citations ] }
+    else:
+        fn = file_path('papers', paper_id, 'paper.json')
+        with open(fn, 'r') as f:
+            data = json.load(f)
+            print('>> {} {}'.format(data['paperId'], row['key']))
+            paper = load_paper(data['paperId'])
+            if paper is None:
+                print("Paper missing! {}".format(data['paperId']))
+                return
 
-        res['key'] = row['key']
-        res['name'] = row['name']
-        res['paper_id'] = paper.paper_id
-        res['title'] = paper.title
-        # res['journal'] = paper.journal
-        res['year'] = paper.year
-        res['pdf'] = paper.pdf_links()
-        res['doi'] = paper.doi_links()
-        # res['authors'] = ', '.join(paper.authors)
-        # res['citations'] = []
+    res['key'] = row['key']
+    res['name'] = row['name']
+    res['paper_id'] = paper.paper_id
+    res['title'] = paper.title
+    # res['journal'] = paper.journal
+    res['year'] = paper.year
+    res['pdf'] = paper.pdf_links()
+    res['doi'] = paper.doi_links()
+    # res['authors'] = ', '.join(paper.authors)
+    # res['citations'] = []
 
-        paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
-        paper_address = None
-        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
-            #print(inst[1])
-            institution = inst[1]
-            if paper_address is None:
-                paper_address = addresses.findObject(institution)
+    paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
+    paper_address = None
+    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+        #print(inst[1])
+        institution = inst[1]
+        if paper_address is None:
+            paper_address = addresses.findObject(institution)
 
-        if paper_address:
-            # print(paper_address)
-            res['address'] = paper_address
+    if paper_address:
+        # print(paper_address)
+        res['address'] = paper_address
 
     for cite in data['citations']:
         citationId = cite['paperId']
@@ -169,7 +185,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # if has_doi:
         #     doi_count += 1
         if citation is None or citation.data is None:
-            print("Citation missing! {}".format(cite['paperId']))
+            print("Citation missing! {}".format(citationId))
             continue
         institutions = load_institutions(citationId, paper_location_lookup)
         geocoded_addresses = []
