diff options
Diffstat (limited to 'scraper/s2-final-report.py')
| -rw-r--r-- | scraper/s2-final-report.py | 205 |
1 files changed, 205 insertions, 0 deletions
import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess
from util import *

# Output directory for the per-dataset citation JSON consumed by the site.
DIR_PUBLIC_CITATIONS = "../site/datasets/final"


@click.command()
def s2_final_report():
    """Build the final per-dataset citation reports.

    For every dataset in the citation lookup CSV that is flagged for
    full-text sharing in the 'datasets' Google sheet, aggregate the
    geocoded citations of all its papers and write one JSON file per
    dataset into DIR_PUBLIC_CITATIONS.
    """
    addresses = AddressBook()
    megapixels = load_megapixels_lookup()
    ft_lookup = load_ft_lookup()
    for key, row in megapixels.items():
        print(key)
        # .get() instead of []: datasets present in the CSV but missing
        # from the sheet are skipped instead of raising KeyError.
        if ft_lookup.get(key):
            process_paper(row, addresses)


def process_paper(row, addresses):
    """Process every paper id attached to one dataset row and write the
    aggregated citation JSON for that dataset.

    row       -- dataset record from load_megapixels_lookup() (has
                 'key' and 'paper_ids').
    addresses -- AddressBook used to geocode institution names.

    Returns None; writes '<DIR_PUBLIC_CITATIONS>/<key>.json' when at
    least one paper could be processed.
    """
    aggregate_citations = {}
    papers = []
    for paper_id in row['paper_ids']:
        # BUGFIX: pass the loop's paper_id through. Previously every
        # iteration re-processed row['paper_id'], so all entries in
        # 'additional_papers' were duplicates of the primary paper.
        res = process_single_paper(row, addresses, aggregate_citations, paper_id)
        if res:
            papers.append(res)
    if not papers:
        return
    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': papers[0]['paper_id'],
            'paper': papers[0],
            'address': papers[0]['address'],
            'additional_papers': papers[1:],
            # dict preserves insertion order, so this matches the
            # original key-iteration comprehension.
            'citations': list(aggregate_citations.values()),
        }, f)


def process_single_paper(row, addresses, aggregate_citations, paper_id=None):
    """Build the report record for one paper and fold its citations into
    aggregate_citations (keyed by citation paper id).

    paper_id defaults to row['paper_id'] for backward compatibility
    with callers that pass only three arguments.

    Returns the record dict, or None when the paper data is missing.
    """
    res = {
        'paper_id': '',
        'key': '',
        'title': '',
        'journal': '',
        'year': '',  # TODO(review): never populated — confirm paper objects carry a year
        'address': '',
        'pdf_link': '',
        'citation_count': 0,
        'citations_geocoded': 0,
        'citations_unknown': 0,
        'citations_empty': 0,
        'citations_pdf': 0,
        'citations_doi': 0,
    }

    pdf_count = 0
    doi_count = 0
    address_count = 0
    if paper_id is None:
        paper_id = row['paper_id']

    fn = file_path('papers', paper_id, 'paper.json')
    with open(fn, 'r') as f:
        data = json.load(f)

    print('>> {} {}'.format(data['paperId'], row['key']))
    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res['key'] = row['key']
    res['name'] = row['name']
    res['paper_id'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
    res['pdf_link'] = paper.pdf_link

    # Use the first institution (sorted by name) that geocodes as the
    # paper's own address.
    paper_address = None
    for inst in sorted(load_institutions(paper.paper_id), key=operator.itemgetter(1)):
        if paper_address is None:
            paper_address = addresses.findObject(inst[1])
    if paper_address:
        res['address'] = paper_address

    for cite in data['citations']:
        citationId = cite['paperId']
        if citationId in aggregate_citations:
            # Already aggregated via another paper of this dataset.
            continue
        seen_here = {}  # addresses already attached to this citation
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation is None or citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        geocoded_addresses = []
        address = None
        for inst in sorted(load_institutions(citationId), key=operator.itemgetter(1)):
            address_count += 1
            next_address = addresses.findObject(inst[1])
            if next_address and next_address['address'] not in seen_here:
                seen_here[next_address['address']] = True
                address = next_address
                geocoded_addresses.append(next_address)

        if not address and has_pdf:
            # Fallback: scan the extracted PDF text headings for
            # anything the address book recognizes.
            headings, found_abstract = read_headings(
                file_path('pdf', citationId, 'paper.txt'), citation)
            for heading in headings:
                h = heading.lower().strip()
                if not h:
                    continue
                next_address = addresses.findObject(h)
                if next_address and next_address['address'] not in seen_here:
                    seen_here[next_address['address']] = True
                    address = next_address
                    geocoded_addresses.append(next_address)

        if address:
            pdf_link = citation.pdf_link
            # pdf_link is sometimes a dict with a 'url' key.
            if isinstance(pdf_link, dict) and 'url' in pdf_link:
                pdf_link = pdf_link['url']
            aggregate_citations[citationId] = {
                'id': citationId,
                'title': citation.title,
                'addresses': geocoded_addresses,
                'year': citation.year,
                'pdf': pdf_link,
            }

    # BUGFIX: these fields were declared above but the assignments were
    # commented out, so every report showed zeros. Fill in the stats
    # this function actually tracks; the geocoded/unknown/empty lists
    # were never populated in the original and have been removed.
    res['citation_count'] = len(data['citations'])
    res['citations_pdf'] = pdf_count
    res['citations_doi'] = doi_count

    return res


def load_ft_lookup():
    """Map dataset key -> bool from the 'datasets' Google sheet.

    True when the row's 'ft_share' column is '1' (or the number 1).
    """
    keys, rows = fetch_google_sheet('datasets')
    lookup = {}
    for row in rows:
        rec = dict(zip(keys, row))
        lookup[rec['key']] = rec['ft_share'] in ('1', 1)
    return lookup


def load_megapixels_lookup():
    """Map dataset key -> row dict from datasets/citation_lookup.csv.

    Rows sharing a key are merged: the first row wins for the scalar
    columns and every row's paper_id is collected into 'paper_ids'.
    """
    keys, rows = read_csv('datasets/citation_lookup.csv')
    lookup = {}
    for row in rows:
        rec = dict(zip(keys, row))
        paper_key = rec['key']
        if paper_key not in lookup:
            rec['paper_ids'] = []
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
    return lookup


def load_institutions(paperId):
    """Return the institutions list extracted from the PDF pipeline,
    falling back to the DOI pipeline, or [] when neither file exists."""
    for source in ('pdf', 'doi'):
        fn = file_path(source, paperId, 'institutions.json')
        if os.path.exists(fn):
            return read_json(fn)['institutions']
    return []


def data_path(key, paper_id):
    """Directory for one paper's files, sharded by the id's first two chars."""
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    """Path to a named file inside a paper's data directory."""
    return os.path.join(data_path(key, paper_id), fn)


if __name__ == '__main__':
    s2_final_report()
