import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess

from util import *

DIR_PUBLIC_CITATIONS = "../site/datasets/final"


@click.command()
def s2_final_report():
    # Build one citations JSON file per dataset for the public site
    addresses = AddressBook()
    megapixels = load_megapixels_lookup()
    ft_lookup = load_ft_lookup()
    for key, row in megapixels.items():
        print(key)
        # Only export datasets flagged for sharing in the spreadsheet
        if ft_lookup[key]:
            process_paper(row, addresses)


def process_paper(row, addresses):
    # Process every paper linked to this dataset, aggregating citations across them
    aggregate_citations = {}
    papers = []
    for paper_id in row['paper_ids']:
        res = process_single_paper(row, paper_id, addresses, aggregate_citations)
        if res:
            papers.append(res)
    if not papers:
        return
    # The first paper is the primary record; any others are listed separately
    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': papers[0]['paper_id'],
            'paper': papers[0],
            'address': papers[0]['address'],
            'additional_papers': papers[1:],
            'citations': list(aggregate_citations.values()),
        }, f)


def process_single_paper(row, paper_id, addresses, aggregate_citations):
    res = {
        'paper_id': '',
        'key': '',
        'title': '',
        # 'journal': '',
        'year': '',
        'pdf': '',
        'address': '',
        # 'citation_count': 0,
        # 'citations_geocoded': 0,
        # 'citations_unknown': 0,
        # 'citations_empty': 0,
        # 'citations_pdf': 0,
        # 'citations_doi': 0,
    }
    geocoded_citations = []
    unknown_citations = []
    empty_citations = []
    pdf_count = 0
    doi_count = 0
    address_count = 0

    fn = file_path('papers', paper_id, 'paper.json')
    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {} {}'.format(data['paperId'], row['key']))

    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res['key'] = row['key']
    res['name'] = row['name']
    res['paper_id'] = paper.paper_id
    res['title'] = paper.title
    # res['journal'] = paper.journal
    res['year'] = paper.year
    res['pdf'] = paper.pdf_link
    # res['authors'] = ', '.join(paper.authors)
    # res['citations'] = []

    # Geocode the paper's own institutions; keep the first address that resolves
    paper_institutions = load_institutions(paper.paper_id)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        # print(inst[1])
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.findObject(institution)
    if paper_address:
        # print(paper_address)
        res['address'] = paper_address

    # Geocode each citing paper, deduplicating across papers via aggregate_citations
    for cite in data['citations']:
        citationId = cite['paperId']
        if citationId in aggregate_citations:
            continue
        seen_here = {}
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation is None or citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        # First try the institutions extracted from the PDF or DOI metadata
        institutions = load_institutions(citationId)
        geocoded_addresses = []
        geocoded_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            address_count += 1
            institution = inst[1]
            next_address = addresses.findObject(institution)
            if next_address and next_address['address'] not in seen_here:
                seen_here[next_address['address']] = True
                address = next_address
                geocoded_addresses.append(next_address)

        # Fall back to scanning the PDF's headings for an institution match
        if not address and has_pdf:
            headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
            heading_string = '\n'.join(headings[0:20])
            found_addresses = []
            for heading in headings:
                text = heading.lower().strip()
                if text:
                    next_address = addresses.findObject(text)
                    if next_address and next_address['address'] not in seen_here:
                        seen_here[next_address['address']] = True
                        address = next_address
                        geocoded_addresses.append(next_address)

        if address and citationId not in aggregate_citations:
            aggregate_citations[citationId] = {
                'id': citationId,
                'title': citation.title,
                'addresses': geocoded_addresses,
                'year': citation.year,
                'pdf': citation.pdf_link,
            }

    # res['citation_count'] = len(data['citations'])
    # res['citations_geocoded'] = len(geocoded_citations)
    # res['citations_unknown'] = len(unknown_citations)
    # res['citations_empty'] = len(empty_citations)
    # res['citations_pdf'] = pdf_count
    # res['citations_doi'] = doi_count
    return res


def load_ft_lookup():
    # Map dataset key -> whether the 'ft_share' column is set in the Google Sheet
    keys, rows = fetch_google_sheet('datasets')
    lookup = {}
    for row in rows:
        rec = {}
        for index, key in enumerate(keys):
            rec[key] = row[index]
        lookup[rec['key']] = rec['ft_share'] in ('1', 1)
    return lookup


def load_megapixels_lookup():
    # Map dataset key -> record carrying the list of associated paper ids
    keys, rows = read_csv('datasets/citation_lookup.csv')
    lookup = {}
    for row in rows:
        rec = {}
        for index, key in enumerate(keys):
            rec[key] = row[index]
        paper_key = rec['key']
        if paper_key not in lookup:
            rec['paper_ids'] = []
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
        # recs.append(rec)
    return lookup


def load_institutions(paperId):
    # Prefer institutions extracted from the PDF, then fall back to DOI metadata
    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
    else:
        return []


def data_path(key, paper_id):
    # Papers are sharded on disk by the first two characters of their id
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    return os.path.join(data_path(key, paper_id), fn)


if __name__ == '__main__':
    s2_final_report()
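
# Usage sketch: run this module directly to regenerate the public citation files,
# e.g. `python s2_final_report.py` (the script's filename is an assumption here).
# One <dataset key>.json file is written per shared dataset into
# DIR_PUBLIC_CITATIONS ("../site/datasets/final"); that directory must already
# exist, since the output file is opened with a plain 'w'.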