import os
import gzip
import glob
import json
import math
import operator

import click

from util import *


@click.command()
def s2_citation_report():
    """Build a per-paper HTML citation report for every S2 paper on disk."""
    addresses = load_addresses()
    for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
        process_paper(fn, addresses)


def process_paper(fn, addresses):
    res = {}
    address_count = 0
    geocode_count = 0
    geocoded_citations = []
    unknown_citations = []
    display_geocoded_citations = []

    with open(fn, 'r') as f:
        data = json.load(f)

    print('>> {}'.format(data['paperId']))

    paper = load_paper(data['paperId'])
    if paper.data is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res['paperId'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['authors'] = paper.authors
    res['citations'] = []

    for cite in data['citations']:
        citationId = cite['paperId']
        citation = load_paper(citationId)
        if citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        # Try to geocode each extracted institution: first match the full
        # institution string against the address lookup, then fall back to
        # matching its comma-separated parts.
        institutions = load_institutions(citationId)
        geocoded_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            address_count += 1
            institution = inst[1]
            if institution in addresses:
                address = addresses[institution]
                geocode_count += 1
                geocoded_institutions.append(institution)
            else:
                for part in institution.split(', '):
                    if part in addresses:
                        address = addresses[part]
                        geocode_count += 1
                        geocoded_institutions.append(institution)

        res['citations'].append({
            'title': citation.title,
            'journal': citation.journal,
            'authors': citation.authors,
            'institutions': [inst[1] for inst in institutions],
            'geocoded': geocoded_institutions,
        })

        if len(geocoded_institutions):
            geocoded_citations.append([
                citation.title,
                institution,
                address,
            ])
            display_geocoded_citations.append([
                citation.title,
                institution,
                ', '.join(address),
            ])
        else:
            unknown_citations.append([
                citation.title,
                institution,
            ])

    total_citations = len(geocoded_citations) + len(unknown_citations)

    # Write the per-paper HTML report: paper title, a table of geocoded
    # citations, and a table of citations that could not be geocoded.
    # The markup below is a minimal assumed scaffold; the original tag
    # literals were stripped from the source.
    os.makedirs('reports/papers/', exist_ok=True)
    with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
        f.write('<html>')
        f.write('<head>')
        f.write('<meta charset="utf-8">')
        f.write('<title>{}</title>'.format(paper.title))
        f.write('</head>')
        f.write('<body>')
        f.write('<h1>{}</h1>'.format(paper.title))
        f.write('<h2>{}</h2>'.format('Geocoded Citations'))
        write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
        f.write('<h2>{}</h2>'.format('Other Citations'))
        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
        f.write('</body>')
        f.write('</html>')

    return res


def load_addresses():
    # Map each institution name to its full CSV row (name plus geocoded fields).
    data = read_csv('reports/all_institutions.csv', keys=None)
    lookup = {}
    for row in data:
        name = row[0]
        lookup[name] = row
    return lookup


def load_institutions(paperId):
    # Institutions may have been extracted from either the PDF or the DOI record.
    if os.path.exists(os.path.join(data_path('pdf', paperId), 'institutions.json')):
        return read_json(os.path.join(data_path('pdf', paperId), 'institutions.json'))['institutions']
    elif os.path.exists(os.path.join(data_path('doi', paperId), 'institutions.json')):
        return read_json(os.path.join(data_path('doi', paperId), 'institutions.json'))['institutions']
    else:
        return []


def data_path(key, paper_id):
    # Papers are sharded by the first two characters of their id.
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


if __name__ == '__main__':
    s2_citation_report()