import os
import re
import glob
import json
import math
import operator
import click
from util import *
@click.command()
def s2_citation_report():
    # CLI entry point: run process_paper over every paper JSON file found
    # under datasets/s2/papers/.
    #
    # Documented with comments rather than a docstring on purpose: click
    # uses a command's docstring as its --help text, so adding one would
    # change what the command prints.

    # A single AddressBook instance is created up front and handed to every
    # process_paper call.
    book = AddressBook()

    # recursive=True lets "**" match nested sub-directories at any depth;
    # iglob yields the matches lazily instead of building the full list.
    pattern = 'datasets/s2/papers/**/*.json'
    paper_files = glob.iglob(pattern, recursive=True)

    for paper_fn in paper_files:
        process_paper(paper_fn, book)
def process_paper(fn, addresses):
res = {}
address_count = 0
geocode_count = 0
geocoded_citations = []
unknown_citations = []
display_geocoded_citations = []
with open(fn, 'r') as f:
data = json.load(f)
print('>> {}'.format(data['paperId']))
paper = load_paper(data['paperId'])
if paper.data is None:
print("Paper missing! {}".format(data['paperId']))
return
res['paperId'] = paper.paper_id
res['title'] = paper.title
res['journal'] = paper.journal
res['authors'] = paper.authors
res['citations'] = []
for cite in data['citations']:
citationId = cite['paperId']
citation = load_paper(citationId)
if citation.data is None:
print("Citation missing! {}".format(cite['paperId']))
continue
institutions = load_institutions(citationId)
geocoded_institutions = []
unknown_institutions = []
institution = ''
address = None
for inst in sorted(institutions, key=operator.itemgetter(1)):
# print(inst[1])
address_count += 1
institution = inst[1]
next_address = addresses.find(institution)
if next_address:
address = next_address
geocode_count += 1
geocoded_institutions.append(institution)
else:
unknown_institutions.append(institution)
if not address:
if os.path.exists(file_path('pdf', citationId, 'paper.txt')):
headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
heading_string = '\n'.join(headings[0:20])
found_addresses = []
if len(headings):
for heading in headings:
l = heading.lower().strip()
if l:
next_address = addresses.find(l)
if next_address:
address = next_address
geocode_count += 1
geocoded_institutions.append(heading)
else:
unknown_institutions.append(heading)
res['citations'].append({
'title': citation.title,
'journal': citation.journal,
'authors': citation.authors,
'institutions': [inst[1] for inst in institutions],
'geocoded': geocoded_institutions,
})
if address:
geocoded_citations.append([
citation.title,
institution,
] + address)
display_geocoded_citations.append([
citation.title,
] + address)
else:
unknown_citations.append([
citation.title,
'
'.join(unknown_institutions),
])
paper_institutions = load_institutions(paper.paper_id)
paper_address = None
for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
# print(inst[1])
address_count += 1
institution = inst[1]
paper_address = addresses.find(institution)
if paper_address:
print(paper_address)
total_citations = len(geocoded_citations) + len(unknown_citations)
os.makedirs('reports/papers/', exist_ok=True)
with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
f.write("")
f.write("")
f.write("