import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess
from util import *

DIR_PUBLIC_CITATIONS = "../site/datasets/citations"

paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_id')


@click.command()
def s2_citation_report():
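    """Build HTML citation reports for every MegaPixels paper, write the
    master datasets CSV, and sync the public citation data to S3."""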
    addresses = AddressBook()
    megapixels = load_megapixels_queries()
    successful_geocodes = {}
    papers = []
    for row in megapixels:
        paper_data = process_paper(row, addresses, successful_geocodes)
        if paper_data is not None:
            papers.append(paper_data)
    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
    write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)
    # tally per-citation geocoding success across all papers
    paper_count = 0
    geocode_count = 0
    for key, value in successful_geocodes.items():
        if value:
            geocode_count += 1
        paper_count += 1
    print("citations: {}".format(paper_count))
    print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
    write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
    # publish the generated citation data to the public S3 bucket
    subprocess.call([
        "s3cmd", "put", "-P", "--recursive",
        DIR_PUBLIC_CITATIONS + '/',
        "s3://megapixels/v1/citations/",
    ])


def write_master_report(fn, papers):
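    """Write the public datasets.csv: one row per dataset key, combining
    the datasets Google Sheet with citation counts from the papers."""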
    # first make a lookup of the keys that have papers
    paper_key_lookup = {}
    for paper in papers:
        if paper['key'] not in paper_key_lookup:
            paper_key_lookup[paper['key']] = paper
    # then fetch the datasets csv which has things like "year"
    fields, rows = fetch_google_sheet('datasets')
    master_papers = []
    statistics = {}

    def clean(n):
        """Coerce a spreadsheet cell value to an int where possible."""
        if type(n) is int:
            return n
        if type(n) is str and n:
            s = str(n).replace(',', '').replace('.', '').replace('?', '').strip()
            try:
                return int(s)
            except ValueError:
                return s
        if n:
            return n
        return None
    for row in rows:
        key = row[0]
        if key not in paper_key_lookup:
            continue
        paper = paper_key_lookup[key]
        stats = {}
        for index, field in enumerate(fields):
            stats[field] = row[index]
        report_fn = '../site/content/datasets/{}/index.md'.format(key)
        has_report = os.path.exists(report_fn)
        statistics[key] = stats
        # search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
        image_count = stats['images']
        if type(image_count) is str:
            if len(image_count):
                image_count = clean(image_count)
            else:
                image_count = None
        master_papers.append([
            key,
            row[1],
            '/datasets/{}/'.format(key) if has_report else '',
            image_count,
            # clean(stats['faces_unique']) or None,
            # stats['year_published'],
            clean(paper['citation_count']) or 0,
            # clean(search_result['citationStats']['numKeyCitations']) or 0,
            # origin
        ])
    master_paper_keys = [
        'key',
        'title',
        'link',
        'images',
        # 'people',
        # 'year',
        'citations',
        # 'influenced',
        # 'origin'
    ]
    write_csv(fn, keys=master_paper_keys, rows=master_papers)


def write_papers_report(fn, title, papers, key, reverse=False):
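    """Write an HTML report of all papers, sorted by the given column key."""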
    sorted_papers = []
    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
        sorted_papers.append([
            paper['paperId'],
            paper['key'],
            paper['name'],
            LinkLine(paper['report_link'], paper['title']),
            LinkLine(paper['pdf_link'], '[pdf]'),
            paper['journal'],
            paper['address_type'],
            paper['address'],
            paper['country'],
            paper['lat'],
            paper['lng'],
            str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%',
            paper['citation_count'],
            paper['citations_geocoded'],
            paper['citations_unknown'],
            paper['citations_empty'],
            paper['citations_pdf'],
            paper['citations_doi'],
        ])
    sorted_paper_keys = [
        'Paper ID',
        'Megapixels Key',
        'Megapixels Name',
        'Report Link',
        'PDF Link',
        'Journal',
        'Type',
        'Address',
        'Country',
        'Lat',
        'Lng',
        'Coverage',
        'Total Citations',
        'Geocoded Citations',
        'Unknown Citations',
        'Empty Citations',
        'With PDF',
        'With DOI',
    ]
    write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)


def process_paper(row, addresses, success):
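    """Geocode the institutions behind one paper and each of its citing
    papers, write the per-paper HTML report, and return summary stats."""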
    res = {
        'paperId': '',
        'key': '',
        'title': '',
        'journal': '',
        'address': '',
        'country': '',
        'address_type': '',
        'lat': '',
        'lng': '',
        'pdf_link': '',
        'report_link': '',
        'citation_count': 0,
        'citations_geocoded': 0,
        'citations_unknown': 0,
        'citations_empty': 0,
        'citations_pdf': 0,
        'citations_doi': 0,
    }
    geocoded_citations = []
    unknown_citations = []
    display_geocoded_citations = []
    empty_citations = []
    pdf_count = 0
    doi_count = 0
    address_count = 0
    fn = file_path('papers', row['paper_id'], 'paper.json')
    if not os.path.exists(fn):
        print("not found: {}".format(fn))
        print(row)
        return
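    # load the cached Semantic Scholar (S2) record, which includes this
    # paper's citation list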
    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {} {}'.format(data['paperId'], row['key']))
    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
        return
    res['key'] = row['key']
    res['name'] = row['name']
    res['paperId'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
    res['pdf_link'] = paper.pdf_link
    # res['authors'] = ', '.join(paper.authors)
    # res['citations'] = []
    paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        # print(inst[1])
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.find(institution)
    if paper_address:
        # print(paper_address)
        res['address'] = paper_address[0]
        res['lat'] = paper_address[3]
        res['lng'] = paper_address[4]
        res['address_type'] = paper_address[5]
        res['country'] = paper_address[7]
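    # examine every citing paper: count PDFs/DOIs and try to geocode each
    # citation's known institutions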
    for cite in data['citations']:
        citationId = cite['paperId']
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation is None or citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue
        institutions = load_institutions(citationId, paper_location_lookup)
        geocoded_institutions = []
        unknown_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            # print(inst[1])
            address_count += 1
            institution = inst[1]
            next_address = addresses.find(institution)
            if next_address:
                address = next_address
                geocoded_institutions.append(institution)
            else:
                unknown_institutions.append(institution)
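        # fall back to scanning the citation's PDF headings for an address
        # when none of its known institutions geocoded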
        if not address:
            if has_pdf:
                headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
                heading_string = '\n'.join(headings[0:20])
                found_addresses = []
                if len(headings):
                    for heading in headings:
                        l = heading.lower().strip()
                        if l:
                            next_address = addresses.find(l)
                            if next_address:
                                address = next_address
                                geocoded_institutions.append(heading)
                            else:
                                unknown_institutions.append(heading)
                else:
                    empty_citations.append([
                        citationId,
                        citation.title,
                    ])
        # res['citations'].append({
        #     'title': citation.title,
        #     'journal': citation.journal,
        #     'authors': citation.authors,
        #     'institutions': [inst[1] for inst in institutions],
        #     'geocoded': geocoded_institutions,
        # })
        if address:
            success[citationId] = True
            geocoded_citations.append([
                citation.title,
                institution,
            ] + address + [
                citation.year,
            ])
            display_geocoded_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
            ] + address[0:5])
        else:
            success[citationId] = False
            unknown_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
                '\n'.join(unknown_institutions),
            ])
    res['citation_count'] = len(data['citations'])
    res['citations_geocoded'] = len(geocoded_citations)
    res['citations_unknown'] = len(unknown_citations)
    res['citations_empty'] = len(empty_citations)
    res['citations_pdf'] = pdf_count
    res['citations_doi'] = doi_count
    total_citations = len(geocoded_citations) + len(unknown_citations)
    os.makedirs('reports/papers/', exist_ok=True)
    with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
        f.write("")
        f.write("")
        f.write("