From 45e0625bcbc2c7f041b8c5d177c5dcf487f07d26 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 14 Dec 2018 02:31:14 +0100
Subject: new reports

---
 scraper/s2-citation-report.py | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

(limited to 'scraper/s2-citation-report.py')

diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
index 526cf778..5c5fae9a 100644
--- a/scraper/s2-citation-report.py
+++ b/scraper/s2-citation-report.py
@@ -5,7 +5,7 @@ import simplejson as json
 import math
 import operator
 import click
-import builder
+#import builder
 from util import *
 
 @click.command()
@@ -14,9 +14,10 @@ def s2_citation_report():
     megapixels = load_megapixels_queries()
     successful_geocodes = {}
     papers = []
-    for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
-        paper_data = process_paper(fn, addresses, megapixels, successful_geocodes)
-        papers.append(paper_data)
+    for row in megapixels:
+        paper_data = process_paper(row, addresses, successful_geocodes)
+        if paper_data is not None:
+            papers.append(paper_data)
 
     write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
     write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)
@@ -35,6 +36,7 @@ def write_papers_report(fn, title, papers, key, reverse=False):
         sorted_papers.append([
             paper['paperId'],
             paper['key'],
+            paper['name'],
             LinkLine(paper['report_link'], paper['title']),
             LinkLine(paper['pdf_link'], '[pdf]'),
             paper['journal'],
@@ -53,6 +55,7 @@ def write_papers_report(fn, title, papers, key, reverse=False):
     sorted_paper_keys = [
         'Paper ID',
         'Megapixels Key',
+        'Megapixels Name',
         'Report Link',
         'PDF Link',
         'Journal',
@@ -70,7 +73,7 @@ def write_papers_report(fn, title, papers, key, reverse=False):
     ]
     write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)
 
-def process_paper(fn, addresses, megapixels, success):
+def process_paper(row, addresses, success):
     res = {
         'paperId': '',
         'key': '',
@@ -98,14 +101,18 @@ def process_paper(fn, addresses, megapixels, success):
     doi_count = 0
     address_count = 0
 
+    fn = file_path('papers', row['paper_id'], 'paper.json')
+
     with open(fn, 'r') as f:
         data = json.load(f)
         print('>> {}'.format(data['paperId']))
         paper = load_paper(data['paperId'])
-        if paper.data is None:
+        if paper is None:
            print("Paper missing! {}".format(data['paperId']))
            return
 
+        res['key'] = row['key']
+        res['name'] = row['name']
        res['paperId'] = paper.paper_id
        res['title'] = paper.title
        res['journal'] = paper.journal
@@ -114,9 +121,6 @@ def process_paper(fn, addresses, success):
        # res['authors'] = ', '.join(paper.authors)
        # res['citations'] = []
 
-       if res['title'] in megapixels:
-           res['key'] = megapixels[res['title']]['Database Name']
-
        paper_institutions = load_institutions(paper.paper_id)
        paper_address = None
        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
@@ -126,7 +130,7 @@ def process_paper(fn, addresses, success):
                paper_address = addresses.find(institution)
 
        if paper_address:
-           print(paper_address)
+           # print(paper_address)
            res['address'] = paper_address[0]
            res['lat'] = paper_address[3]
            res['lng'] = paper_address[4]
@@ -235,7 +239,10 @@ def process_paper(fn, addresses, success):
            f.write('<p>Address: {}</p>'.format(paper_address[2]))
            f.write('<p>Lat/Lng: {}, {}</p>'.format(paper_address[3], paper_address[4]))
            f.write('<p>Year: {}</p>'.format(paper.year))
-           f.write('<p>Coverage: {} / {} citations were located ({} %).</p>'.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100)))
+           if total_citations == 0:
+               f.write('<p>Coverage: No citations found!</p>')
+           else:
+               f.write('<p>Coverage: {} / {} citations were located ({} %).</p>'.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100)))
            f.write('')
            f.write('<h3>{}</h3>'.format('Geocoded Citations'))
            write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
@@ -258,14 +265,14 @@ def process_paper(fn, addresses, success):
     return res
 
 def load_megapixels_queries():
-    keys, rows = read_csv('datasets/citations-2018310.csv')
-    lookup = {}
+    keys, rows = read_csv('datasets/citation_lookup.csv')
+    recs = []
     for row in rows:
         rec = {}
         for index, key in enumerate(keys):
             rec[key] = row[index]
-        lookup[rec['Title'].strip()] = rec
-    return lookup
+        recs.append(rec)
+    return recs
 
 def load_institutions(paperId):
     if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
-- 
cgit v1.2.3-70-g09d2