author    Jules Laplace <julescarbon@gmail.com>  2018-12-14 02:31:14 +0100
committer Jules Laplace <julescarbon@gmail.com>  2018-12-14 02:31:14 +0100
commit    45e0625bcbc2c7f041b8c5d177c5dcf487f07d26 (patch)
tree      728353b00677a865679e72429dfe6b200dc57100  /scraper/s2-citation-report.py
parent    3ab28a3ff3d0e1b71f123e38ce3d0df42caddc7c (diff)
new reports
Diffstat (limited to 'scraper/s2-citation-report.py')
-rw-r--r--  scraper/s2-citation-report.py  37
1 file changed, 22 insertions(+), 15 deletions(-)
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
index 526cf778..5c5fae9a 100644
--- a/scraper/s2-citation-report.py
+++ b/scraper/s2-citation-report.py
@@ -5,7 +5,7 @@ import simplejson as json
import math
import operator
import click
-import builder
+#import builder
from util import *
@click.command()
@@ -14,9 +14,10 @@ def s2_citation_report():
megapixels = load_megapixels_queries()
successful_geocodes = {}
papers = []
- for fn in glob.iglob('datasets/s2/papers/**/*.json', recursive=True):
- paper_data = process_paper(fn, addresses, megapixels, successful_geocodes)
- papers.append(paper_data)
+ for row in megapixels:
+ paper_data = process_paper(row, addresses, successful_geocodes)
+ if paper_data is not None:
+ papers.append(paper_data)
write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)
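
Note on this hunk: the report loop is now driven by the rows of the megapixels CSV rather than by globbing every cached paper JSON under datasets/s2/papers/, and rows whose paper cannot be processed are skipped. A minimal, self-contained sketch of the pattern (process_paper here is a stand-in, not the repo's function, which also takes addresses and successful_geocodes):

import os
import json

def process_paper(row, papers_dir='datasets/s2/papers'):
    # Stand-in for the real process_paper(): return None when the cached
    # paper JSON is missing, mirroring the "Paper missing!" early return.
    fn = os.path.join(papers_dir, row['paper_id'], 'paper.json')
    if not os.path.exists(fn):
        return None
    with open(fn, 'r') as f:
        data = json.load(f)
    return {'paperId': data['paperId'], 'key': row['key'], 'name': row['name']}

def build_report(megapixels):
    papers = []
    for row in megapixels:
        paper_data = process_paper(row)
        if paper_data is not None:  # rows without a cached paper are skipped
            papers.append(paper_data)
    return papers
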
@@ -35,6 +36,7 @@ def write_papers_report(fn, title, papers, key, reverse=False):
sorted_papers.append([
paper['paperId'],
paper['key'],
+ paper['name'],
LinkLine(paper['report_link'], paper['title']),
LinkLine(paper['pdf_link'], '[pdf]'),
paper['journal'],
@@ -53,6 +55,7 @@ def write_papers_report(fn, title, papers, key, reverse=False):
sorted_paper_keys = [
'Paper ID',
'Megapixels Key',
+ 'Megapixels Name',
'Report Link',
'PDF Link',
'Journal',
@@ -70,7 +73,7 @@ def write_papers_report(fn, title, papers, key, reverse=False):
]
write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)
-def process_paper(fn, addresses, megapixels, success):
+def process_paper(row, addresses, success):
res = {
'paperId': '',
'key': '',
@@ -98,14 +101,18 @@ def process_paper(fn, addresses, megapixels, success):
doi_count = 0
address_count = 0
+ fn = file_path('papers', row['paper_id'], 'paper.json')
+
with open(fn, 'r') as f:
data = json.load(f)
print('>> {}'.format(data['paperId']))
paper = load_paper(data['paperId'])
- if paper.data is None:
+ if paper is None:
print("Paper missing! {}".format(data['paperId']))
return
+ res['key'] = row['key']
+ res['name'] = row['name']
res['paperId'] = paper.paper_id
res['title'] = paper.title
res['journal'] = paper.journal
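
The paper JSON path is now derived from the row's paper_id via file_path, a util helper whose definition is not shown in this diff. A plausible sketch of its shape, purely an assumption inferred from the calls file_path('papers', ..., 'paper.json') above and file_path('pdf', ..., 'institutions.json') below:

import os

DATASETS_ROOT = 'datasets/s2'  # assumed layout; not shown in this diff

def file_path(kind, paper_id, name):
    # Hypothetical reconstruction of the util helper:
    # file_path('papers', '<id>', 'paper.json')
    #   -> 'datasets/s2/papers/<id>/paper.json'
    return os.path.join(DATASETS_ROOT, kind, paper_id, name)
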
@@ -114,9 +121,6 @@ def process_paper(fn, addresses, megapixels, success):
# res['authors'] = ', '.join(paper.authors)
# res['citations'] = []
- if res['title'] in megapixels:
- res['key'] = megapixels[res['title']]['Database Name']
-
paper_institutions = load_institutions(paper.paper_id)
paper_address = None
for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
@@ -126,7 +130,7 @@ def process_paper(fn, addresses, megapixels, success):
paper_address = addresses.find(institution)
if paper_address:
- print(paper_address)
+ # print(paper_address)
res['address'] = paper_address[0]
res['lat'] = paper_address[3]
res['lng'] = paper_address[4]
@@ -235,7 +239,10 @@ def process_paper(fn, addresses, megapixels, success):
f.write('<li>Address: {}</li>'.format(paper_address[2]))
f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4]))
f.write('<li>Year: {}</li>'.format(paper.year))
- f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100)))
+ if total_citations == 0:
+ f.write('<li>Coverage: No citations found!</li>')
+ else:
+ f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100)))
f.write('</ul>')
f.write('<h3>{}</h3>'.format('Geocoded Citations'))
write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
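
The behavioral fix in this hunk is the division-by-zero guard for papers with no citations. Factored out as a standalone helper (a sketch for illustration, not code from the repo), the logic is:

import math

def coverage_line(geocoded_count, total_citations):
    # Guard the percentage computation against ZeroDivisionError when a
    # paper has no citations at all, as the patched writer now does.
    if total_citations == 0:
        return '<li>Coverage: No citations found!</li>'
    pct = math.floor(geocoded_count / total_citations * 100)
    return '<li>Coverage: {} / {} citations were located ({} %).</li>'.format(
        geocoded_count, total_citations, pct)
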
@@ -258,14 +265,14 @@ def process_paper(fn, addresses, megapixels, success):
return res
def load_megapixels_queries():
- keys, rows = read_csv('datasets/citations-2018310.csv')
- lookup = {}
+ keys, rows = read_csv('datasets/citation_lookup.csv')
+ recs = []
for row in rows:
rec = {}
for index, key in enumerate(keys):
rec[key] = row[index]
- lookup[rec['Title'].strip()] = rec
- return lookup
+ recs.append(rec)
+ return recs
def load_institutions(paperId):
if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
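
For reference, the rewritten load_megapixels_queries() zips each CSV row against the header to build one dict per row and returns them in file order, instead of a title-keyed lookup. With the standard library, the same pattern is csv.DictReader (a sketch; the repo uses its own read_csv helper from util):

import csv

def load_megapixels_queries(path='datasets/citation_lookup.csv'):
    # One dict per CSV row, keyed by the header names, in file order;
    # this matches the result of the patched zip-by-header loop.
    with open(path, newline='') as f:
        return list(csv.DictReader(f))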