summary | refs | log | tree | commit | diff
path: root/s2-citation-report.py
diff options
context:
space:
mode:
Diffstat (limited to 's2-citation-report.py')
-rw-r--r-- s2-citation-report.py 136
1 files changed, 136 insertions, 0 deletions
diff --git a/s2-citation-report.py b/s2-citation-report.py
new file mode 100644
index 00000000..54d267eb
--- /dev/null
+++ b/s2-citation-report.py
@@ -0,0 +1,136 @@
+import os
+import gzip
+import glob
+import json
+import math
+import operator
+import click
+from util import *
+
@click.command()
def s2_citation_report():
    """Write a citation report for every paper JSON under datasets/s2/papers."""
    # The address lookup is loaded once and shared by every paper.
    address_lookup = load_addresses()
    paper_files = glob.iglob('datasets/s2/papers/**/*.json', recursive=True)
    for paper_file in paper_files:
        process_paper(paper_file, address_lookup)
+
def process_paper(fn, addresses):
    """Build the HTML citation report for a single paper.

    Reads the paper JSON at ``fn``, looks up the institutions of every citing
    paper in ``addresses`` and writes ``reports/papers/<paper_id>.html``.

    Args:
        fn: Path to a JSON file providing ``paperId`` and ``citations`` keys.
        addresses: Mapping of institution name to its address row (the
            structure built by ``load_addresses``).

    Returns:
        A dict with the paper's id, title, journal and authors plus one entry
        per citation that could be loaded, or ``None`` when ``load_paper``
        has no data for the paper itself.
    """
    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {}'.format(data['paperId']))
    paper = load_paper(data['paperId'])
    if paper.data is None:
        print("Paper missing! {}".format(data['paperId']))
        return None

    res = {
        'paperId': paper.paper_id,
        'title': paper.title,
        'journal': paper.journal,
        'authors': paper.authors,
        'citations': [],
    }
    geocoded_citations = []
    unknown_citations = []
    display_geocoded_citations = []

    for cite in data['citations']:
        citation_id = cite['paperId']
        citation = load_paper(citation_id)
        if citation.data is None:
            print("Citation missing! {}".format(citation_id))
            continue
        institutions = load_institutions(citation_id)
        geocoded_institutions, institution, address = _geocode_institutions(
            institutions, addresses)
        res['citations'].append({
            'title': citation.title,
            'journal': citation.journal,
            'authors': citation.authors,
            'institutions': [inst[1] for inst in institutions],
            'geocoded': geocoded_institutions,
        })
        if geocoded_institutions:
            # `address` is kept as the raw row for the JSON payload and
            # flattened to a string for the human-readable table.
            geocoded_citations.append([citation.title, institution, address])
            display_geocoded_citations.append(
                [citation.title, institution, ', '.join(address)])
        else:
            unknown_citations.append([citation.title, institution])

    _write_report(paper, geocoded_citations, display_geocoded_citations,
                  unknown_citations)
    return res


def _geocode_institutions(institutions, addresses):
    """Match a citation's institutions against the address lookup.

    Institutions are visited in order of their name (index 1 of each entry).
    A name is matched either exactly or, failing that, through any of its
    ``', '``-separated parts; when several parts match, the last one wins.

    Returns:
        A ``(geocoded_names, name, address)`` tuple. ``geocoded_names`` lists
        each matched institution name once. ``name`` and ``address`` describe
        the last institution that matched, so they always belong together;
        when nothing matched, ``name`` is the last institution visited (``''``
        for an empty list) and ``address`` is ``None``.
    """
    geocoded_names = []
    matched_name = None
    matched_address = None
    last_name = ''
    for inst in sorted(institutions, key=operator.itemgetter(1)):
        name = inst[1]
        last_name = name
        found = None
        if name in addresses:
            found = addresses[name]
        else:
            for part in name.split(', '):
                if part in addresses:
                    found = addresses[part]
        if found is not None:
            # Record the institution once, even if several parts matched.
            geocoded_names.append(name)
            matched_name = name
            matched_address = found
    if matched_name is None:
        return geocoded_names, last_name, None
    return geocoded_names, matched_name, matched_address


def _write_report(paper, geocoded_citations, display_geocoded_citations,
                  unknown_citations):
    """Write the HTML report page for ``paper`` under ``reports/papers/``."""
    # Imported locally so this block does not depend on the file-level imports.
    import html

    def esc(value):
        # Format first so non-string values (e.g. None, ints) render exactly
        # as str.format would, then escape for safe inclusion in HTML.
        return html.escape('{}'.format(value))

    geocoded_total = len(geocoded_citations)
    total_citations = geocoded_total + len(unknown_citations)
    # A paper without any resolvable citation would otherwise divide by zero.
    if total_citations:
        percent = math.floor(geocoded_total / total_citations * 100)
    else:
        percent = 0

    os.makedirs('reports/papers/', exist_ok=True)
    report_path = 'reports/papers/{}.html'.format(paper.paper_id)
    # The page declares charset=utf-8, so the file is written as UTF-8
    # regardless of the platform's default encoding.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("<!doctype html>")
        f.write("<html>")
        f.write("<head>")
        f.write('<meta charset="utf-8">')
        f.write("<title>{}</title>".format(esc(paper.title)))
        f.write("<link rel='stylesheet' href='../reports.css'>")
        f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>')
        f.write("</head>")
        f.write("<body>")
        f.write("<div id='mapid'></div>")
        f.write("<h2>{}</h2>".format(esc(paper.title)))
        f.write('<ul>')
        if paper.journal:
            f.write('<li>{}</li>'.format(esc(paper.journal)))
        f.write('<li>{}</li>'.format(esc(paper.year)))
        f.write('<li>{} / {} citations ({} %)</li>'.format(
            geocoded_total, total_citations, percent))
        f.write('</ul>')
        f.write('<h3>{}</h3>'.format('Geocoded Citations'))
        write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
        f.write('<h3>{}</h3>'.format('Other Citations'))
        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
        f.write("</body>")
        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
        f.write('<script type="text/json">')
        # "<\/" is a valid JSON escape for "</"; it keeps a title containing
        # "</script>" from terminating the embedded data block early.
        f.write(json.dumps(geocoded_citations).replace('</', '<\\/'))
        f.write('</script>')
        f.write('<script src="../map.js"></script>')
        f.write("</html>")
+
def load_addresses():
    """Index the rows of reports/all_institutions.csv by their first column.

    Returns:
        A dict mapping ``row[0]`` to the full row. When the same first-column
        value appears more than once, the later row replaces the earlier one.
    """
    rows = read_csv('reports/all_institutions.csv', keys=None)
    return {row[0]: row for row in rows}
+
def load_institutions(paperId):
    """Return the ``institutions`` entry stored for a paper.

    Looks for ``institutions.json`` in the paper's ``pdf`` data directory
    first and in its ``doi`` data directory second; the first file found is
    used. Returns an empty list when neither file exists.
    """
    for source in ('pdf', 'doi'):
        candidate = os.path.join(data_path(source, paperId), 'institutions.json')
        if os.path.exists(candidate):
            return read_json(candidate)['institutions']
    return []
+
def data_path(key, paper_id):
    """Return the dataset directory for ``paper_id`` under the ``key`` subtree.

    Directories are sharded by the first two characters of the paper id.
    """
    shard = paper_id[:2]
    layout = 'datasets/s2/{}/{}/{}'
    return layout.format(key, shard, paper_id)
+
# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
    s2_citation_report()