1 files changed, 136 insertions, 0 deletions
diff --git a/s2-citation-report.py b/s2-citation-report.py
new file mode 100644
index 00000000..54d267eb
--- /dev/null
+++ b/s2-citation-report.py
@@ -0,0 +1,136 @@
+import os
+import gzip
+import glob
+import json
+import math
+import operator
+import click
+from util import *
+
+@click.command()
+def s2_citation_report():
+  """Write a citation report for every paper JSON under datasets/s2/papers."""
+  address_lookup = load_addresses()
+  # iglob is lazy, so paper files are discovered as the loop advances.
+  paper_files = glob.iglob('datasets/s2/papers/**/*.json', recursive=True)
+  for paper_file in paper_files:
+    process_paper(paper_file, address_lookup)
+
+def process_paper(fn, addresses):
+  """Build the citation report for one paper and write it as an HTML page.
+
+  Reads the JSON file `fn` (keys used: 'paperId' and 'citations', where each
+  citation carries its own 'paperId'), loads the paper and every citing paper
+  through load_paper(), and tries to geocode each citing paper by looking its
+  institution names up in `addresses`.  The page is written to
+  reports/papers/<paper_id>.html.
+
+  NOTE(review): load_paper and write_table are not defined in this file;
+  presumably they come from `from util import *` -- confirm.
+
+  Args:
+    fn: path of the paper JSON file.
+    addresses: mapping of institution name -> address row, as built by
+      load_addresses().
+
+  Returns:
+    A dict with 'paperId', 'title', 'journal', 'authors' and 'citations'
+    (one entry per citing paper that could be loaded), or None when the
+    paper itself cannot be loaded.
+  """
+  # Imported locally so this function does not depend on a change to the
+  # file-level import list.
+  from html import escape
+
+  # Read the JSON up front so the input file is not held open while the
+  # citations are being processed.
+  with open(fn, 'r', encoding='utf-8') as f:
+    data = json.load(f)
+  print('>> {}'.format(data['paperId']))
+  paper = load_paper(data['paperId'])
+  if paper.data is None:
+    print("Paper missing! {}".format(data['paperId']))
+    return
+
+  res = {
+    'paperId': paper.paper_id,
+    'title': paper.title,
+    'journal': paper.journal,
+    'authors': paper.authors,
+    'citations': [],
+  }
+  geocoded_citations = []          # [title, institution, address row] -> embedded JSON
+  display_geocoded_citations = []  # same rows with the address joined for the table
+  unknown_citations = []           # [title, last institution seen or '']
+
+  for cite in data['citations']:
+    citation_id = cite['paperId']
+    citation = load_paper(citation_id)
+    if citation.data is None:
+      print("Citation missing! {}".format(citation_id))
+      continue
+    institutions = load_institutions(citation_id)
+    geocoded_institutions = []
+    institution = ''            # last institution seen; reported when nothing matched
+    matched_institution = None  # institution that produced `address`
+    address = None
+    # inst[1] is used as the institution name throughout.
+    for inst in sorted(institutions, key=operator.itemgetter(1)):
+      institution = inst[1]
+      found = _lookup_address(institution, addresses)
+      if found is not None:
+        # Keep the institution together with its own address so the report
+        # never pairs an address with a different, unmatched institution.
+        address = found
+        matched_institution = institution
+        geocoded_institutions.append(institution)
+    res['citations'].append({
+      'title': citation.title,
+      'journal': citation.journal,
+      'authors': citation.authors,
+      'institutions': [inst[1] for inst in institutions],
+      'geocoded': geocoded_institutions,
+    })
+    if geocoded_institutions:
+      geocoded_citations.append([
+        citation.title,
+        matched_institution,
+        address,
+      ])
+      display_geocoded_citations.append([
+        citation.title,
+        matched_institution,
+        ', '.join(address),
+      ])
+    else:
+      unknown_citations.append([
+        citation.title,
+        institution,
+      ])
+
+  geocoded_total = len(geocoded_citations)
+  total_citations = geocoded_total + len(unknown_citations)
+  # Integer arithmetic avoids float rounding (29 / 100 * 100 floors to 28) and
+  # the guard avoids dividing by zero for a paper with no usable citations.
+  percent = geocoded_total * 100 // total_citations if total_citations else 0
+
+  # Text interpolated into markup is escaped; str() keeps the previous
+  # rendering of non-string values such as None or an integer year.
+  title = escape(str(paper.title))
+
+  os.makedirs('reports/papers/', exist_ok=True)
+  # The page declares charset utf-8, so write it as UTF-8 on every platform.
+  with open('reports/papers/{}.html'.format(paper.paper_id), 'w', encoding='utf-8') as f:
+    f.write("<!doctype html>")
+    f.write("<html>")
+    f.write("<head>")
+    f.write('<meta charset="utf-8">')
+    f.write("<title>{}</title>".format(title))
+    f.write("<link rel='stylesheet' href='../reports.css'>")
+    f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>')
+    f.write("</head>")
+    f.write("<body>")
+    f.write("<div id='mapid'></div>")
+    f.write("<h2>{}</h2>".format(title))
+    f.write('<ul>')
+    if paper.journal:
+      f.write('<li>{}</li>'.format(escape(str(paper.journal))))
+    f.write('<li>{}</li>'.format(escape(str(paper.year))))
+    f.write('<li>{} / {} citations ({} %)</li>'.format(geocoded_total, total_citations, percent))
+    f.write('</ul>')
+    f.write('<h3>{}</h3>'.format('Geocoded Citations'))
+    write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0)))
+    f.write('<h3>{}</h3>'.format('Other Citations'))
+    write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
+    f.write("</body>")
+    f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
+    f.write('<script type="text/json">')
+    # "</" inside the data would end the script element early; "<\/" is the
+    # same string to a JSON parser.
+    f.write(json.dumps(geocoded_citations).replace('</', '<\\/'))
+    f.write('</script>')
+    f.write('<script src="../map.js"></script>')
+    f.write("</html>")
+  return res
+
+def _lookup_address(institution, addresses):
+  """Return the address row for `institution`, or None if it has no match.
+
+  The full name is tried first.  Otherwise the name is split on ', ' and the
+  last part found in `addresses` supplies the row.
+  """
+  if institution in addresses:
+    return addresses[institution]
+  matching_parts = [part for part in institution.split(', ') if part in addresses]
+  if matching_parts:
+    return addresses[matching_parts[-1]]
+  return None
+
+def load_addresses():
+  """Index the rows of reports/all_institutions.csv by their first column.
+
+  Returns a dict mapping row[0] to the whole row; when two rows share the
+  same first column the later row replaces the earlier one.
+  """
+  rows = read_csv('reports/all_institutions.csv', keys=None)
+  return {row[0]: row for row in rows}
+
+def load_institutions(paperId):
+  """Return the 'institutions' entry stored for a paper, or [] if none exists.
+
+  institutions.json is looked for under the paper's 'pdf' directory first and
+  then under its 'doi' directory; the first file found is the one that is read.
+  """
+  for source in ('pdf', 'doi'):
+    candidate = os.path.join(data_path(source, paperId), 'institutions.json')
+    if os.path.exists(candidate):
+      return read_json(candidate)['institutions']
+  return []
+
+def data_path(key, paper_id):
+  """Return the dataset directory for a paper: datasets/s2/<key>/<first two chars of id>/<id>."""
+  shard = paper_id[:2]
+  template = 'datasets/s2/{}/{}/{}'
+  return template.format(key, shard, paper_id)
+  
+# Script entry point: run the click command when this file is executed directly.
+if __name__ == '__main__':
+  s2_citation_report()