Diffstat (limited to 'scraper/s2-citation-report.py')
| -rw-r--r-- | scraper/s2-citation-report.py | 378 |
1 file changed, 378 insertions, 0 deletions
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
new file mode 100644
index 00000000..580312ff
--- /dev/null
+++ b/scraper/s2-citation-report.py
@@ -0,0 +1,378 @@
+import os
+import re
+import glob
+import simplejson as json
+import math
+import operator
+import click
+import subprocess
+from util import *
+
+DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
+
+@click.command()
+def s2_citation_report():
+    addresses = AddressBook()
+    megapixels = load_megapixels_queries()
+    successful_geocodes = {}
+    papers = []
+    for row in megapixels:
+        paper_data = process_paper(row, addresses, successful_geocodes)
+        if paper_data is not None:
+            papers.append(paper_data)
+    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
+    write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)
+
+    paper_count = 0
+    geocode_count = 0
+    for key, value in successful_geocodes.items():
+        if value:
+            geocode_count += 1
+        paper_count += 1
+    print("citations: {}".format(paper_count))
+    print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
+
+    write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
+
+    sts = subprocess.call([
+        "s3cmd", "put", "-P", "--recursive",
+        DIR_PUBLIC_CITATIONS + '/',
+        "s3://megapixels/v1/citations/",
+    ])
+
+def write_master_report(fn, papers):
+    # first make a lookup of the keys that have papers
+    paper_key_lookup = {}
+    for paper in papers:
+        if paper['key'] not in paper_key_lookup:
+            paper_key_lookup[paper['key']] = paper
+
+    # then fetch the statistics csv which has things like "year"
+    fields, rows = fetch_google_sheet('statistics')
+    master_papers = []
+    statistics = {}
+
+    def clean(n):
+        if type(n) is int:
+            return n
+        if type(n) is str and n:
+            s = str(n).replace(',','').replace('.','').replace('?','').strip()
+            try:
+                return int(s)
+            except ValueError:
+                return s
+        if n:
+            return n
+        return None
+
+    for row in rows:
+        key = row[0]
+        if key not in paper_key_lookup:
+            continue
+        paper = paper_key_lookup[key]
+        stats = {}
+        for index, field in enumerate(fields):
+            stats[field] = row[index]
+        report_fn = '../site/content/datasets/{}/index.md'.format(key)
+        has_report = os.path.exists(report_fn)
+        statistics[key] = stats
+        search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
+
+        image_count = stats['images']
+        if type(image_count) is str:
+            if len(image_count):
+                image_count = clean(image_count)
+            else:
+                image_count = None
+        master_papers.append([
+            stats['key'],
+            stats['name'],
+            '/datasets/{}/'.format(key) if has_report else '',
+            image_count,
+            clean(stats['faces_unique']) or None,
+            stats['year_published'],
+            clean(paper['citation_count']) or 0,
+            clean(search_result['citationStats']['numKeyCitations']) or 0,
+            # origin
+        ])
+    master_paper_keys = [
+        'key',
+        'title',
+        'link',
+        'images',
+        'people',
+        'year',
+        'citations',
+        'influenced',
+        # 'origin'
+    ]
+    write_csv(fn, keys=master_paper_keys, rows=master_papers)
+
+def write_papers_report(fn, title, papers, key, reverse=False):
+    sorted_papers = []
+    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
+        sorted_papers.append([
+            paper['paperId'],
+            paper['key'],
+            paper['name'],
+            LinkLine(paper['report_link'], paper['title']),
+            LinkLine(paper['pdf_link'], '[pdf]'),
+            paper['journal'],
+            paper['address_type'],
+            paper['address'],
+            paper['lat'],
+            paper['lng'],
+            str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%',
+            paper['citation_count'],
+            paper['citations_geocoded'],
+            paper['citations_unknown'],
+            paper['citations_empty'],
+            paper['citations_pdf'],
+            paper['citations_doi'],
+        ])
+    sorted_paper_keys = [
+        'Paper ID',
+        'Megapixels Key',
+        'Megapixels Name',
+        'Report Link',
+        'PDF Link',
+        'Journal',
+        'Type',
+        'Address',
+        'Lat',
+        'Lng',
+        'Coverage',
+        'Total Citations',
+        'Geocoded Citations',
+        'Unknown Citations',
+        'Empty Citations',
+        'With PDF',
+        'With DOI',
+    ]
+    write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)
+
+def process_paper(row, addresses, success):
+    res = {
+        'paperId': '',
+        'key': '',
+        'title': '',
+        'journal': '',
+        'address': '',
+        'address_type': '',
+        'lat': '',
+        'lng': '',
+        'pdf_link': '',
+        'report_link': '',
+        'citation_count': 0,
+        'citations_geocoded': 0,
+        'citations_unknown': 0,
+        'citations_empty': 0,
+        'citations_pdf': 0,
+        'citations_doi': 0,
+    }
+
+    geocoded_citations = []
+    unknown_citations = []
+    display_geocoded_citations = []
+    empty_citations = []
+    pdf_count = 0
+    doi_count = 0
+    address_count = 0
+
+    fn = file_path('papers', row['paper_id'], 'paper.json')
+
+    with open(fn, 'r') as f:
+        data = json.load(f)
+    print('>> {} {}'.format(data['paperId'], row['key']))
+    paper = load_paper(data['paperId'])
+    if paper is None:
+        print("Paper missing! {}".format(data['paperId']))
+        return
+
+    res['key'] = row['key']
+    res['name'] = row['name']
+    res['paperId'] = paper.paper_id
+    res['title'] = paper.title
+    res['journal'] = paper.journal
+    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
+    res['pdf_link'] = paper.pdf_link
+    # res['authors'] = ', '.join(paper.authors)
+    # res['citations'] = []
+
+    paper_institutions = load_institutions(paper.paper_id)
+    paper_address = None
+    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+        # print(inst[1])
+        institution = inst[1]
+        if paper_address is None:
+            paper_address = addresses.find(institution)
+
+    if paper_address:
+        # print(paper_address)
+        res['address'] = paper_address[0]
+        res['lat'] = paper_address[3]
+        res['lng'] = paper_address[4]
+        res['address_type'] = paper_address[5]
+
+    for cite in data['citations']:
+        citationId = cite['paperId']
+        citation = load_paper(citationId)
+        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
+        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
+        if has_pdf:
+            pdf_count += 1
+        if has_doi:
+            doi_count += 1
+        if citation is None or citation.data is None:
{}".format(cite['paperId'])) + continue + institutions = load_institutions(citationId) + geocoded_institutions = [] + unknown_institutions = [] + institution = '' + address = None + for inst in sorted(institutions, key=operator.itemgetter(1)): + # print(inst[1]) + address_count += 1 + institution = inst[1] + next_address = addresses.find(institution) + if next_address: + address = next_address + geocoded_institutions.append(institution) + else: + unknown_institutions.append(institution) + if not address: + if has_pdf: + headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation) + heading_string = '\n'.join(headings[0:20]) + found_addresses = [] + if len(headings): + for heading in headings: + l = heading.lower().strip() + if l: + next_address = addresses.find(l) + if next_address: + address = next_address + geocoded_institutions.append(heading) + else: + unknown_institutions.append(heading) + else: + empty_citations.append([ + citationId, + citation.title, + ]) + + # res['citations'].append({ + # 'title': citation.title, + # 'journal': citation.journal, + # 'authors': citation.authors, + # 'institutions': [inst[1] for inst in institutions], + # 'geocoded': geocoded_institutions, + # }) + if address: + success[citationId] = True + geocoded_citations.append([ + citation.title, + institution, + ] + address + [ + citation.year, + ]) + display_geocoded_citations.append([ + citationId, + LinkLine(citation.pdf_link, '[pdf]'), + citation.title, + ] + address[0:5]) + else: + success[citationId] = False + unknown_citations.append([ + citationId, + LinkLine(citation.pdf_link, '[pdf]'), + citation.title, + '<br>'.join(unknown_institutions), + ]) + res['citation_count'] = len(data['citations']) + res['citations_geocoded'] = len(geocoded_citations) + res['citations_unknown'] = len(unknown_citations) + res['citations_empty'] = len(empty_citations) + res['citations_pdf'] = pdf_count + res['citations_doi'] = doi_count + + total_citations = len(geocoded_citations) + len(unknown_citations) + os.makedirs('reports/papers/', exist_ok=True) + with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f: + f.write("<!doctype html>") + f.write("<html>") + f.write("<head>") + f.write('<meta charset="utf-8">') + f.write("<title>{}</title>".format(paper.title)) + f.write("<link rel='stylesheet' href='../reports.css'>") + f.write('<link rel="stylesheet" href="https://unpkg.com/leaflet@1.3.4/dist/leaflet.css" integrity="sha512-puBpdR0798OZvTTbP4A8Ix/l+A4dHDD0DGqYW6RQ+9jxkRFclaxxQb/SJAWZfWAkuyeQUytO7+7N4QKrDh+drA==" crossorigin=""/>') + f.write("</head>") + f.write("<body>") + f.write("<div id='mapid'></div>") + f.write("<h2>{}</h2>".format(paper.title)) + f.write('<ul>') + if paper.journal: + f.write('<li>Journal: {}</li>'.format(paper.journal)) + if paper_address: + f.write('<li>Research institution: {}</li>'.format(paper_address[0])) + f.write('<li>Address: {}</li>'.format(paper_address[2])) + f.write('<li>Lat/Lng: {}, {}</li>'.format(paper_address[3], paper_address[4])) + f.write('<li>Year: {}</li>'.format(paper.year)) + if total_citations == 0: + f.write('<li>Coverage: No citations found!</li>') + else: + f.write('<li>Coverage: {} / {} citations were located ({} %).</li>'.format(len(geocoded_citations), total_citations, math.floor(len(geocoded_citations) / total_citations * 100))) + f.write('</ul>') + f.write('<h3>{}</h3>'.format('Geocoded Citations')) + write_table(f, keys=None, rows=sorted(display_geocoded_citations, key=operator.itemgetter(0))) + 
+        f.write('<h3>{}</h3>'.format('Other Citations'))
+        write_table(f, keys=None, rows=sorted(unknown_citations, key=operator.itemgetter(0)))
+        f.write("</body>")
+        f.write('<script src="../snap.svg-min.js"></script>')
+        f.write('<script src="https://unpkg.com/leaflet@1.3.4/dist/leaflet.js" integrity="sha512-nMMmRyTVoLYqjP9hrbed9S+FzjZHW5gY1TWCHA5ckwXZBadntCNs8kEqAWdrb9O7rxbCaA4lKTIWjDXZxflOcA==" crossorigin=""></script>')
+        f.write('<script src="../leaflet.arc.js"></script>')
+        f.write('<script src="../leaflet.bezier.js"></script>')
+        f.write('<script type="text/json" id="address">')
+        json.dump(paper_address, f)
+        f.write('</script>')
+        f.write('<script type="text/json" id="citations">')
+        json.dump(geocoded_citations, f)
+        f.write('</script>')
+        f.write('<script src="../map.js"></script>')
+        f.write("</html>")
+    # template = env.get_template('paper.html')
+    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+        json.dump({
+            'id': paper.paper_id,
+            'paper': res,
+            'address': paper_address,
+            'citations': geocoded_citations,
+        }, f)
+    return res
+
+def load_megapixels_queries():
+    keys, rows = read_csv('datasets/citation_lookup.csv')
+    recs = []
+    for row in rows:
+        rec = {}
+        for index, key in enumerate(keys):
+            rec[key] = row[index]
+        recs.append(rec)
+    return recs
+
+def load_institutions(paperId):
+    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
+        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
+    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
+        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
+    else:
+        return []
+
+def data_path(key, paper_id):
+    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)
+def file_path(key, paper_id, fn):
+    return os.path.join(data_path(key, paper_id), fn)
+
+if __name__ == '__main__':
+    s2_citation_report()
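
Note: the script imports a project-local util module (from util import *) for helpers such as AddressBook, load_paper, percent, read_csv, read_json, write_csv, write_report, write_table, read_headings, fetch_google_sheet, and LinkLine, none of which appear in this diff. As a minimal sketch only, assuming percent returns a floored integer percentage (matching the math.floor coverage calculation in the HTML report) and read_json simply parses a file from disk, two of these helpers might look like:

    # Illustrative sketches of two assumed util helpers;
    # the project's actual util module may differ.
    import math
    import simplejson as json

    def percent(part, total):
        # Floored integer percentage; returns 0 for an empty denominator
        # so papers with no citations do not raise ZeroDivisionError.
        if not total:
            return 0
        return math.floor(part / total * 100)

    def read_json(fn):
        # Load and decode a JSON file from disk.
        with open(fn, 'r') as f:
            return json.load(f)

Since s2_citation_report is a bare @click.command() with no options, the report is presumably generated by running python s2-citation-report.py from the scraper/ directory; the final s3cmd put -P --recursive call then publishes the generated citation JSON to s3://megapixels/v1/citations/.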
