Diffstat (limited to 'scraper/s2-citation-report.py')
-rw-r--r--  scraper/s2-citation-report.py | 91
1 file changed, 88 insertions(+), 3 deletions(-)
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
index 5c5fae9a..d70a378a 100644
--- a/scraper/s2-citation-report.py
+++ b/scraper/s2-citation-report.py
@@ -5,9 +5,11 @@ import simplejson as json
import math
import operator
import click
-#import builder
+import subprocess
from util import *
+DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
+
@click.command()
def s2_citation_report():
    addresses = AddressBook()
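
Note: write_master_report() in the hunk below writes into DIR_PUBLIC_CITATIONS before uploading, and the patch assumes that directory already exists. A minimal guard, as a sketch (not part of the patch), would create it at startup:

    import os

    # Create the public citations directory on a fresh checkout;
    # exist_ok avoids raising if it is already there.
    os.makedirs(DIR_PUBLIC_CITATIONS, exist_ok=True)
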
@@ -30,6 +32,82 @@ def s2_citation_report():
print("citations: {}".format(paper_count))
print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
+ write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
+
+ sts = subprocess.call([
+ "s3cmd", "put", "-P", "--recursive",
+ DIR_PUBLIC_CITATIONS + '/',
+ "s3://megapixels/v1/citations/",
+ ])
+
+def write_master_report(fn, papers):
+    # first make a lookup of the keys that have papers
+    paper_key_lookup = {}
+    for paper in papers:
+        if paper['key'] not in paper_key_lookup:
+            paper_key_lookup[paper['key']] = paper
+
+    # then fetch the statistics sheet, which has fields like "year"
+    fields, rows = fetch_google_sheet('statistics')
+    master_papers = []
+    statistics = {}
+
+    def clean(n):
+        if type(n) is int:
+            return n
+        if type(n) is str and n:
+            s = str(n).replace(',', '').replace('.', '').replace('?', '').strip()
+            try:
+                return int(s)
+            except ValueError:
+                return s
+        if n:
+            return n
+        return None
+
+    for row in rows:
+        key = row[0]
+        if key not in paper_key_lookup:
+            continue
+        paper = paper_key_lookup[key]
+        stats = {}
+        for index, field in enumerate(fields):
+            stats[field] = row[index]
+        report_fn = '../site/content/datasets/{}/index.md'.format(key)
+        has_report = os.path.exists(report_fn)
+        statistics[key] = stats
+        search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
+
+        image_count = stats['images']
+        if type(image_count) is str:
+            if len(image_count):
+                image_count = clean(image_count)
+            else:
+                image_count = None
+        master_papers.append([
+            stats['key'],
+            stats['name'],
+            '/datasets/{}/'.format(key) if has_report else '',
+            image_count,
+            clean(stats['faces_unique']) or None,
+            stats['year_published'],
+            clean(paper['citation_count']) or 0,
+            clean(search_result['citationStats']['numKeyCitations']) or 0,
+            # origin
+        ])
+    master_paper_keys = [
+        'key',
+        'title',
+        'link',
+        'images',
+        'people',
+        'year',
+        'citations',
+        'influenced',
+        # 'origin'
+    ]
+    write_csv(fn, keys=master_paper_keys, rows=master_papers)
+
def write_papers_report(fn, title, papers, key, reverse=False):
    sorted_papers = []
    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
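
The s3cmd upload in the hunk above stores the exit status in sts but never checks it. A sketch of a stricter variant, assuming a failed upload should abort the run (the check is an assumption added here, not what the patch does):

    import subprocess

    # subprocess.run(..., check=True) raises CalledProcessError on a
    # non-zero s3cmd exit status instead of continuing silently.
    subprocess.run([
        "s3cmd", "put", "-P", "--recursive",
        DIR_PUBLIC_CITATIONS + '/',
        "s3://megapixels/v1/citations/",
    ], check=True)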
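
clean() normalizes spreadsheet cells that mix integers and formatted strings. It is defined inside write_master_report, but if it were hoisted to module level it would behave like this sketch (example inputs are illustrative):

    assert clean("3,425,806") == 3425806  # separators stripped, parsed as int
    assert clean("unknown") == "unknown"  # non-numeric strings pass through
    assert clean(42) == 42                # ints are returned unchanged
    assert clean("") is None              # empty cells become None
    # Caveat: '.' is stripped as well, so clean("1.5") == 15, not 1.5.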
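
The column order in master_papers matches master_paper_keys, so the published datasets.csv can be read by column name. A sketch of a downstream consumer, assuming util's write_csv emits those keys as a header row:

    import csv

    # Each row describes one dataset: key, title, link, images, people,
    # year, citations, influenced.
    with open('../site/datasets/citations/datasets.csv') as f:
        for row in csv.DictReader(f):
            print(row['key'], row['title'], row['citations'])
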
@@ -105,7 +183,7 @@ def process_paper(row, addresses, success):
    with open(fn, 'r') as f:
        data = json.load(f)
-    print('>> {}'.format(data['paperId']))
+    print('>> {} {}'.format(data['paperId'], row['key']))
    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
@@ -145,7 +223,7 @@ def process_paper(row, addresses, success):
            pdf_count += 1
        if has_doi:
            doi_count += 1
-        if citation.data is None:
+        if citation is None or citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue
        institutions = load_institutions(citationId)
@@ -262,6 +340,13 @@ def process_paper(row, addresses, success):
        f.write('<script src="../map.js"></script>')
        f.write("</html>")
    # template = env.get_template('paper.html')
+    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+        json.dump({
+            'id': paper.paper_id,
+            'paper': res,
+            'address': paper_address,
+            'citations': geocoded_citations,
+        }, f)
    return res
def load_megapixels_queries():
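
Alongside datasets.csv, each dataset now gets a <key>.json under the same public prefix. A sketch of reading one back, assuming the structure written by the json.dump above ('duke_mtmc' is a hypothetical key used only for illustration):

    import json

    # The file holds the paper id, the report dict, the paper's geocoded
    # address, and the list of geocoded citations.
    with open('../site/datasets/citations/duke_mtmc.json') as f:
        data = json.load(f)
    print(data['id'], len(data['citations']))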