rebuild

author: Jules Laplace <julescarbon@gmail.com> 2018-12-16 16:29:04 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-12-16 16:29:04 +0100
commit: 05fc975a313aa38483d904cb9ad07a029641d086 (patch)
tree: 85f3d4fdf3688c2779d3ca3ba9c59910a48d1df9 /scraper/s2-citation-report.py
parent: 110f3a34f1f36d0ea999d4aa34bbe66d5f2a01da (diff)
1 files changed, 62 insertions, 5 deletions
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
index e0d812d7..b5849329 100644
--- a/scraper/s2-citation-report.py
+++ b/scraper/s2-citation-report.py
@@ -32,16 +32,73 @@ def s2_citation_report():
   print("citations: {}".format(paper_count))
   print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
 
-  # fetch_google_sheet
+  write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
 
   sts = subprocess.call([
-    "s3cmd", "sync",
+    "s3cmd", "put", "-P", "--recursive",
     DIR_PUBLIC_CITATIONS + '/',
     "s3://megapixels/v1/citations/",
   ])
 
-def write_master_report(fn, title, papers, key):
-  keys, rows = fetch_google_sheet('statistics')
+def write_master_report(fn, papers):
+  """Write the master datasets CSV to `fn`.
+
+  Joins `papers` (dicts read via 'key', 'paperId', 'citation_count') against
+  the rows of the 'statistics' Google sheet, matching on the first column;
+  sheet rows whose key has no matching paper are skipped.
+  """
+  # first make a lookup of the keys that have papers (first paper per key wins)
+  paper_key_lookup = {}
+  for paper in papers:
+    paper_key_lookup.setdefault(paper['key'], paper)
+
+  # then fetch the statistics csv which has things like "year"
+  fields, rows = fetch_google_sheet('statistics')
+  master_papers = []
+
+  def clean(n):
+    # parse a human-formatted count such as "1,234" or "500?"
+    if not isinstance(n, str):
+      return n  # not text (already a number, or None): pass through
+    digits = n.replace(',', '').replace('.', '').replace('?', '').strip()
+    # "" or "?" leaves nothing to parse; report that as unknown, not a crash
+    return int(digits) if digits else None
+
+  for row in rows:
+    key = row[0]
+    if key not in paper_key_lookup:
+      continue
+    paper = paper_key_lookup[key]
+    # map each sheet column name to this row's cell
+    stats = {field: row[index] for index, field in enumerate(fields)}
+    report_fn = '../site/content/datasets/{}/index.md'.format(key)
+    has_report = os.path.exists(report_fn)
+    search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
+
+    master_papers.append([
+      stats['key'],
+      stats['name'],
+      '/datasets/{}/'.format(key) if has_report else '',  # link only if the page exists
+      clean(stats['images']),  # None (not a 1-tuple) when the cell is empty
+      clean(stats['faces_unique']) or None,
+      stats['year_published'],
+      clean(paper['citation_count']) or 0,
+      clean(search_result['citationStats']['numKeyCitations']) or 0,
+      # origin
+    ])
+  # column headers, in the same order as the values appended to each row above
+  master_paper_keys = [
+    'key',
+    'title',
+    'link',
+    'images',
+    'people',
+    'year',
+    'citations',
+    'influenced',
+    # 'origin'
+  ]
+  write_csv(fn, keys=master_paper_keys, rows=master_papers)
 
 def write_papers_report(fn, title, papers, key, reverse=False):
   sorted_papers = []
@@ -275,7 +332,7 @@ def process_paper(row, addresses, success):
     f.write('<script src="../map.js"></script>')
     f.write("</html>")
   # template = env.get_template('paper.html')
-  with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, paper.paper_id), 'w') as f:
+  with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, paper.key), 'w') as f:
     json.dump({
       'id': paper.paper_id,
       'paper': res,
author	Jules Laplace <julescarbon@gmail.com>	2018-12-16 16:29:04 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-12-16 16:29:04 +0100
commit	05fc975a313aa38483d904cb9ad07a029641d086 (patch)
tree	85f3d4fdf3688c2779d3ca3ba9c59910a48d1df9 /scraper/s2-citation-report.py
parent	110f3a34f1f36d0ea999d4aa34bbe66d5f2a01da (diff)