diff options
Diffstat (limited to 'scraper/s2-final-report.py')
| -rw-r--r-- | scraper/s2-final-report.py | 205 |
1 files changed, 205 insertions, 0 deletions
import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess
from util import *

# Output directory for the per-dataset citation JSON consumed by the site.
DIR_PUBLIC_CITATIONS = "../site/datasets/final"


@click.command()
def s2_final_report():
    """Build the final per-dataset citation reports.

    For every dataset in the citation lookup CSV that is flagged for
    full-text sharing in the 'datasets' Google sheet, aggregate the
    geocoded citations of all its papers and write one JSON file per
    dataset into DIR_PUBLIC_CITATIONS.
    """
    addresses = AddressBook()
    megapixels = load_megapixels_lookup()
    ft_lookup = load_ft_lookup()
    for key, row in megapixels.items():
        print(key)
        # .get() instead of []: datasets present in the CSV but missing
        # from the sheet are skipped instead of raising KeyError.
        if ft_lookup.get(key):
            process_paper(row, addresses)


def process_paper(row, addresses):
    """Process every paper id attached to one dataset row and write the
    aggregated citation JSON for that dataset.

    row       -- dataset record from load_megapixels_lookup() (has
                 'key' and 'paper_ids').
    addresses -- AddressBook used to geocode institution names.

    Returns None; writes '<DIR_PUBLIC_CITATIONS>/<key>.json' when at
    least one paper could be processed.
    """
    aggregate_citations = {}
    papers = []
    for paper_id in row['paper_ids']:
        # BUGFIX: pass the loop's paper_id through. Previously every
        # iteration re-processed row['paper_id'], so all entries in
        # 'additional_papers' were duplicates of the primary paper.
        res = process_single_paper(row, addresses, aggregate_citations, paper_id)
        if res:
            papers.append(res)
    if not papers:
        return
    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': papers[0]['paper_id'],
            'paper': papers[0],
            'address': papers[0]['address'],
            'additional_papers': papers[1:],
            # dict preserves insertion order, so this matches the
            # original key-iteration comprehension.
            'citations': list(aggregate_citations.values()),
        }, f)


def process_single_paper(row, addresses, aggregate_citations, paper_id=None):
    """Build the report record for one paper and fold its citations into
    aggregate_citations (keyed by citation paper id).

    paper_id defaults to row['paper_id'] for backward compatibility
    with callers that pass only three arguments.

    Returns the record dict, or None when the paper data is missing.
    """
    res = {
        'paper_id': '',
        'key': '',
        'title': '',
        'journal': '',
        'year': '',  # TODO(review): never populated — confirm paper objects carry a year
        'address': '',
        'pdf_link': '',
        'citation_count': 0,
        'citations_geocoded': 0,
        'citations_unknown': 0,
        'citations_empty': 0,
        'citations_pdf': 0,
        'citations_doi': 0,
    }

    pdf_count = 0
    doi_count = 0
    address_count = 0
    if paper_id is None:
        paper_id = row['paper_id']

    fn = file_path('papers', paper_id, 'paper.json')
    with open(fn, 'r') as f:
        data = json.load(f)

    print('>> {} {}'.format(data['paperId'], row['key']))
    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res['key'] = row['key']
    res['name'] = row['name']
    res['paper_id'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
    res['pdf_link'] = paper.pdf_link

    # Use the first institution (sorted by name) that geocodes as the
    # paper's own address.
    paper_address = None
    for inst in sorted(load_institutions(paper.paper_id), key=operator.itemgetter(1)):
        if paper_address is None:
            paper_address = addresses.findObject(inst[1])
    if paper_address:
        res['address'] = paper_address

    for cite in data['citations']:
        citationId = cite['paperId']
        if citationId in aggregate_citations:
            # Already aggregated via another paper of this dataset.
            continue
        seen_here = {}  # addresses already attached to this citation
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation is None or citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        geocoded_addresses = []
        address = None
        for inst in sorted(load_institutions(citationId), key=operator.itemgetter(1)):
            address_count += 1
            next_address = addresses.findObject(inst[1])
            if next_address and next_address['address'] not in seen_here:
                seen_here[next_address['address']] = True
                address = next_address
                geocoded_addresses.append(next_address)

        if not address and has_pdf:
            # Fallback: scan the extracted PDF text headings for
            # anything the address book recognizes.
            headings, found_abstract = read_headings(
                file_path('pdf', citationId, 'paper.txt'), citation)
            for heading in headings:
                h = heading.lower().strip()
                if not h:
                    continue
                next_address = addresses.findObject(h)
                if next_address and next_address['address'] not in seen_here:
                    seen_here[next_address['address']] = True
                    address = next_address
                    geocoded_addresses.append(next_address)

        if address:
            pdf_link = citation.pdf_link
            # pdf_link is sometimes a dict with a 'url' key.
            if isinstance(pdf_link, dict) and 'url' in pdf_link:
                pdf_link = pdf_link['url']
            aggregate_citations[citationId] = {
                'id': citationId,
                'title': citation.title,
                'addresses': geocoded_addresses,
                'year': citation.year,
                'pdf': pdf_link,
            }

    # BUGFIX: these fields were declared above but the assignments were
    # commented out, so every report showed zeros. Fill in the stats
    # this function actually tracks; the geocoded/unknown/empty lists
    # were never populated in the original and have been removed.
    res['citation_count'] = len(data['citations'])
    res['citations_pdf'] = pdf_count
    res['citations_doi'] = doi_count

    return res


def load_ft_lookup():
    """Map dataset key -> bool from the 'datasets' Google sheet.

    True when the row's 'ft_share' column is '1' (or the number 1).
    """
    keys, rows = fetch_google_sheet('datasets')
    lookup = {}
    for row in rows:
        rec = dict(zip(keys, row))
        lookup[rec['key']] = rec['ft_share'] in ('1', 1)
    return lookup


def load_megapixels_lookup():
    """Map dataset key -> row dict from datasets/citation_lookup.csv.

    Rows sharing a key are merged: the first row wins for the scalar
    columns and every row's paper_id is collected into 'paper_ids'.
    """
    keys, rows = read_csv('datasets/citation_lookup.csv')
    lookup = {}
    for row in rows:
        rec = dict(zip(keys, row))
        paper_key = rec['key']
        if paper_key not in lookup:
            rec['paper_ids'] = []
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
    return lookup


def load_institutions(paperId):
    """Return the institutions list extracted from the PDF pipeline,
    falling back to the DOI pipeline, or [] when neither file exists."""
    for source in ('pdf', 'doi'):
        fn = file_path(source, paperId, 'institutions.json')
        if os.path.exists(fn):
            return read_json(fn)['institutions']
    return []


def data_path(key, paper_id):
    """Directory for one paper's files, sharded by the id's first two chars."""
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    """Path to a named file inside a paper's data directory."""
    return os.path.join(data_path(key, paper_id), fn)


if __name__ == '__main__':
    s2_final_report()
