Diffstat (limited to 'scraper/s2-final-report.py')
-rw-r--r--  scraper/s2-final-report.py | 76
1 file changed, 46 insertions, 30 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index c369fa6f..854aa940 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -62,9 +62,12 @@ def process_paper(row, verified_lookup):
         papers.append(res)
         if res['address']:
             address_list.append(res['address'])
+    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)
     if not len(papers):
         return
     paper = papers[0]
+
+    # final citations - a report of all geocoded citations
     with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -74,11 +77,16 @@ def process_paper(row, verified_lookup):
             'additional_papers': papers[1:],
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+
+    # unknown citations - a report of all non-geocoded citations
     with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': papers[0]['paper_id'],
             'citations': [unknown_citations[key] for key in unknown_citations.keys()],
         }, f)
+
+    # "public" citations - initial citation reports digested by the geocoding frontend -bad name i know
+    # this might not need to get built...
     with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -91,6 +99,8 @@ def process_paper(row, verified_lookup):
             },
             'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
         }, f)
+
+    # verified citations - the final public reports
     with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
         json.dump({
             'id': paper['paper_id'],
@@ -121,38 +131,44 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # 'citations_doi': 0,
     }
 
-    fn = file_path('papers', paper_id, 'paper.json')
-
-    with open(fn, 'r') as f:
-        data = json.load(f)
-        print('>> {} {}'.format(data['paperId'], row['key']))
-        paper = load_paper(data['paperId'])
-        if paper is None:
-            print("Paper missing! {}".format(data['paperId']))
-            return
+    if paper_id == 'search':
+        dataset = row['key']
+        fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
+        with open(fn, 'r') as f:
+            citations = json.load(f)
+        data = { 'citations': [ { 'paperId': paperId } for paperId in citations ] }
+    else:
+        fn = file_path('papers', paper_id, 'paper.json')
+        with open(fn, 'r') as f:
+            data = json.load(f)
+            print('>> {} {}'.format(data['paperId'], row['key']))
+            paper = load_paper(data['paperId'])
+            if paper is None:
+                print("Paper missing! {}".format(data['paperId']))
+                return
 
-        res['key'] = row['key']
-        res['name'] = row['name']
-        res['paper_id'] = paper.paper_id
-        res['title'] = paper.title
-        # res['journal'] = paper.journal
-        res['year'] = paper.year
-        res['pdf'] = paper.pdf_links()
-        res['doi'] = paper.doi_links()
-        # res['authors'] = ', '.join(paper.authors)
-        # res['citations'] = []
+    res['key'] = row['key']
+    res['name'] = row['name']
+    res['paper_id'] = paper.paper_id
+    res['title'] = paper.title
+    # res['journal'] = paper.journal
+    res['year'] = paper.year
+    res['pdf'] = paper.pdf_links()
+    res['doi'] = paper.doi_links()
+    # res['authors'] = ', '.join(paper.authors)
+    # res['citations'] = []
 
-        paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
-        paper_address = None
-        for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
-            #print(inst[1])
-            institution = inst[1]
-            if paper_address is None:
-                paper_address = addresses.findObject(institution)
+    paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
+    paper_address = None
+    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
+        #print(inst[1])
+        institution = inst[1]
+        if paper_address is None:
+            paper_address = addresses.findObject(institution)
 
-        if paper_address:
-            # print(paper_address)
-            res['address'] = paper_address
+    if paper_address:
+        # print(paper_address)
+        res['address'] = paper_address
 
     for cite in data['citations']:
         citationId = cite['paperId']
@@ -169,7 +185,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
         # if has_doi:
         #     doi_count += 1
         if citation is None or citation.data is None:
-            print("Citation missing! {}".format(cite['paperId']))
+            print("Citation missing! {}".format(citationId))
             continue
         institutions = load_institutions(citationId, paper_location_lookup)
         geocoded_addresses = []
