1 files changed, 20 insertions, 2 deletions
diff --git a/scraper/util.py b/scraper/util.py
index bb210012..899089f7 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -7,6 +7,9 @@ import gspread
 from multiprocessing import Pool
 import simplejson as json
 from oauth2client.service_account import ServiceAccountCredentials
+#from s2 import SemanticScholarAPI
+
+#s2 = SemanticScholarAPI()
 
 def read_citation_list(index=0):
   filename = './datasets/citations.csv'
@@ -176,8 +179,21 @@ class RawPaper(object):
     data = read_json(paper_path('raw_papers', paper_id))
     # {'responseType': 'CANONICAL', 'canonicalId': '02ccd5f0eb9a48a6af088197b950fb30a8e3abcc', 'canonicalSlug': 'Scaling-for-Multimodal-3-D-Object-Detection-Stanford'}
     if 'responseType' in data and data['responseType'] == 'CANONICAL':
-      data = read_json(paper_path('raw_papers', data['canonicalId']))
-
+      canonical_id = data['canonicalId']  # id carried by the CANONICAL response (see sample above)
+      canonical_path = paper_path('raw_papers', canonical_id)  # where the canonical record would be stored locally
+      if os.path.exists(canonical_path):  # only read the canonical record if it is already on disk; NOTE(review): relies on `os` being imported at module top - confirm
+        data = read_json(canonical_path)
+      else:
+        # print('fetching canonical paper {}'.format(canonical_id))
+        # os.makedirs(paper_path('raw_papers', canonical_id).replace('paper.json', ''), exist_ok=True)
+        # data = s2.raw_paper(canonical_id)
+        data = None  # placeholder for the disabled s2.raw_paper() fetch above, so data is always None on this path
+        if data is None:  # always true while the fetch stays commented out
+          # print("Got empty canonical paper?? {}".format(canonical_id))
+          self.data = None  # leave the instance without data; accessors must tolerate None
+          return None  # bail out before the 'paper' lookup below, which would fail on None
+        #else:
+          #write_json(canonical_path, data)
     # print(data)
     if 'paper' not in data:
       print(data)
@@ -200,6 +216,11 @@ class RawPaper(object):
   def authors(self):
+    # self.data is None when the canonical record was not available locally; return no authors, like paper_links() does
+    if self.data is None:
+      return []
     return [ (author[0]['ids'][0] if len(author[0]['ids']) else '', author[0]['name']) for author in self.data['authors'] ]
   def paper_links(self):
+    if self.data is None:
+      return []
     def url_part(link):
       if type(link) == dict and 'url' in link:
         return link['url']