summary | refs | log | tree | commit | diff
path: root/scraper/util.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/util.py')
-rw-r--r-- scraper/util.py 22
1 files changed, 20 insertions, 2 deletions
diff --git a/scraper/util.py b/scraper/util.py
index bb210012..899089f7 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -7,6 +7,9 @@ import gspread
from multiprocessing import Pool
import simplejson as json
from oauth2client.service_account import ServiceAccountCredentials
+#from s2 import SemanticScholarAPI
+
+#s2 = SemanticScholarAPI()
def read_citation_list(index=0):
filename = './datasets/citations.csv'
@@ -176,8 +179,21 @@ class RawPaper(object):
data = read_json(paper_path('raw_papers', paper_id))
# {'responseType': 'CANONICAL', 'canonicalId': '02ccd5f0eb9a48a6af088197b950fb30a8e3abcc', 'canonicalSlug': 'Scaling-for-Multimodal-3-D-Object-Detection-Stanford'}
if 'responseType' in data and data['responseType'] == 'CANONICAL':
- data = read_json(paper_path('raw_papers', data['canonicalId']))
-
+ canonical_id = data['canonicalId']
+ canonical_path = paper_path('raw_papers', canonical_id)
+ if os.path.exists(canonical_path):
+ data = read_json(canonical_path)
+ else:
+ # print('fetching canonical paper {}'.format(canonical_id))
+ # os.makedirs(paper_path('raw_papers', canonical_id).replace('paper.json', ''), exist_ok=True)
+ # data = s2.raw_paper(canonical_id)
+ data = None
+ if data is None:
+ # print("Got empty canonical paper?? {}".format(canonical_id))
+ self.data = None
+ return None
+ #else:
+ #write_json(canonical_path, data)
# print(data)
if 'paper' not in data:
print(data)
@@ -200,6 +216,8 @@ class RawPaper(object):
def authors(self):
    """Return the paper's authors as a list of ``(author_id, name)`` tuples.

    Each entry of ``self.data['authors']`` is indexed at position 0 to get a
    dict with an ``'ids'`` list and a ``'name'`` value. The first id is used
    as ``author_id``; an author with no ids gets an empty string instead.

    Returns an empty list when ``self.data`` is ``None`` (the record could
    not be loaded), mirroring the guard in ``paper_links``.
    """
    # The loader may leave self.data as None when a canonical record is not
    # available on disk; subscripting None would raise TypeError.
    if self.data is None:
        return []
    return [
        (author[0]['ids'][0] if author[0]['ids'] else '', author[0]['name'])
        for author in self.data['authors']
    ]
def paper_links(self):
+ if self.data is None:
+ return []
def url_part(link):
if type(link) == dict and 'url' in link:
return link['url']