diff options
Diffstat (limited to 'scraper/util.py')
| -rw-r--r-- | scraper/util.py | 22 |
1 file changed, 20 insertions, 2 deletions
diff --git a/scraper/util.py b/scraper/util.py index bb210012..899089f7 100644 --- a/scraper/util.py +++ b/scraper/util.py @@ -7,6 +7,9 @@ import gspread from multiprocessing import Pool import simplejson as json from oauth2client.service_account import ServiceAccountCredentials +#from s2 import SemanticScholarAPI + +#s2 = SemanticScholarAPI() def read_citation_list(index=0): filename = './datasets/citations.csv' @@ -176,8 +179,21 @@ class RawPaper(object): data = read_json(paper_path('raw_papers', paper_id)) # {'responseType': 'CANONICAL', 'canonicalId': '02ccd5f0eb9a48a6af088197b950fb30a8e3abcc', 'canonicalSlug': 'Scaling-for-Multimodal-3-D-Object-Detection-Stanford'} if 'responseType' in data and data['responseType'] == 'CANONICAL': - data = read_json(paper_path('raw_papers', data['canonicalId'])) - + canonical_id = data['canonicalId'] + canonical_path = paper_path('raw_papers', canonical_id) + if os.path.exists(canonical_path): + data = read_json(canonical_path) + else: + # print('fetching canonical paper {}'.format(canonical_id)) + # os.makedirs(paper_path('raw_papers', canonical_id).replace('paper.json', ''), exist_ok=True) + # data = s2.raw_paper(canonical_id) + data = None + if data is None: + # print("Got empty canonical paper?? {}".format(canonical_id)) + self.data = None + return None + #else: + #write_json(canonical_path, data) # print(data) if 'paper' not in data: print(data) @@ -200,6 +216,8 @@ class RawPaper(object): def authors(self): return [ (author[0]['ids'][0] if len(author[0]['ids']) else '', author[0]['name']) for author in self.data['authors'] ] def paper_links(self): + if self.data is None: + return [] def url_part(link): if type(link) == dict and 'url' in link: return link['url'] |
