diff options
Diffstat (limited to 'scraper/util.py')
| -rw-r--r-- | scraper/util.py | 38 |
1 files changed, 32 insertions, 6 deletions
diff --git a/scraper/util.py b/scraper/util.py index 27465487..7b55afae 100644 --- a/scraper/util.py +++ b/scraper/util.py @@ -156,11 +156,16 @@ class DbPaper(object): return [ (author['ids'][0] if len(author['ids']) else '', author['name']) for author in self.data['authors'] ] @property def pdf_link(self): + link = None if self.data['s2PdfUrl']: - return self.data['s2PdfUrl'] - if len(self.data['pdfUrls']): - return self.data['pdfUrls'][0] - return None + link = self.data['s2PdfUrl'] + elif len(self.data['pdfUrls']): + link = self.data['pdfUrls'][0] + if link is None: + return None + if type(link) == dict and 'url' in link: + return link['url'] + return link def record(self): return [ self.paper_id, self.title, self.journal, self.year ] @@ -192,7 +197,10 @@ class RawPaper(object): @property def pdf_link(self): if 'primaryPaperLink' in self.data: - return self.data['primaryPaperLink'] + link = self.data['primaryPaperLink'] + if type(link) == dict and 'url' in link: + return link['url'] + return link return None def record(self): return [ self.paper_id, self.title, self.journal, self.year ] @@ -290,10 +298,28 @@ class AddressBook (object): 'address': row[0], 'lat': row[3], 'lng': row[4], - 'address_type': row[5], + 'type': row[5], } return None +def fetch_paper(s2, paper_id): + os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True) + paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id) + if os.path.exists(paper_fn): + return read_json(paper_fn) + print(paper_id) + paper = s2.paper(paper_id) + if paper is None: + print("Got none paper??") + # time.sleep(random.randint(1, 2)) + paper = s2.paper(paper_id) + if paper is None: + print("Paper not found") + return None + write_json(paper_fn, paper) + # time.sleep(random.randint(1, 2)) + return paper + def fetch_spreadsheet(): scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive'] credentials = ServiceAccountCredentials.from_json_keyfile_name('./.creds/Megapixels-ef28f91112a9.json', scope) |
