From c1ce13b41b595847f18d2f7232850b10cd677e66 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Sat, 16 Feb 2019 14:21:32 +0100
Subject: get better pdf url

---
 scraper/util.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

(limited to 'scraper/util.py')

diff --git a/scraper/util.py b/scraper/util.py
index 6c671cec..1b1a0a9b 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -195,25 +195,37 @@ class RawPaper(object):
     @property
     def authors(self):
         return [ (author[0]['ids'][0] if len(author[0]['ids']) else '', author[0]['name']) for author in self.data['authors'] ]
-    @property
-    def pdf_link(self):
-        if 'primaryPaperLink' in self.data:
-            link = self.data['primaryPaperLink']
+    def paper_links(self):
+        def url_part(link):
             if type(link) == dict and 'url' in link:
                 return link['url']
             return link
-        return None
+        paper_links = []
+        if 'primaryPaperLink' in self.data:
+            paper_links.append(url_part(self.data['primaryPaperLink']))
+        if 'alternatePaperLinks' in self.data:
+            for link in self.data['alternatePaperLinks']:
+                paper_links.append(url_part(link))
+    def pdf_links(self):
+        return [ link for link in self.paper_links() if 'pdf' in link ]
+    def doi_links(self):
+        return [ link for link in self.paper_links() if 'pdf' not in link ]
+    @property
+    def pdf_link(self):
+        links = self.pdf_links()
+        return links[0] if len(links) else None
     def record(self):
         return [ self.paper_id, self.title, self.journal, self.year ]
 
 def load_paper(paper_id):
-    if os.path.exists(paper_path('db_papers', paper_id)):
-        # print('db paper')
-        return DbPaper(paper_id)
+    # no longer using DB papers :p
+    # if os.path.exists(paper_path('db_papers', paper_id))
+    #     print('db paper')
+    #     return DbPaper(paper_id)
     if os.path.exists(paper_path('raw_papers', paper_id)):
         # print('raw paper')
         return RawPaper(paper_id)
-    print('no paper')
+    print('no raw paper: {}'.format(paper_id))
    return None
 
 def dedupe(a):
-- 
cgit v1.2.3-70-g09d2
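
Note on the committed code: as added in this patch, paper_links() builds the paper_links list but never returns it, so pdf_links() and doi_links() iterate over None and the pdf_link property raises a TypeError on first use. Below is a minimal standalone sketch of the apparent intent with the missing return added; the module-level helper names and the sample data dict are illustrative stand-ins for a RawPaper instance, not code from the repository.

def url_part(link):
    # Paper links may be plain URL strings or dicts carrying a 'url' key.
    if isinstance(link, dict) and 'url' in link:
        return link['url']
    return link

def paper_links(data):
    # Collect the primary link first, then any alternates, as the patch does.
    links = []
    if 'primaryPaperLink' in data:
        links.append(url_part(data['primaryPaperLink']))
    if 'alternatePaperLinks' in data:
        for link in data['alternatePaperLinks']:
            links.append(url_part(link))
    return links  # the committed method falls through without this return

def pdf_link(data):
    # Prefer the first link whose URL mentions 'pdf', otherwise None,
    # mirroring the pdf_links()/pdf_link logic in the patch.
    pdf_links = [link for link in paper_links(data) if 'pdf' in link]
    return pdf_links[0] if pdf_links else None

if __name__ == '__main__':
    # Hypothetical payload shaped like the fields the patch reads.
    sample = {
        'primaryPaperLink': {'url': 'https://doi.org/10.1000/example'},
        'alternatePaperLinks': [{'url': 'https://example.org/paper.pdf'}],
    }
    print(pdf_link(sample))  # -> https://example.org/paper.pdf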