diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2019-02-16 14:21:32 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-02-16 14:21:32 +0100 |
| commit | c1ce13b41b595847f18d2f7232850b10cd677e66 (patch) | |
| tree | 7ce70a6899f49ac3633b6364bf26b9da762a89d8 /scraper/util.py | |
| parent | 3a3a89f2c58eceee07b2cfcfb1700a61b34619e5 (diff) | |
get better pdf url
Diffstat (limited to 'scraper/util.py')
| -rw-r--r-- | scraper/util.py | 30 |
1 files changed, 21 insertions, 9 deletions
```diff
diff --git a/scraper/util.py b/scraper/util.py
index 6c671cec..1b1a0a9b 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -195,25 +195,37 @@ class RawPaper(object):
   @property
   def authors(self):
     return [ (author[0]['ids'][0] if len(author[0]['ids']) else '', author[0]['name']) for author in self.data['authors'] ]
-  @property
-  def pdf_link(self):
-    if 'primaryPaperLink' in self.data:
-      link = self.data['primaryPaperLink']
+  def paper_links(self):
+    def url_part(link):
       if type(link) == dict and 'url' in link:
         return link['url']
       return link
-    return None
+    paper_links = []
+    if 'primaryPaperLink' in self.data:
+      paper_links.append(url_part(self.data['primaryPaperLink']))
+    if 'alternatePaperLinks' in self.data:
+      for link in self.data['alternatePaperLinks']:
+        paper_links.append(url_part(link))
+  def pdf_links(self):
+    return [ link for link in self.paper_links() if 'pdf' in link ]
+  def doi_links(self):
+    return [ link for link in self.paper_links() if 'pdf' not in link ]
+  @property
+  def pdf_link(self):
+    links = self.pdf_links()
+    return links[0] if len(links) else None
   def record(self):
     return [ self.paper_id, self.title, self.journal, self.year ]
 
 def load_paper(paper_id):
-  if os.path.exists(paper_path('db_papers', paper_id)):
-    # print('db paper')
-    return DbPaper(paper_id)
+  # no longer using DB papers :p
+  # if os.path.exists(paper_path('db_papers', paper_id))
+  # print('db paper')
+  # return DbPaper(paper_id)
   if os.path.exists(paper_path('raw_papers', paper_id)):
     # print('raw paper')
     return RawPaper(paper_id)
-  print('no paper')
+  print('no raw paper: {}'.format(paper_id))
   return None
 
 def dedupe(a):
```
