diff options
Diffstat (limited to 'scraper/s2-dump-db-pdf-urls.py')
| -rw-r--r-- | scraper/s2-dump-db-pdf-urls.py | 23 |
1 file changed, 22 insertions, 1 deletion
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index 8895fa0d..1b8f74c0 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -94,10 +94,31 @@ def process_raw_paper(fn):
   paper = load_paper(paper_id)
   if paper is None:
     return None
-  pdf_url = paper.pdf_link
+  links = paper.paper_links()
+
+  pdf_url = None
   ieee_url = None
   doi_url = None
   extra_url = None
+  for link in links:
+    if '.pdf' in link:
+      pdf_url = link
+      continue
+    domain = urlparse(link).netloc
+    if 'ieee.org' in domain:
+      doi_url = link
+    elif 'link.springer.com' in domain:
+      doi_url = link
+    elif 'sciencedirect.com' in domain:
+      doi_url = link
+    elif 'acm.org' in domain:
+      doi_url = link
+    elif 'computer.org' in domain:
+      doi_url = link
+    #elif 'elsevier.com' in domain:
+    #  doi_url = link
+  if doi_url is not None:
+    pdf_url = None
   return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
 
 if __name__ == '__main__':
