diff options
Diffstat (limited to 'scraper/s2-dump-db-pdf-urls.py')
| -rw-r--r-- | scraper/s2-dump-db-pdf-urls.py | 25 |
1 files changed, 23 insertions, 2 deletions
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py index b82ac6dd..1b8f74c0 100644 --- a/scraper/s2-dump-db-pdf-urls.py +++ b/scraper/s2-dump-db-pdf-urls.py @@ -91,13 +91,34 @@ def process_raw_paper(fn): # 0 1 2 3 4 5 6 :) # ./datasets/s2/raw_papers/00/00b13d00b13.../paper.json paper_id = fn.split('/')[5] - paper = RawPaper(paper_id) + paper = load_paper(paper_id) if paper is None: return None - pdf_url = paper.pdf_link + links = paper.paper_links() + + pdf_url = None ieee_url = None doi_url = None extra_url = None + for link in links: + if '.pdf' in link: + pdf_url = link + continue + domain = urlparse(link).netloc + if 'ieee.org' in domain: + doi_url = link + elif 'link.springer.com' in domain: + doi_url = link + elif 'sciencedirect.com' in domain: + doi_url = link + elif 'acm.org' in domain: + doi_url = link + elif 'computer.org' in domain: + doi_url = link + #elif 'elsevier.com' in domain: + # doi_url = link + if doi_url is not None: + pdf_url = None return [paper_id, pdf_url, ieee_url, doi_url, extra_url] if __name__ == '__main__': |
