summary | refs | log | tree | commit | diff
path: root/scraper/s2-dump-db-pdf-urls.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/s2-dump-db-pdf-urls.py')
-rw-r--r-- scraper/s2-dump-db-pdf-urls.py 23
1 files changed, 22 insertions, 1 deletions
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index 8895fa0d..1b8f74c0 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -94,10 +94,31 @@ def process_raw_paper(fn):
paper = load_paper(paper_id)
if paper is None:
return None
- pdf_url = paper.pdf_link
+ links = paper.paper_links()
+
+ pdf_url = None
ieee_url = None
doi_url = None
extra_url = None
+ for link in links:
+ if '.pdf' in link:
+ pdf_url = link
+ continue
+ domain = urlparse(link).netloc
+ if 'ieee.org' in domain:
+ doi_url = link
+ elif 'link.springer.com' in domain:
+ doi_url = link
+ elif 'sciencedirect.com' in domain:
+ doi_url = link
+ elif 'acm.org' in domain:
+ doi_url = link
+ elif 'computer.org' in domain:
+ doi_url = link
+ #elif 'elsevier.com' in domain:
+ # doi_url = link
+ if doi_url is not None:
+ pdf_url = None
return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
if __name__ == '__main__':