diff options
Diffstat (limited to 'scraper')
| -rw-r--r-- | scraper/rm-txt-images.sh | 4 |
| -rw-r--r-- | scraper/s2-dump-db-pdf-urls.py | 2 |
| -rw-r--r-- | scraper/util.py | 5 |
3 files changed, 10 insertions, 1 deletion
diff --git a/scraper/rm-txt-images.sh b/scraper/rm-txt-images.sh new file mode 100644 index 00000000..75ac1cf2 --- /dev/null +++ b/scraper/rm-txt-images.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +find ./datasets/s2/txt/ -name "*.img" -print0 | xargs -0 rm + diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py index b82ac6dd..8895fa0d 100644 --- a/scraper/s2-dump-db-pdf-urls.py +++ b/scraper/s2-dump-db-pdf-urls.py @@ -91,7 +91,7 @@ def process_raw_paper(fn): # 0 1 2 3 4 5 6 :) # ./datasets/s2/raw_papers/00/00b13d00b13.../paper.json paper_id = fn.split('/')[5] - paper = RawPaper(paper_id) + paper = load_paper(paper_id) if paper is None: return None pdf_url = paper.pdf_link diff --git a/scraper/util.py b/scraper/util.py index 1b1a0a9b..bb210012 100644 --- a/scraper/util.py +++ b/scraper/util.py @@ -174,6 +174,10 @@ class RawPaper(object): def __init__(self, paper_id): self.paper_id = paper_id data = read_json(paper_path('raw_papers', paper_id)) + # {'responseType': 'CANONICAL', 'canonicalId': '02ccd5f0eb9a48a6af088197b950fb30a8e3abcc', 'canonicalSlug': 'Scaling-for-Multimodal-3-D-Object-Detection-Stanford'} + if 'responseType' in data and data['responseType'] == 'CANONICAL': + data = read_json(paper_path('raw_papers', data['canonicalId'])) + # print(data) if 'paper' not in data: print(data) @@ -206,6 +210,7 @@ class RawPaper(object): if 'alternatePaperLinks' in self.data: for link in self.data['alternatePaperLinks']: paper_links.append(url_part(link)) + return paper_links def pdf_links(self): return [ link for link in self.paper_links() if 'pdf' in link ] def doi_links(self): |
