summary | refs | log | tree | commit | diff
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/rm-txt-images.sh 4
-rw-r--r-- scraper/s2-dump-db-pdf-urls.py 2
-rw-r--r-- scraper/util.py 5
3 files changed, 10 insertions, 1 deletion
diff --git a/scraper/rm-txt-images.sh b/scraper/rm-txt-images.sh
new file mode 100644
index 00000000..75ac1cf2
--- /dev/null
+++ b/scraper/rm-txt-images.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+find ./datasets/s2/txt/ -name "*.img" -print0 | xargs -0 rm
+
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index b82ac6dd..8895fa0d 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -91,7 +91,7 @@ def process_raw_paper(fn):
# 0 1 2 3 4 5 6 :)
# ./datasets/s2/raw_papers/00/00b13d00b13.../paper.json
paper_id = fn.split('/')[5]
- paper = RawPaper(paper_id)
+ paper = load_paper(paper_id)
if paper is None:
return None
pdf_url = paper.pdf_link
diff --git a/scraper/util.py b/scraper/util.py
index 1b1a0a9b..bb210012 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -174,6 +174,10 @@ class RawPaper(object):
def __init__(self, paper_id):
self.paper_id = paper_id
data = read_json(paper_path('raw_papers', paper_id))
+ # {'responseType': 'CANONICAL', 'canonicalId': '02ccd5f0eb9a48a6af088197b950fb30a8e3abcc', 'canonicalSlug': 'Scaling-for-Multimodal-3-D-Object-Detection-Stanford'}
+ if 'responseType' in data and data['responseType'] == 'CANONICAL':
+ data = read_json(paper_path('raw_papers', data['canonicalId']))
+
# print(data)
if 'paper' not in data:
print(data)
@@ -206,6 +210,7 @@ class RawPaper(object):
if 'alternatePaperLinks' in self.data:
for link in self.data['alternatePaperLinks']:
paper_links.append(url_part(link))
+ return paper_links
def pdf_links(self):
return [ link for link in self.paper_links() if 'pdf' in link ]
def doi_links(self):