summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  .gitignore  2
-rw-r--r--  s2-extract-papers.py  4
2 files changed, 4 insertions, 2 deletions
diff --git a/.gitignore b/.gitignore
index 7b6b9cc3..871577ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -150,3 +150,5 @@ old-package.json
datasets/
+*.tar
+
diff --git a/s2-extract-papers.py b/s2-extract-papers.py
index 3ac8ce50..90323e6e 100644
--- a/s2-extract-papers.py
+++ b/s2-extract-papers.py
@@ -16,7 +16,7 @@ def fetch_entries(fn):
def search_dataset_shard(fn, ids):
with gzip.open(fn, 'r') as f:
for line in f.readlines():
- process_paper(str(line))
+ process_paper(str(line)[2:-3])
def process_paper(line):
paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
@@ -43,7 +43,7 @@ def write_paper(paper_id, data):
if os.path.exists(fn):
return
os.makedirs(dir, exist_ok=True)
- with open(fn, 'w') as f:
+ with open(fn, 'wb') as f:
f.write(data)
if __name__ == '__main__':