diff options
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | s2-extract-papers.py | 4 |
2 files changed, 4 insertions, 2 deletions
@@ -150,3 +150,5 @@ old-package.json datasets/ +*.tar + diff --git a/s2-extract-papers.py b/s2-extract-papers.py index 3ac8ce50..90323e6e 100644 --- a/s2-extract-papers.py +++ b/s2-extract-papers.py @@ -16,7 +16,7 @@ def fetch_entries(fn): def search_dataset_shard(fn, ids): with gzip.open(fn, 'r') as f: for line in f.readlines(): - process_paper(str(line)) + process_paper(str(line)[2:-3]) def process_paper(line): paper_id = line.split('"id":"', 2)[1].split('"', 2)[0] @@ -43,7 +43,7 @@ def write_paper(paper_id, data): if os.path.exists(fn): return os.makedirs(dir, exist_ok=True) - with open(fn, 'w') as f: + with open(fn, 'wb') as f: f.write(data) if __name__ == '__main__': |
