From cbd96aeca32dcfd37acdf5f6b7e3a2997311783c Mon Sep 17 00:00:00 2001
From: "jules@lens"
Date: Sat, 3 Nov 2018 18:49:19 +0100
Subject: utf8

---
 s2-extract-papers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 's2-extract-papers.py')

diff --git a/s2-extract-papers.py b/s2-extract-papers.py
index 00301433..bd30c24b 100644
--- a/s2-extract-papers.py
+++ b/s2-extract-papers.py
@@ -22,7 +22,7 @@ def search_dataset_shard(fn, ids):
     if (i % 1000) == 0:
         print("{}...".format(i))
     for line in f.readlines():
-        process_paper(str(line)[2:-3], ids)
+        process_paper(line.decode('UTF-8'), ids)
 
 def process_paper(line, ids):
     paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
-- 
cgit v1.2.3-70-g09d2