# Origin: commit 683a20d4d29958132fd49ddaeebf3d4f672085b7 by "jules@lens",
# Fri, 2 Nov 2018 21:38:17 +0100 -- "s2 dataset search script"
# (new file s2-extract-papers.py, reconstructed from the flattened patch).
"""Extract selected paper records from a gzipped Semantic Scholar (S2) dump.

The script reads the wanted paper ids from ``ids.txt`` (one id per line),
scans a gzipped file holding one record per line, and stores every record
whose id was requested at ``<DATA_DIR>/papers/<id[0:3]>/<id>/paper.json``.
"""
import gzip
import os

S2_DIR = '/media/blue/undisclosed/semantic-scholar'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev'

# Module-level lookup of ids still to be found; filled in by main().
ids = {}


def find_ids(fn, ids, data_dir=None):
    """Scan the gzipped record file *fn* and store every record listed in *ids*.

    Args:
        fn: Path of a gzip file containing one record per line.
        ids: Dict keyed by the wanted paper ids. Entries are deleted as the
            matching records are found, so on return it holds only the ids
            that were not present in *fn*.
        data_dir: Output root; defaults to the module-level ``DATA_DIR``.
    """
    # Text mode yields real ``str`` lines. Calling str() on the bytes lines
    # of binary mode would store the "b'...'" repr instead of the record.
    with gzip.open(fn, 'rt', encoding='utf-8') as f:
        # Iterate lazily: the decompressed dump may not fit in memory.
        for line in f:
            process_paper(line, ids, data_dir)
            if not ids:
                # Every requested id has been found; nothing left to match.
                return


def process_paper(line, lookup=None, data_dir=None):
    """Store the record *line* if its id is one of the wanted ids.

    The id is taken from the first ``"id":"<value>"`` occurrence in *line*;
    lines without one are ignored.

    Args:
        line: One record of the dump as text.
        lookup: Dict keyed by wanted ids; defaults to the module-level
            ``ids``. A matched id is removed from it.
        data_dir: Output root; defaults to the module-level ``DATA_DIR``.
    """
    if lookup is None:
        lookup = ids
    _, marker, rest = line.partition('"id":"')
    if not marker:
        return
    paper_id = rest.partition('"')[0]
    if paper_id and paper_id in lookup:
        print(paper_id)
        del lookup[paper_id]
        write_file(paper_id, line, data_dir)


def read_lines(fn):
    """Return a dict mapping each non-blank, stripped line of *fn* to True."""
    lookup = {}
    with open(fn, 'r') as f:
        for line in f:
            key = line.strip()
            if key:
                lookup[key] = True
    return lookup


def write_file(paper_id, data, data_dir=None):
    """Write *data* to ``<data_dir>/papers/<id[0:3]>/<id>/paper.json``.

    An existing ``paper.json`` is left untouched.

    Args:
        paper_id: Id of the paper; its first three characters name the
            intermediate directory.
        data: Text to store.
        data_dir: Output root; defaults to the module-level ``DATA_DIR``.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    paper_dir = '{}/papers/{}/{}'.format(data_dir, paper_id[0:3], paper_id)
    fn = paper_dir + '/paper.json'
    if os.path.exists(fn):
        return
    os.makedirs(paper_dir, exist_ok=True)
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(data)


def main():
    """Load the wanted ids from ``ids.txt`` and extract them from the sample dump."""
    ids.update(read_lines("ids.txt"))
    find_ids(S2_DIR + '/sample-S2-records.gz', ids)


if __name__ == '__main__':
    main()