"""Extract selected paper records from a gzipped, line-oriented records dump.

Reads the wanted paper ids from ``ids.txt``, scans the gzip file one line at
a time, and writes every matching line to
``DATA_DIR/papers/<first 3 chars of id>/<id>/paper.json``.
"""
import gzip
import os

S2_DIR = '/media/blue/undisclosed/semantic-scholar'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev'

# Ids still waiting to be found. Filled by main(); process_paper() falls back
# to this mapping when it is called without an explicit lookup.
ids = {}

# Text that immediately precedes the id value inside a record line.
_ID_MARKER = '"id":"'


def find_ids(fn, ids):
    """Scan the gzip file *fn* and save every record whose id is in *ids*.

    Args:
        fn: Path to a gzip-compressed text file with one record per line.
        ids: Dict keyed by the paper ids to extract. Matched ids are removed
            from it as they are found.
    """
    # Text mode yields real strings. Binary mode combined with str(line)
    # produced the bytes repr ("b'...'"), which is what ended up on disk.
    with gzip.open(fn, 'rt', encoding='utf-8') as f:
        # Iterate lazily instead of readlines() so the whole decompressed
        # dump is never held in memory at once.
        for line in f:
            if not ids:
                # Nothing left to look for; the rest of the file cannot match.
                break
            process_paper(line, ids)


def process_paper(line, lookup=None):
    """Save *line* if the paper id it contains is still wanted.

    The id is the text between the first ``"id":"`` marker and the next
    double quote. Lines without that marker are ignored.

    Args:
        line: One record line as text.
        lookup: Dict keyed by wanted ids; defaults to the module-level
            ``ids`` mapping. A matched id is deleted from it.
    """
    if lookup is None:
        lookup = ids
    _, marker, rest = line.partition(_ID_MARKER)
    if not marker:
        # Blank or malformed line: there is no id to extract.
        return
    paper_id = rest.partition('"')[0]
    if paper_id in lookup:
        print(paper_id)
        del lookup[paper_id]
        write_file(paper_id, line)


def read_lines(fn):
    """Return a dict mapping each non-blank, stripped line of *fn* to True."""
    lookup = {}
    with open(fn, 'r', encoding='utf-8') as f:
        for line in f:
            key = line.strip()
            if key:
                # Skip blank lines: an empty id would otherwise become a key
                # and map to a path with empty directory components.
                lookup[key] = True
    return lookup


def write_file(paper_id, data):
    """Write *data* to the paper.json file for *paper_id* unless it exists.

    The target is ``DATA_DIR/papers/<paper_id[0:3]>/<paper_id>/paper.json``;
    missing parent directories are created.
    """
    paper_dir = '{}/papers/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)
    fn = paper_dir + '/paper.json'
    if os.path.exists(fn):
        return
    os.makedirs(paper_dir, exist_ok=True)
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(data)


def main():
    """Load the wanted ids from ids.txt and extract them from the sample dump."""
    ids.update(read_lines("ids.txt"))
    find_ids(S2_DIR + '/sample-S2-records.gz', ids)


if __name__ == "__main__":
    main()