import os
import gzip
import glob
import json

import click

S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'


@click.command()
@click.option('--input', '-i', 'fn', default='ids.json',
              help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
    """Scan every gzipped corpus shard and extract the papers whose IDs are listed in the input file."""
    ids = load_id_lookup(fn)
    for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
        # iglob already yields full paths, so pass them through unchanged.
        search_dataset_shard(filename, ids)


def search_dataset_shard(fn, ids):
    # Each shard is a gzipped file with one JSON record per line.
    with gzip.open(fn, 'rt') as f:
        for line in f:
            process_paper(line.strip(), ids)


def process_paper(line, ids):
    # Pull the paper ID out of the raw JSON line without parsing the whole record.
    paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
    if paper_id in ids:
        print(paper_id)
        del ids[paper_id]
        write_paper(paper_id, line)


def load_id_lookup(fn):
    # Build a lookup of wanted IDs, skipping papers that were already written to disk.
    lookup = {}
    ids = read_json(fn)
    for paper_id in ids:
        path = paper_path(paper_id)
        if not os.path.exists(path):
            lookup[paper_id] = True
    return lookup


def read_json(fn):
    with open(fn) as f:
        return json.load(f)


def paper_path(paper_id):
    # Shard the output directory by the first three characters of the paper ID.
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)


def write_paper(paper_id, data):
    dirname = paper_path(paper_id)
    fn = dirname + '/paper.json'
    if os.path.exists(fn):
        return
    os.makedirs(dirname, exist_ok=True)
    with open(fn, 'w') as f:
        f.write(data)


if __name__ == '__main__':
    fetch_entries()