import os
import gzip
import glob

import click

from util import *  # provides read_json()

# Location of the gzipped Semantic Scholar corpus shards.
S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
# Output directory where extracted paper records are written.
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/scraper/datasets/s2/db_papers'


@click.command()
@click.option('--fn', '-f', default='ids.json',
              help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
    """Scan every corpus shard and write out the papers whose IDs are listed in fn."""
    ids = load_id_lookup(fn)
    for shard_fn in glob.iglob('{}/*.gz'.format(S2_DIR)):
        search_dataset_shard(shard_fn, ids)


def search_dataset_shard(fn, ids):
    """Stream one gzipped shard line by line, extracting any papers we still need."""
    print(fn)
    i = 0
    with gzip.open(fn, 'r') as f:
        for line in f:
            # Progress counter: report every 1000 lines scanned in this shard.
            i += 1
            if (i % 1000) == 0:
                print("{}...".format(i))
            process_paper(line.decode('UTF-8'), ids)


def process_paper(line, ids):
    """Pull the paper ID out of a raw JSON line and save it if it is one we want."""
    # Cheap string extraction of the "id" field; avoids parsing JSON on every line.
    paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
    if paper_id in ids:
        del ids[paper_id]  # stop looking for this ID in later shards
        write_paper(paper_id, line)


def load_id_lookup(fn):
    """Build a lookup of wanted paper IDs, skipping those already saved on disk."""
    lookup = {}
    ids = read_json(fn)
    skip_count = 0
    save_count = 0
    for paper_id in ids:
        path = paper_path(paper_id)
        if not os.path.exists(path):
            lookup[paper_id] = True
            save_count += 1
        else:
            skip_count += 1
    print("finding {} ids ({} already pulled)".format(save_count, skip_count))
    return lookup


def paper_path(paper_id):
    """Papers are sharded into subdirectories keyed by the first two characters of the ID."""
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)


def write_paper(paper_id, data):
    """Write the raw JSON line for one paper into its own directory."""
    dir_path = paper_path(paper_id)
    fn = dir_path + '/paper.json'
    if os.path.exists(fn):
        return
    os.makedirs(dir_path, exist_ok=True)
    with open(fn, 'w') as f:
        f.write(data)


if __name__ == '__main__':
    fetch_entries()
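
# Usage sketch (the script filename below is illustrative, not taken from the repo;
# S2_DIR and DATA_DIR above must already point at real paths):
#
#   python fetch_papers.py --fn ids.json
#
# This reads the ID list from ids.json, skips papers already present under DATA_DIR,
# and scans every *.gz shard under S2_DIR for the remaining IDs.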