"""Collect every paper id referenced by the downloaded paper JSON files.

Walks ``PAPER_JSON_DIR`` recursively, gathers the ``paperId`` of each paper
and of each of its citations, and writes the de-duplicated ids to
``ids.json``.
"""
import glob
import gzip
import json
import os

import click

from util import *

PAPER_JSON_DIR = 'datasets/s2/papers'


@click.command()
def s2_dump_ids():
    """Dump the ids of all papers and their citations to ids.json."""
    # A dict is used as an insertion-ordered set: keys are ids, values unused.
    ids = {}
    pattern = '{}/**/*.json'.format(PAPER_JSON_DIR)
    for fn in glob.iglob(pattern, recursive=True):
        process_paper(fn, ids)
    id_list = list(ids)
    # write_json is not defined in this file; presumably it comes from
    # `from util import *`.
    write_json('ids.json', id_list)
    # Report only after the write has returned, so a failed write is never
    # announced as a success.
    print("Wrote {} ids".format(len(id_list)))


def process_paper(fn, ids):
    """Record the id of one paper and of every paper in its citation list.

    Args:
        fn: Path of a JSON file holding an object with a ``paperId`` key and,
            optionally, a ``citations`` list whose entries each have a
            ``paperId`` key.
        ids: Dict updated in place; every id found is stored as a key mapped
            to ``True``.

    Raises:
        KeyError: If the paper, or one of its citation entries, has no
            ``paperId`` key.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(fn, 'r', encoding='utf-8') as f:
        data = json.load(f)
    ids[data['paperId']] = True
    # A missing or null citation list is treated as "no citations" instead
    # of aborting the whole dump.
    for cite in data.get('citations') or []:
        ids[cite['paperId']] = True


def paper_path(paper_id):
    """Return the directory path for a paper, sharded by the id's first 3 chars.

    Args:
        paper_id: Paper id string.

    Returns:
        ``'<DATA_DIR>/<first three characters of paper_id>/<paper_id>'``.
    """
    # NOTE(review): DATA_DIR is not defined in this file (only PAPER_JSON_DIR
    # is); presumably it is provided by `from util import *` -- confirm.
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)


if __name__ == '__main__':
    s2_dump_ids()