"""Check which paper IDs from a JSON list are already on disk, and write
the missing ones to ./missing.csv so they can be fetched separately."""
import os

import click

from util import read_json, write_csv

DB_PAPER_DIR = './datasets/s2/db_papers'
RAW_PAPER_DIR = './datasets/s2/raw_papers'


@click.command()
@click.option('--fn', '-f', default='ids.json',
              help='List of IDs to extract from the big dataset.')
def fetch_missing_entries(fn):
    """Write the paper IDs that are not yet on disk to ./missing.csv."""
    missing_ids = load_missing_ids(fn)
    write_csv('./missing.csv', keys=None,
              rows=[[paper_id] for paper_id in missing_ids])


def load_missing_ids(fn):
    """Return the IDs from `fn` that exist in neither paper directory."""
    missing_lookup = {}
    ids = read_json(fn)
    found_count = 0
    missing_count = 0
    for paper_id in ids:
        db_paper_path = make_db_paper_path(paper_id)
        raw_paper_path = make_raw_paper_path(paper_id)
        # A paper counts as present if either its DB or raw copy exists.
        if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
            found_count += 1
        else:
            # Keying a dict by ID de-duplicates repeated entries in the input.
            missing_lookup[paper_id] = True
            missing_count += 1
    print('{} papers found, {} must be fetched'.format(found_count, missing_count))
    return list(missing_lookup)


def make_db_paper_path(paper_id):
    # Papers are sharded into subdirectories by the first two ID characters.
    return '{}/{}/{}'.format(DB_PAPER_DIR, paper_id[0:2], paper_id)


def make_raw_paper_path(paper_id):
    return '{}/{}/{}'.format(RAW_PAPER_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    fetch_missing_entries()
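
# Hedged sketch: `util` is a local module not shown in this section. Minimal
# implementations consistent with the read_json/write_csv calls above (their
# bodies are inferred from the call sites, not confirmed by the source) could
# look like this in util.py:
#
#   import csv
#   import json
#
#   def read_json(path):
#       with open(path) as f:
#           return json.load(f)
#
#   def write_csv(path, keys=None, rows=()):
#       with open(path, 'w', newline='') as f:
#           writer = csv.writer(f)
#           if keys:
#               writer.writerow(keys)  # optional header row
#           writer.writerows(rows)
#
# Usage (assuming this script is saved as fetch_missing.py):
#   python fetch_missing.py --fn ids.json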