Diffstat (limited to 's2-extract-papers.py')
| -rw-r--r-- | s2-extract-papers.py | 41 |
1 file changed, 27 insertions, 14 deletions
diff --git a/s2-extract-papers.py b/s2-extract-papers.py
index fb5a8804..3ac8ce50 100644
--- a/s2-extract-papers.py
+++ b/s2-extract-papers.py
@@ -1,7 +1,19 @@
 import os
 import gzip
+import glob
+import click
 
-def find_ids(fn, ids):
+S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
+DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'
+
+@click.command()
+@click.option('--input', '-i', default='ids.json', help='List of IDs to extract from the big dataset.')
+def fetch_entries(fn):
+    ids = load_id_lookup(fn)
+    for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
+        search_dataset_shard('{}/{}'.format(S2_DIR, filename), ids)
+
+def search_dataset_shard(fn, ids):
     with gzip.open(fn, 'r') as f:
         for line in f.readlines():
             process_paper(str(line))
@@ -11,17 +23,22 @@ def process_paper(line):
     if paper_id in ids:
         print(paper_id)
         del ids[paper_id]
-        write_file(paper_id, line)
+        write_paper(paper_id, line)
 
-def read_lines(fn):
+def load_id_lookup(fn):
     lookup = {}
-    with open(fn, 'r') as f:
-        for line in f.readlines():
-            lookup[line.strip()] = True
+    ids = read_json(fn)
+    for paper_id in ids:
+        path = paper_path(paper_id)
+        if not os.path.exists(path):
+            lookup[paper_id] = True
     return lookup
+
+def paper_path(paper_id):
+    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)
 
-def write_file(paper_id, data):
-    dir = '{}/papers/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)
+def write_paper(paper_id, data):
+    dir = paper_path(paper_id)
     fn = dir + '/paper.json'
     if os.path.exists(fn):
         return
@@ -29,9 +46,5 @@ def write_file(paper_id, data):
     with open(fn, 'w') as f:
         f.write(data)
 
-ids = read_lines("ids.txt")
-S2_DIR = '/media/blue/undisclosed/semantic-scholar'
-DATA_DIR = '/home/lens/undisclosed/megapixels_dev'
-
-find_ids(S2_DIR + '/sample-S2-records.gz', ids)
-
+if __name__ == '__main__':
+    fetch_entries()
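For context, the sketch below pieces together the flow this commit moves toward: load the id list from JSON, skip ids whose output directory already exists, scan every gzipped shard in the corpus directory, and write matching records to per-paper directories. It is a minimal sketch, not the file's actual contents: read_json, the 'id' field name, the directory creation in write_paper, and the inlined 'ids.json' path (in place of the click --input option) are all assumptions, since those lines fall outside the hunks shown above.

# Minimal sketch of the restructured extraction flow; hypothetical pieces are marked.
import os
import gzip
import glob
import json

S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'

def read_json(fn):
    # Assumed helper (not shown in the diff): the id list is a JSON array of paper ids.
    with open(fn) as f:
        return json.load(f)

def paper_path(paper_id):
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)

def load_id_lookup(fn):
    # Only keep ids whose paper directory does not exist yet.
    return {pid: True for pid in read_json(fn) if not os.path.exists(paper_path(pid))}

def write_paper(paper_id, data):
    d = paper_path(paper_id)
    os.makedirs(d, exist_ok=True)  # assumed: directory creation sits in an unshown context line
    with open(d + '/paper.json', 'w') as f:
        f.write(data)

def search_dataset_shard(fn, ids):
    with gzip.open(fn, 'rt') as f:
        for line in f:
            record = json.loads(line)    # assumed: one JSON record per line
            paper_id = record.get('id')  # assumed field name
            if paper_id in ids:
                print(paper_id)
                del ids[paper_id]
                write_paper(paper_id, line)

if __name__ == '__main__':
    ids = load_id_lookup('ids.json')
    # glob.iglob already yields paths that include S2_DIR, so they are passed through as-is.
    for shard in glob.iglob('{}/*.gz'.format(S2_DIR)):
        search_dataset_shard(shard, ids)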
