diff options
| author | adamhrv <adam@ahprojects.com> | 2018-11-04 21:54:00 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2018-11-04 21:54:00 +0100 |
| commit | 9bcba0d02aafb34a5a9ca3db2f894f1fc95401c0 (patch) | |
| tree | 3dcaf94563498c15b56d51efc62750d0be72e01a /s2-extract-papers.py | |
| parent | ef45f3c93ffd39b57ee56db74a95f9d2dae074a8 (diff) | |
| parent | 0dc3e40434c23e4d48119465f39b03bf35fb56bd (diff) | |
.
Diffstat (limited to 's2-extract-papers.py')
| -rw-r--r-- | s2-extract-papers.py | 50 |
1 files changed, 50 insertions, 0 deletions
# Reconstructed from: diff --git a/s2-extract-papers.py b/s2-extract-papers.py
# (new file mode 100644, index 00000000..90323e6e, hunk @@ -0,0 +1,50 @@)
"""Extract selected papers from a Semantic Scholar corpus dump.

Reads a JSON list of paper IDs, scans every gzipped shard in ``S2_DIR``
line by line, and writes each matching record to
``DATA_DIR/<first 3 chars of id>/<id>/paper.json``.
"""

import glob
import gzip
import json
import os

import click

S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'

# Text that immediately precedes the paper ID inside a serialized record.
ID_MARKER = '"id":"'


@click.command()
# The bare name 'fn' tells click which Python argument receives the value;
# without it click passes `input=...`, which fetch_entries does not accept.
@click.option('--input', '-i', 'fn', default='ids.json',
              help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
    """Scan every corpus shard and extract the papers listed in *fn*.

    Args:
        fn: Path to a JSON file whose iteration yields paper IDs.
    """
    ids = load_id_lookup(fn)
    # iglob already yields paths that include S2_DIR, so they are used as-is.
    for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
        if not ids:
            # Every requested paper has been written; skip remaining shards.
            break
        search_dataset_shard(filename, ids)


def search_dataset_shard(fn, ids):
    """Process every record of one gzipped shard.

    Args:
        fn: Path to the ``.gz`` shard, one record per line.
        ids: Lookup of still-missing paper IDs; found IDs are removed from it.
    """
    # Text mode decodes the bytes properly instead of slicing their repr(),
    # and iterating the handle streams lines rather than loading the shard.
    with gzip.open(fn, 'rt', encoding='utf-8') as f:
        for line in f:
            process_paper(line.rstrip('\n'), ids)


def process_paper(line, ids):
    """Write *line* to disk if its paper ID is one of the wanted *ids*.

    The ID is located with a plain substring scan rather than a full JSON
    parse, as in the original implementation. Lines without the ID marker
    are ignored.

    Args:
        line: One serialized record, without its trailing newline.
        ids: Lookup of still-missing paper IDs; a matched ID is deleted.
    """
    _, marker, rest = line.partition(ID_MARKER)
    if not marker:
        return
    paper_id = rest.partition('"')[0]
    if paper_id in ids:
        print(paper_id)
        del ids[paper_id]
        write_paper(paper_id, line)


def read_json(fn):
    """Return the parsed contents of the JSON file at *fn*."""
    with open(fn, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_id_lookup(fn):
    """Build the lookup of paper IDs that still need to be extracted.

    Args:
        fn: Path to a JSON file whose iteration yields paper IDs.

    Returns:
        A dict mapping each ID whose output directory does not exist yet
        to ``True``.
    """
    return {
        paper_id: True
        for paper_id in read_json(fn)
        if not os.path.exists(paper_path(paper_id))
    }


def paper_path(paper_id):
    """Return the output directory for *paper_id*, bucketed by ID prefix."""
    return '{}/{}/{}'.format(DATA_DIR, paper_id[:3], paper_id)


def write_paper(paper_id, data):
    """Store one record as ``paper.json`` unless that file already exists.

    Args:
        paper_id: ID used to derive the output directory.
        data: The serialized record, as text.
    """
    paper_dir = paper_path(paper_id)
    fn = paper_dir + '/paper.json'
    if os.path.exists(fn):
        return
    os.makedirs(paper_dir, exist_ok=True)
    # data is str, so the file is opened in text mode (the original 'wb'
    # would raise TypeError on write).
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(data)


if __name__ == '__main__':
    fetch_entries()
