diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| commit | ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch) | |
| tree | 41372528e78d4328bc2a47bbbabac7e809c58894 /s2-extract-papers.py | |
| parent | 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff) | |
moving stuff
Diffstat (limited to 's2-extract-papers.py')
| -rw-r--r-- | s2-extract-papers.py | 62 |
1 file changed, 0 insertions, 62 deletions
import os
import gzip
import glob
import click
from util import *

# Location of the gzipped Semantic Scholar corpus shards.
S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
# Destination root for extracted per-paper JSON records.
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'

@click.command()
@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
    """Scan every corpus shard and extract the papers whose IDs are listed in *fn*."""
    ids = load_id_lookup(fn)
    # Renamed the loop variable: it previously shadowed the `fn` option above.
    for shard_fn in glob.iglob('{}/*.gz'.format(S2_DIR)):
        search_dataset_shard(shard_fn, ids)

def search_dataset_shard(fn, ids):
    """Stream one gzipped shard line by line, handing each record to process_paper.

    Prints the shard path on entry and a progress marker every 1000 lines.
    """
    print(fn)
    i = 0
    with gzip.open(fn, 'r') as f:
        # Iterate the file object directly rather than f.readlines(): shards
        # are large and readlines() would load the whole file into memory.
        for line in f:
            # Bug fix: the counter and progress print previously sat OUTSIDE
            # this loop, so "N..." could never be printed.
            i += 1
            if (i % 1000) == 0:
                print("{}...".format(i))
            process_paper(line.decode('UTF-8'), ids)

def process_paper(line, ids):
    """If this JSON line's paper id is still wanted, write it out and mark it done."""
    # Deliberate cheap string slicing instead of json.loads(): parsing every
    # corpus line as JSON would dominate runtime. Assumes each record
    # contains '"id":"..."' — TODO confirm against the corpus format.
    paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
    if paper_id in ids:
        print(paper_id)
        # Remove the id so each paper is extracted at most once per run.
        del ids[paper_id]
        write_paper(paper_id, line)

def load_id_lookup(fn):
    """Return a dict of paper ids (from the JSON list in *fn*) not yet on disk.

    Ids whose paper directory already exists are skipped; counts of both
    groups are printed for the operator.
    """
    lookup = {}
    ids = read_json(fn)  # read_json is provided by util
    skip_count = 0
    save_count = 0
    for paper_id in ids:
        path = paper_path(paper_id)
        if not os.path.exists(path):
            lookup[paper_id] = True
            save_count += 1
        else:
            skip_count += 1
    print("finding {} ids ({} already pulled)".format(save_count, skip_count))
    return lookup

def paper_path(paper_id):
    """Directory for one paper's record, sharded by the id's first two chars."""
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)

def write_paper(paper_id, data):
    """Write one paper's raw JSON line to <paper_path>/paper.json, unless present."""
    # Renamed from `dir`, which shadowed the builtin.
    out_dir = paper_path(paper_id)
    out_fn = out_dir + '/paper.json'
    if os.path.exists(out_fn):
        return
    os.makedirs(out_dir, exist_ok=True)
    with open(out_fn, 'w') as f:
        f.write(data)

if __name__ == '__main__':
    fetch_entries()
