From ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Sun, 25 Nov 2018 22:19:15 +0100 Subject: moving stuff --- s2-extract-papers.py | 62 ---------------------------------------------------- 1 file changed, 62 deletions(-) delete mode 100644 s2-extract-papers.py (limited to 's2-extract-papers.py') diff --git a/s2-extract-papers.py b/s2-extract-papers.py deleted file mode 100644 index bd30c24b..00000000 --- a/s2-extract-papers.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -import gzip -import glob -import click -from util import * - -S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03' -DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers' - -@click.command() -@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.') -def fetch_entries(fn): - ids = load_id_lookup(fn) - for fn in glob.iglob('{}/*.gz'.format(S2_DIR)): - search_dataset_shard(fn, ids) - -def search_dataset_shard(fn, ids): - print(fn) - i = 0 - with gzip.open(fn, 'r') as f: - i += 1 - if (i % 1000) == 0: - print("{}...".format(i)) - for line in f.readlines(): - process_paper(line.decode('UTF-8'), ids) - -def process_paper(line, ids): - paper_id = line.split('"id":"', 2)[1].split('"', 2)[0] - if paper_id in ids: - print(paper_id) - del ids[paper_id] - write_paper(paper_id, line) - -def load_id_lookup(fn): - lookup = {} - ids = read_json(fn) - skip_count = 0 - save_count = 0 - for paper_id in ids: - path = paper_path(paper_id) - if not os.path.exists(path): - lookup[paper_id] = True - save_count += 1 - else: - skip_count += 1 - print("finding {} ids ({} already pulled)".format(save_count, skip_count)) - return lookup - -def paper_path(paper_id): - return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) - -def write_paper(paper_id, data): - dir = paper_path(paper_id) - fn = dir + '/paper.json' - if os.path.exists(fn): - return - os.makedirs(dir, exist_ok=True) - with open(fn, 'w') as f: - f.write(data) - -if __name__ == '__main__': - fetch_entries() -- cgit v1.2.3-70-g09d2