From 9dca4bf45656f6c327d3a276809ca3d5724560da Mon Sep 17 00:00:00 2001
From: "jules@lens"
Date: Sat, 3 Nov 2018 01:24:40 +0100
Subject: fixing script

---
 s2-extract-papers.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 's2-extract-papers.py')

diff --git a/s2-extract-papers.py b/s2-extract-papers.py
index 3ac8ce50..67d9bb8b 100644
--- a/s2-extract-papers.py
+++ b/s2-extract-papers.py
@@ -2,23 +2,29 @@ import os
 import gzip
 import glob
 import click
+from util import *
 
 S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
 DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'
 
 @click.command()
-@click.option('--input', '-i', default='ids.json', help='List of IDs to extract from the big dataset.')
+@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
 def fetch_entries(fn):
   ids = load_id_lookup(fn)
-  for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
-    search_dataset_shard('{}/{}'.format(S2_DIR, filename), ids)
+  for fn in glob.iglob('{}/*.gz'.format(S2_DIR)):
+    search_dataset_shard(fn, ids)
 
 def search_dataset_shard(fn, ids):
+  print(fn)
+  i = 0
   with gzip.open(fn, 'r') as f:
+    i += 1
+    if (i % 1000) == 0:
+      print("{}...".format(i))
     for line in f.readlines():
-      process_paper(str(line))
+      process_paper(str(line), ids)
 
-def process_paper(line):
+def process_paper(line, ids):
   paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
   if paper_id in ids:
     print(paper_id)
-- 
cgit v1.2.3-70-g09d2