| author | jules@lens <julescarbon@gmail.com> | 2018-11-03 01:24:40 +0100 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2018-11-03 01:24:40 +0100 |
| commit | 9dca4bf45656f6c327d3a276809ca3d5724560da | |
| tree | 52fa2da97d519a84918f98a8e83be5ddb80caa8f /s2-extract-papers.py | |
| parent | c2f4665dbe5ff1225f90afbaf590975057dc5026 | |
fixing script
Diffstat (limited to 's2-extract-papers.py')
| -rw-r--r-- | s2-extract-papers.py | 16 |
1 file changed, 11 insertions(+), 5 deletions(-)
```diff
diff --git a/s2-extract-papers.py b/s2-extract-papers.py
index 3ac8ce50..67d9bb8b 100644
--- a/s2-extract-papers.py
+++ b/s2-extract-papers.py
@@ -2,23 +2,29 @@
 import os
 import gzip
 import glob
 import click
+from util import *
 
 S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
 DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'
 
 @click.command()
-@click.option('--input', '-i', default='ids.json', help='List of IDs to extract from the big dataset.')
+@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
 def fetch_entries(fn):
     ids = load_id_lookup(fn)
-    for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
-        search_dataset_shard('{}/{}'.format(S2_DIR, filename), ids)
+    for fn in glob.iglob('{}/*.gz'.format(S2_DIR)):
+        search_dataset_shard(fn, ids)
 
 def search_dataset_shard(fn, ids):
+    print(fn)
+    i = 0
     with gzip.open(fn, 'r') as f:
+        i += 1
+        if (i % 1000) == 0:
+            print("{}...".format(i))
         for line in f.readlines():
-            process_paper(str(line))
+            process_paper(str(line), ids)
 
-def process_paper(line):
+def process_paper(line, ids):
     paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
     if paper_id in ids:
         print(paper_id)
```
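Note that in the patched version the counter `i` is incremented inside the `with` block but outside the `for` loop, so the `"{}..."` progress message can only fire once per shard rather than every 1000 lines. The sketch below is not part of the commit; it reuses the script's names (`search_dataset_shard`, `process_paper`, the `ids` lookup) to illustrate how the scan could report per-line progress and stream the shard lazily, with the exact indentation and file mode being assumptions.

```python
# Sketch only: shard scan with the progress counter moved into the line loop.
import gzip

def search_dataset_shard(fn, ids):
    """Scan one gzipped JSON-lines shard and print papers whose id is in `ids`."""
    print(fn)
    count = 0
    # 'rt' yields str lines directly instead of bytes (assumed Python 3)
    with gzip.open(fn, 'rt') as f:
        for line in f:  # iterate lazily; readlines() would load the whole shard
            count += 1
            if count % 1000 == 0:
                print("{}...".format(count))
            process_paper(line, ids)

def process_paper(line, ids):
    # Cheap string split instead of json.loads, matching the original script
    paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
    if paper_id in ids:
        print(paper_id)
```

Streaming the file object directly also avoids holding an entire decompressed corpus shard in memory, which matters at Semantic Scholar corpus sizes.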
