summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: jules@lens <julescarbon@gmail.com> 2018-11-03 01:24:40 +0100
committer: jules@lens <julescarbon@gmail.com> 2018-11-03 01:24:40 +0100
commit: 9dca4bf45656f6c327d3a276809ca3d5724560da (patch)
tree: 52fa2da97d519a84918f98a8e83be5ddb80caa8f
parent: c2f4665dbe5ff1225f90afbaf590975057dc5026 (diff)
fixing script
-rw-r--r-- s2-extract-papers.py 16
1 files changed, 11 insertions, 5 deletions
diff --git a/s2-extract-papers.py b/s2-extract-papers.py
index 3ac8ce50..67d9bb8b 100644
--- a/s2-extract-papers.py
+++ b/s2-extract-papers.py
@@ -2,23 +2,29 @@ import os
import gzip
import glob
import click
+from util import *
S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'
@click.command()
-@click.option('--input', '-i', default='ids.json', help='List of IDs to extract from the big dataset.')
+@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
ids = load_id_lookup(fn)
- for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
- search_dataset_shard('{}/{}'.format(S2_DIR, filename), ids)
+ for fn in glob.iglob('{}/*.gz'.format(S2_DIR)):
+ search_dataset_shard(fn, ids)
def search_dataset_shard(fn, ids):
+ print(fn)
+ i = 0
with gzip.open(fn, 'r') as f:
+ i += 1
+ if (i % 1000) == 0:
+ print("{}...".format(i))
for line in f.readlines():
- process_paper(str(line))
+ process_paper(str(line), ids)
-def process_paper(line):
+def process_paper(line, ids):
paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
if paper_id in ids:
print(paper_id)