author     Jules Laplace <julescarbon@gmail.com>  2018-11-03 01:13:49 +0100
committer  Jules Laplace <julescarbon@gmail.com>  2018-11-03 01:13:49 +0100
commit     c2f4665dbe5ff1225f90afbaf590975057dc5026 (patch)
tree       48109101798b42018540deff7ac6873a067c937e /s2-extract-papers.py
parent     683a20d4d29958132fd49ddaeebf3d4f672085b7 (diff)
s2 dump scripts...
Diffstat (limited to 's2-extract-papers.py')
-rw-r--r--  s2-extract-papers.py  41
1 file changed, 27 insertions, 14 deletions
diff --git a/s2-extract-papers.py b/s2-extract-papers.py
index fb5a8804..3ac8ce50 100644
--- a/s2-extract-papers.py
+++ b/s2-extract-papers.py
@@ -1,7 +1,19 @@
 import os
 import gzip
+import glob
+import click
 
-def find_ids(fn, ids):
+S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
+DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'
+
+@click.command()
+@click.option('--input', '-i', default='ids.json', help='List of IDs to extract from the big dataset.')
+def fetch_entries(fn):
+  ids = load_id_lookup(fn)
+  for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
+    search_dataset_shard('{}/{}'.format(S2_DIR, filename), ids)
+
+def search_dataset_shard(fn, ids):
   with gzip.open(fn, 'r') as f:
     for line in f.readlines():
       process_paper(str(line))
@@ -11,17 +23,22 @@ def process_paper(line):
   if paper_id in ids:
     print(paper_id)
     del ids[paper_id]
-    write_file(paper_id, line)
+    write_paper(paper_id, line)
 
-def read_lines(fn):
+def load_id_lookup(fn):
   lookup = {}
-  with open(fn, 'r') as f:
-    for line in f.readlines():
-      lookup[line.strip()] = True
+  ids = read_json(fn)
+  for paper_id in ids:
+    path = paper_path(paper_id)
+    if not os.path.exists(path):
+      lookup[paper_id] = True
   return lookup
+
+def paper_path(paper_id):
+  return '{}/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)
 
-def write_file(paper_id, data):
-  dir = '{}/papers/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)
+def write_paper(paper_id, data):
+  dir = paper_path(paper_id)
   fn = dir + '/paper.json'
   if os.path.exists(fn):
     return
@@ -29,9 +46,5 @@ def write_file(paper_id, data):
   with open(fn, 'w') as f:
     f.write(data)
 
-ids = read_lines("ids.txt")
-S2_DIR = '/media/blue/undisclosed/semantic-scholar'
-DATA_DIR = '/home/lens/undisclosed/megapixels_dev'
-
-find_ids(S2_DIR + '/sample-S2-records.gz', ids)
-
+if __name__ == '__main__':
+  fetch_entries()
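
For orientation, below is a minimal standalone sketch of how the new click entry point is expected to be wired and invoked. It is not the committed script: load_id_lookup and search_dataset_shard are stubbed so the sketch runs on its own, the --input option is explicitly mapped to the fn parameter (click would otherwise pass it as input), and the shard path yielded by glob.iglob is used directly since the pattern already contains S2_DIR. Treat those last two points as assumptions about intent, not as fixes taken from the commit.

# Hypothetical sketch, not the committed script: stubs stand in for the
# helpers defined in the diff so the entry point can be exercised alone.
import glob
import json

import click

# Dataset path as hard-coded in the diff.
S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'

def load_id_lookup(fn):
  # Stub for the loader in the diff: read a JSON list of paper IDs
  # and build a membership dict.
  with open(fn) as f:
    return {paper_id: True for paper_id in json.load(f)}

def search_dataset_shard(path, ids):
  # Stub for the shard scanner in the diff.
  print('would scan {} for {} ids'.format(path, len(ids)))

@click.command()
@click.option('--input', '-i', 'fn', default='ids.json',
              help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
  ids = load_id_lookup(fn)
  # iglob yields paths that already include S2_DIR, so each shard
  # path is passed through unchanged.
  for shard in glob.iglob('{}/*.gz'.format(S2_DIR)):
    search_dataset_shard(shard, ids)

if __name__ == '__main__':
  fetch_entries()

Assuming the script keeps its current name, it would be run along the lines of: python s2-extract-papers.py -i ids.json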