summaryrefslogtreecommitdiff
path: root/s2-extract-papers.py
diff options
context:
space:
mode:
author adamhrv <adam@ahprojects.com> 2018-11-04 21:54:00 +0100
committer adamhrv <adam@ahprojects.com> 2018-11-04 21:54:00 +0100
commit 9bcba0d02aafb34a5a9ca3db2f894f1fc95401c0 (patch)
tree 3dcaf94563498c15b56d51efc62750d0be72e01a /s2-extract-papers.py
parent ef45f3c93ffd39b57ee56db74a95f9d2dae074a8 (diff)
parent 0dc3e40434c23e4d48119465f39b03bf35fb56bd (diff)
.
Diffstat (limited to 's2-extract-papers.py')
-rw-r--r--s2-extract-papers.py50
1 files changed, 50 insertions, 0 deletions
diff --git a/s2-extract-papers.py b/s2-extract-papers.py
new file mode 100644
index 00000000..90323e6e
--- /dev/null
+++ b/s2-extract-papers.py
@@ -0,0 +1,50 @@
+import os
+import gzip
+import glob
+import click
+
# Directory that is globbed for the gzipped corpus shards (*.gz).
S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'

# Root of the output tree; each extracted paper gets its own sub-directory.
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'
+
# The explicit 'fn' entry tells click which keyword argument receives the
# option value; without it click passes ``input=...`` and the call fails
# because the function only accepts ``fn``.
@click.command()
@click.option('--input', '-i', 'fn', default='ids.json', help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
    """Extract every wanted paper from the corpus shards under S2_DIR.

    Args:
        fn: Path to the JSON file listing the paper IDs to extract
            (the value of ``--input`` / ``-i``).
    """
    ids = load_id_lookup(fn)
    for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
        # Matched IDs are removed from the lookup as they are found, so an
        # empty lookup means there is nothing left to search for.
        if not ids:
            break
        # iglob already yields the path including S2_DIR; prefixing the
        # directory a second time would point at a non-existent file.
        search_dataset_shard(filename, ids)
+
def search_dataset_shard(fn, ids):
    """Scan one gzipped shard and extract every paper whose ID is wanted.

    Args:
        fn: Path to a gzip-compressed file holding one paper record per line.
        ids: Dict keyed by the paper IDs still to be found. Entries are
            deleted as the matching papers are written out.
    """
    with gzip.open(fn, 'rb') as f:
        # Iterate the file object so the shard is streamed line by line
        # instead of being loaded into memory in one piece.
        for raw in f:
            if not ids:
                return
            # Strip only the line terminator; the record bytes stay intact.
            process_paper(raw.rstrip(b'\r\n'), ids)


def process_paper(line, ids=None):
    """Write out the paper on ``line`` if its ID is in ``ids``.

    Args:
        line: One record from a shard, as ``bytes`` or ``str``.
        ids: Dict keyed by the paper IDs still to be found. A matched ID is
            removed from it. When omitted or empty nothing is done.
    """
    if not ids:
        return
    if isinstance(line, bytes):
        # Decoded only to locate the ID; the untouched bytes are what gets
        # handed to write_paper.
        text = line.decode('utf-8', errors='replace')
    else:
        text = line
    # Plain substring scan rather than a JSON parse: only the ID is needed
    # to decide whether the record is wanted.
    _, marker, rest = text.partition('"id":"')
    if not marker:
        # No ID field on this line (for example a blank line): skip it.
        return
    paper_id = rest.partition('"')[0]
    if paper_id in ids:
        print(paper_id)
        del ids[paper_id]
        write_paper(paper_id, line)
+
def load_id_lookup(fn):
    """Build the lookup of paper IDs that still need to be extracted.

    Args:
        fn: Path to a JSON file whose top-level value iterates over paper IDs.

    Returns:
        A dict mapping each paper ID to ``True``, limited to the IDs whose
        output directory (see ``paper_path``) does not exist yet.
    """
    # Function-scope import: the file-level imports do not include json.
    import json

    with open(fn, 'r', encoding='utf-8') as f:
        ids = json.load(f)
    return {
        paper_id: True
        for paper_id in ids
        if not os.path.exists(paper_path(paper_id))
    }
+
def paper_path(paper_id):
    """Return the output directory for ``paper_id``.

    The directory sits under DATA_DIR, inside a sub-directory named after
    the first three characters of the ID.
    """
    prefix = paper_id[:3]
    return '{}/{}/{}'.format(DATA_DIR, prefix, paper_id)
+
def write_paper(paper_id, data):
    """Store one paper record as ``paper.json`` in the paper's directory.

    An existing ``paper.json`` is left untouched.

    Args:
        paper_id: ID of the paper; determines the output directory.
        data: The record to store, as ``str`` or ``bytes``.
    """
    out_dir = paper_path(paper_id)
    fn = out_dir + '/paper.json'
    if os.path.exists(fn):
        return
    os.makedirs(out_dir, exist_ok=True)
    # The file is opened in binary mode, so text has to be encoded first;
    # writing a str to a 'wb' handle raises TypeError.
    payload = data.encode('utf-8') if isinstance(data, str) else data
    with open(fn, 'wb') as f:
        f.write(payload)
+
# Run the click command only when the file is executed as a script.
if __name__ == "__main__":
    fetch_entries()