summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--s2-extract-papers.py37
1 files changed, 37 insertions, 0 deletions
diff --git a/s2-extract-papers.py b/s2-extract-papers.py
new file mode 100644
index 00000000..fb5a8804
--- /dev/null
+++ b/s2-extract-papers.py
@@ -0,0 +1,37 @@
+import os
+import gzip
+
def find_ids(fn, ids):
    """Scan a gzip-compressed, line-delimited records file.

    Every line of the file is passed to ``process_paper``.

    Args:
        fn: Path of the gzip file to read, one record per line.
        ids: Lookup of wanted paper ids. Accepted for interface
            compatibility only: ``process_paper`` takes just the line and
            consults the module-level ``ids`` lookup, so this argument is
            not forwarded.
    """
    # Text mode yields real ``str`` lines. Calling ``str()`` on the raw bytes
    # would instead produce the ``b'...'`` repr (escaped newline and all),
    # which then ends up verbatim in the written paper.json.
    # NOTE(review): assumes the records are UTF-8 encoded JSON -- confirm.
    with gzip.open(fn, 'rt', encoding='utf-8') as f:
        # Iterate lazily: readlines() would hold the entire decompressed
        # file in memory at once.
        for line in f:
            process_paper(line)
+
def process_paper(line):
    """Save the record on ``line`` if its paper id is still wanted.

    The paper id is the text between the first ``"id":"`` marker on the line
    and the next double quote. When that id is present in the module-level
    ``ids`` lookup it is printed, removed from the lookup (so a later record
    with the same id is ignored), and the line is handed to ``write_file``.

    Lines that contain no ``"id":"`` marker (for example a blank line) are
    skipped rather than raising ``IndexError``.

    Args:
        line: One record from the records file, as text.
    """
    _, marker, rest = line.partition('"id":"')
    if not marker:
        # No id field on this line: nothing to look up.
        return
    paper_id = rest.partition('"')[0]
    if paper_id in ids:
        print(paper_id)
        del ids[paper_id]
        write_file(paper_id, line)
+
def read_lines(fn):
    """Load a text file into a dict keyed by its whitespace-stripped lines.

    Args:
        fn: Path of the text file to read.

    Returns:
        A dict mapping every stripped line to ``True``. Repeated lines
        collapse into a single key; a blank line yields the key ``''``.
    """
    with open(fn, 'r') as handle:
        entries = [raw.strip() for raw in handle.readlines()]
    return dict.fromkeys(entries, True)
+
def write_file(paper_id, data):
    """Write ``data`` to the paper's ``paper.json`` unless it already exists.

    The target is
    ``<DATA_DIR>/papers/<first three characters of paper_id>/<paper_id>/paper.json``,
    where ``DATA_DIR`` is the module-level output root. Missing directories
    are created; an existing ``paper.json`` is left untouched.

    Args:
        paper_id: Identifier used to build the output directory.
        data: Text to write to the file.
    """
    shard = paper_id[0:3]
    target_dir = '{}/papers/{}/{}'.format(DATA_DIR, shard, paper_id)
    out_path = target_dir + '/paper.json'
    if not os.path.exists(out_path):
        os.makedirs(target_dir, exist_ok=True)
        with open(out_path, 'w') as out:
            out.write(data)
+
# Input corpus directory and output root; write_file() reads DATA_DIR.
S2_DIR = '/media/blue/undisclosed/semantic-scholar'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev'

# Wanted paper ids, one per line of ids.txt. process_paper() looks ids up in
# this module-level dict and deletes each one once it has been found.
ids = read_lines("ids.txt")

# Scan the sample records file and extract every wanted paper.
find_ids(S2_DIR + '/sample-S2-records.gz', ids)
+