Diffstat (limited to 'scraper/s2-extract-papers.py')
-rw-r--r--  scraper/s2-extract-papers.py  74
1 file changed, 74 insertions, 0 deletions
diff --git a/scraper/s2-extract-papers.py b/scraper/s2-extract-papers.py
new file mode 100644
index 00000000..bd30c24b
--- /dev/null
+++ b/scraper/s2-extract-papers.py
@@ -0,0 +1,74 @@
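+# Extract matching papers from the Semantic Scholar corpus dump: stream each
+# gzipped shard, match record ids against a lookup built from ids.json, and
+# write each hit to its own per-paper directory under DATA_DIR.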
+import os
+import gzip
+import glob
+import click
+from util import *  # provides read_json()
+
+S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
+DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'
+
+@click.command()
+@click.option('--fn', '-f', default='ids.json', help='JSON file listing the paper IDs to extract from the corpus.')
+def fetch_entries(fn):
+    ids = load_id_lookup(fn)
+    # Scan every gzipped shard of the corpus for the requested ids.
+    for shard_fn in glob.iglob('{}/*.gz'.format(S2_DIR)):
+        search_dataset_shard(shard_fn, ids)
+
+def search_dataset_shard(fn, ids):
+    print(fn)
+    i = 0
+    with gzip.open(fn, 'r') as f:
+        # One JSON record per line; count records for periodic progress output.
+        for line in f:
+            i += 1
+            if (i % 1000) == 0:
+                print("{}...".format(i))
+            process_paper(line.decode('UTF-8'), ids)
+
+def process_paper(line, ids):
+    # Pull the id out with string splits; cheaper than json.loads() per record.
+    paper_id = line.split('"id":"', 1)[1].split('"', 1)[0]
+    if paper_id in ids:
+        print(paper_id)
+        del ids[paper_id]
+        write_paper(paper_id, line)
+
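+# Build the lookup of ids still to extract, skipping papers already on disk
+# so an interrupted run can resume without re-writing anything.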
+def load_id_lookup(fn):
+    lookup = {}
+    ids = read_json(fn)
+    skip_count = 0
+    save_count = 0
+    for paper_id in ids:
+        path = paper_path(paper_id)
+        if not os.path.exists(path):
+            lookup[paper_id] = True
+            save_count += 1
+        else:
+            skip_count += 1
+    print("finding {} ids ({} already pulled)".format(save_count, skip_count))
+    return lookup
+
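+# Output is sharded by the first two characters of the paper id (e.g. a
+# hypothetical id 'bd30...' lands in {DATA_DIR}/bd/bd30.../paper.json).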
+def paper_path(paper_id):
+    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
+
+def write_paper(paper_id, data):
+    out_dir = paper_path(paper_id)
+    fn = out_dir + '/paper.json'
+    if os.path.exists(fn):
+        return
+    os.makedirs(out_dir, exist_ok=True)
+    with open(fn, 'w') as f:
+        f.write(data)
+
+if __name__ == '__main__':
+    fetch_entries()
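+# Example invocation (the corpus and output paths above are machine-specific):
+#   python s2-extract-papers.py --fn ids.json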