import os
import gzip
import glob
import json

import click

# Paths to the Semantic Scholar corpus shards and the output directory.
S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'


@click.command()
@click.option('--input', '-i', 'fn', default='ids.json',
              help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
    ids = load_id_lookup(fn)
    # glob.iglob already yields full paths, so don't prepend S2_DIR again.
    for filename in glob.iglob('{}/*.gz'.format(S2_DIR)):
        search_dataset_shard(filename, ids)


def search_dataset_shard(fn, ids):
    # Open in text mode and stream the shard line by line instead of
    # loading the whole file into memory with readlines().
    with gzip.open(fn, 'rt') as f:
        for line in f:
            process_paper(line.rstrip('\n'), ids)


def process_paper(line, ids):
    # Each line is one JSON paper record; pull out the id with a string
    # split instead of parsing the full record.
    if '"id":"' not in line:
        return
    paper_id = line.split('"id":"', 1)[1].split('"', 1)[0]
    if paper_id in ids:
        print(paper_id)
        del ids[paper_id]
        write_paper(paper_id, line)
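# Design note: the split on '"id":"' is a shortcut that skips a full JSON
# parse for every record in the corpus. Assuming each line is a complete
# JSON object, the more robust (but slower) equivalent would be:
#
#   paper_id = json.loads(line).get('id')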
def load_id_lookup(fn):
    # Build a dict of the IDs we still need, skipping papers already on disk.
    lookup = {}
    ids = read_json(fn)
    for paper_id in ids:
        path = paper_path(paper_id)
        if not os.path.exists(path):
            lookup[paper_id] = True
    return lookup


def read_json(fn):
    # Assumes the input file is a JSON array of paper IDs.
    with open(fn) as f:
        return json.load(f)


def paper_path(paper_id):
    # Shard output directories by the first three characters of the ID,
    # e.g. DATA_DIR/649/649def.../
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)


def write_paper(paper_id, data):
    dirname = paper_path(paper_id)  # avoid shadowing the dir() builtin
    fn = dirname + '/paper.json'
    if os.path.exists(fn):
        return
    os.makedirs(dirname, exist_ok=True)
    # data is already a decoded str, so write in text mode.
    with open(fn, 'w') as f:
        f.write(data)


if __name__ == '__main__':
    fetch_entries()
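# Example invocation, assuming this script is saved as fetch_entries.py and
# ids.json is a JSON array of Semantic Scholar paper IDs:
#
#   python fetch_entries.py --input ids.json
#
# Matched records are printed as they are found and written to
# DATA_DIR/<first 3 chars of id>/<id>/paper.json.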