diff options
| author | jules@lens <julescarbon@gmail.com> | 2018-11-02 21:38:17 +0100 |
|---|---|---|
| committer | jules@lens <julescarbon@gmail.com> | 2018-11-02 21:38:17 +0100 |
| commit | 683a20d4d29958132fd49ddaeebf3d4f672085b7 (patch) | |
| tree | 872b4a91440d9898c721befeb1bd9134d7c5f2a9 /s2-extract-papers.py | |
| parent | 022e7408e7a4519877ac3b240e6a55c0fb6dbfc0 (diff) | |
s2 dataset search script
Diffstat (limited to 's2-extract-papers.py')
| -rw-r--r-- | s2-extract-papers.py | 37 |
1 files changed, 37 insertions, 0 deletions
# Reconstructed from the "new file" diff of s2-extract-papers.py
# (mode 100644, original blob fb5a8804).
"""Pull selected records out of a gzipped, line-oriented S2 dump.

The ids to look for are read from ``ids.txt`` (one per line).  Every line of
the dump whose ``"id":"..."`` value is one of those ids is copied verbatim to
``DATA_DIR/papers/<first 3 chars of id>/<id>/paper.json``.

NOTE(review): the dump is presumably one JSON object per line (the code only
relies on each line containing an ``"id":"`` marker) -- confirm against the
dataset documentation.
"""

import gzip
import os

S2_DIR = '/media/blue/undisclosed/semantic-scholar'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev'

# Text that immediately precedes the id value inside a record line.
ID_MARKER = '"id":"'


def find_ids(fn, ids):
    """Scan the gzipped dump ``fn`` and save every record whose id is in ``ids``.

    Args:
        fn: Path to a gzip file holding one record per line.
        ids: Dict keyed by the ids still being looked for.  Matching keys are
            removed as their records are found, so on return it holds only
            the ids that were not present in the dump.
    """
    # Open in text mode: a binary handle yields ``bytes``, and calling
    # ``str()`` on those produces the ``b'...'`` repr rather than the record.
    # Assumes the dump is UTF-8 encoded -- TODO confirm.
    with gzip.open(fn, 'rt', encoding='utf-8') as f:
        # Stream line by line instead of loading the whole dump in memory.
        for line in f:
            if not ids:
                # Everything requested has been found; no need to read on.
                break
            process_paper(line, ids)


def _extract_id(line):
    """Return the value following the first ``"id":"`` in ``line``, or None."""
    _, marker, rest = line.partition(ID_MARKER)
    if not marker:
        return None
    return rest.split('"', 1)[0]


def process_paper(line, wanted=None):
    """Save ``line`` to disk if its id is one of the wanted ids.

    Args:
        line: One record from the dump, as text.
        wanted: Dict keyed by the ids being looked for.  When omitted, the
            module-level ``ids`` dict is used, as in the original script.

    Lines without an ``"id":"`` marker are skipped instead of raising.
    """
    if wanted is None:
        # Backward compatibility with the one-argument call form.
        wanted = ids
    paper_id = _extract_id(line)
    if paper_id is None or paper_id not in wanted:
        return
    print(paper_id)
    # Drop the id so a repeated record is not processed twice.
    del wanted[paper_id]
    write_file(paper_id, line)


def read_lines(fn):
    """Read ``fn`` and return a dict mapping each non-blank stripped line to True."""
    lookup = {}
    with open(fn, 'r') as f:
        for line in f:
            key = line.strip()
            # A blank line would otherwise register the empty string as an id.
            if key:
                lookup[key] = True
    return lookup


def write_file(paper_id, data):
    """Write ``data`` to the ``paper.json`` file for ``paper_id`` under DATA_DIR.

    An existing ``paper.json`` is left untouched.  Missing parent directories
    are created.
    """
    paper_dir = '{}/papers/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id)
    fn = paper_dir + '/paper.json'
    if os.path.exists(fn):
        return
    os.makedirs(paper_dir, exist_ok=True)
    # Written as UTF-8 to match how the dump is decoded in find_ids.
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(data)


if __name__ == '__main__':
    ids = read_lines("ids.txt")
    find_ids(S2_DIR + '/sample-S2-records.gz', ids)
