diff options
| -rw-r--r-- | s2-extract-papers.py | 37 |
1 files changed, 37 insertions, 0 deletions
"""Extract selected Semantic Scholar records into per-paper JSON files.

Provenance: s2-extract-papers.py (new file, blob fb5a8804).

The script reads a list of paper ids (one per line) from ``ids.txt``, scans a
gzip-compressed file of records (one record per line), and writes every
record whose id is in the list to
``DATA_DIR/papers/<first 3 chars of id>/<id>/paper.json``.
"""

import os
import gzip

S2_DIR = '/media/blue/undisclosed/semantic-scholar'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev'

# Ids still to be found. Populated from ids.txt when run as a script; kept as
# a module-level name so process_paper(line) keeps working with one argument.
ids = {}

_ID_MARKER = '"id":"'


def find_ids(fn, ids):
    """Scan the gzip file *fn* and save every record whose id is in *ids*.

    Args:
        fn: Path to a gzip-compressed text file with one record per line.
        ids: Dict keyed by the wanted paper ids. Matched ids are removed
            from it as they are found.
    """
    # Text mode yields str lines. Reading bytes and calling str() on them
    # would store the "b'...'" repr instead of the record itself.
    with gzip.open(fn, 'rt', encoding='utf-8') as f:
        # Iterate lazily instead of readlines() so the archive is never
        # held in memory all at once.
        for line in f:
            if not ids:
                # Nothing left to find; no need to read the rest.
                break
            process_paper(line, ids)


def _extract_paper_id(line):
    """Return the text between the first '"id":"' and the next quote.

    Returns None when *line* contains no '"id":"' marker.
    """
    _, marker, rest = line.partition(_ID_MARKER)
    if not marker:
        return None
    return rest.partition('"')[0]


def process_paper(line, wanted_ids=None):
    """Write *line* to disk if its paper id is one of the wanted ids.

    Args:
        line: One record, as text.
        wanted_ids: Dict keyed by wanted paper ids. Defaults to the
            module-level ``ids`` when omitted. A matched id is deleted from
            the dict so that it is only handled once.
    """
    if wanted_ids is None:
        wanted_ids = ids
    paper_id = _extract_paper_id(line)
    # Lines without an id (blank or malformed) are skipped rather than
    # aborting the whole scan.
    if paper_id is None or paper_id not in wanted_ids:
        return
    print(paper_id)
    del wanted_ids[paper_id]
    write_file(paper_id, line)


def read_lines(fn):
    """Return a dict mapping each non-blank, stripped line of *fn* to True."""
    lookup = {}
    with open(fn, 'r') as f:
        for line in f:
            key = line.strip()
            if key:  # a blank line is not an id
                lookup[key] = True
    return lookup


def write_file(paper_id, data):
    """Write *data* to the paper.json file for *paper_id* under DATA_DIR.

    The file lives at DATA_DIR/papers/<paper_id[0:3]>/<paper_id>/paper.json.
    An existing file is left untouched.
    """
    paper_dir = os.path.join(DATA_DIR, 'papers', paper_id[0:3], paper_id)
    fn = os.path.join(paper_dir, 'paper.json')
    if os.path.exists(fn):
        return
    os.makedirs(paper_dir, exist_ok=True)
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(data)


if __name__ == '__main__':
    ids = read_lines("ids.txt")
    find_ids(S2_DIR + '/sample-S2-records.gz', ids)
