diff options
Diffstat (limited to 'scraper/s2-extract-papers.py')
| -rw-r--r-- | scraper/s2-extract-papers.py | 62 |
1 files changed, 62 insertions, 0 deletions
import os
import gzip
import glob
import click
from util import *

# Location of the downloaded Semantic Scholar corpus shards (*.gz, one JSON record per line).
S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
# Root directory where extracted per-paper JSON records are written.
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'


@click.command()
@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
    """Scan every gzipped corpus shard and extract the papers whose ids are listed in *fn*.

    Ids already present on disk (see load_id_lookup) are skipped up front.
    """
    ids = load_id_lookup(fn)
    # Distinct loop variable: the original rebound `fn`, shadowing the CLI option value.
    for shard_fn in glob.iglob('{}/*.gz'.format(S2_DIR)):
        search_dataset_shard(shard_fn, ids)


def search_dataset_shard(fn, ids):
    """Stream one gzipped shard line-by-line and extract any wanted papers.

    Prints a progress marker every 1000 lines.

    Bug fix: the counter increment and progress print were previously
    outside the per-line loop, so `i` stayed at 1 and progress was never
    reported; they now run once per line as intended.
    """
    print(fn)
    i = 0
    with gzip.open(fn, 'r') as f:
        # Iterate the file object lazily instead of f.readlines():
        # readlines() would load the entire decompressed shard into memory.
        for line in f:
            i += 1
            if (i % 1000) == 0:
                print("{}...".format(i))
            process_paper(line.decode('UTF-8'), ids)


def process_paper(line, ids):
    """If this JSON line's paper id is in *ids*, persist it and remove it from *ids*.

    Raises IndexError if the line has no `"id":"..."` field — acceptable
    here because every corpus record carries an id.
    """
    # Cheap string scan for the id field; avoids json.loads on every line.
    paper_id = line.split('"id":"', 2)[1].split('"', 2)[0]
    if paper_id in ids:
        print(paper_id)
        del ids[paper_id]  # each wanted paper is written at most once
        write_paper(paper_id, line)


def load_id_lookup(fn):
    """Return {paper_id: True} for ids listed in *fn* that are not yet on disk.

    `read_json` comes from util (star-import); presumably it returns an
    iterable of id strings — TODO confirm against util.
    """
    lookup = {}
    ids = read_json(fn)
    skip_count = 0
    save_count = 0
    for paper_id in ids:
        path = paper_path(paper_id)
        if not os.path.exists(path):
            lookup[paper_id] = True
            save_count += 1
        else:
            skip_count += 1
    print("finding {} ids ({} already pulled)".format(save_count, skip_count))
    return lookup


def paper_path(paper_id):
    """Directory for a paper, sharded by the first two characters of its id."""
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)


def write_paper(paper_id, data):
    """Write the raw JSON line to <paper_dir>/paper.json unless it already exists."""
    out_dir = paper_path(paper_id)  # renamed from `dir`: avoid shadowing the builtin
    out_fn = out_dir + '/paper.json'
    if os.path.exists(out_fn):
        return
    os.makedirs(out_dir, exist_ok=True)
    with open(out_fn, 'w') as f:
        f.write(data)


if __name__ == '__main__':
    fetch_entries()
