diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-03 18:10:21 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-03 18:10:21 +0100 |
| commit | 753e41d819030a62418705fc4484d9303e3e1a00 (patch) | |
| tree | 40a13336278e330de882de3cca90134286d8c952 /s2-dump-missing-paper-ids.py | |
| parent | aa0470a3076f5ac65a0311c76e58254547f3eae0 (diff) | |
scripts to fetch papers from main s2 api
Diffstat (limited to 's2-dump-missing-paper-ids.py')
| -rw-r--r-- | s2-dump-missing-paper-ids.py | 40 |
1 files changed, 40 insertions, 0 deletions
```diff
diff --git a/s2-dump-missing-paper-ids.py b/s2-dump-missing-paper-ids.py
new file mode 100644
index 00000000..72ff1c44
--- /dev/null
+++ b/s2-dump-missing-paper-ids.py
@@ -0,0 +1,40 @@
+import os
+import gzip
+import glob
+import click
+from util import *
+
+DB_PAPER_DIR = './datasets/s2/db_papers'
+RAW_PAPER_DIR = './datasets/s2/raw_papers'
+
+@click.command()
+@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
+def fetch_missing_entries(fn):
+  missing_ids = load_missing_ids(fn)
+  write_csv('./missing.csv', [[id] for id in missing_ids])
+
+def load_missing_ids(fn):
+  lookup = {}
+  missing_lookup = {}
+  ids = read_json(fn)
+  found_count = 0
+  missing_count = 0
+  for paper_id in ids:
+    db_paper_path = make_db_paper_path(paper_id)
+    raw_paper_path = make_raw_paper_path(paper_id)
+    if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
+      lookup[paper_id] = True
+      found_count += 1
+    else:
+      missing_lookup[paper_id] = True
+      missing_count += 1
+  print("{} papers found, {} must be fetched".format(found_count, missing_count))
+  return missing_lookup.keys()
+
+def make_db_paper_path(paper_id):
+  return '{}/{}/{}'.format(DB_PAPER_DIR, paper_id[0:2], paper_id)
+def make_raw_paper_path(paper_id):
+  return '{}/{}/{}'.format(RAW_PAPER_DIR, paper_id[0:2], paper_id)
+
+if __name__ == '__main__':
+  fetch_missing_entries()
```
