diff options
Diffstat (limited to 'scraper/s2-dump-missing-paper-ids.py')
| -rw-r--r-- | scraper/s2-dump-missing-paper-ids.py | 40 |
1 files changed, 40 insertions, 0 deletions
# Source: scraper/s2-dump-missing-paper-ids.py
"""Dump the paper IDs that are not yet present in the local S2 paper folders.

Reads a JSON file of paper IDs, checks each ID against the two on-disk paper
directories below, and writes every ID found in neither to ``./missing.csv``.
"""
import os
import gzip
import glob
import click
from util import *

DB_PAPER_DIR = './datasets/s2/db_papers'
RAW_PAPER_DIR = './datasets/s2/raw_papers'


@click.command()
@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
def fetch_missing_entries(fn):
    """Write the IDs from *fn* that have no local paper entry to ./missing.csv.

    Args:
        fn: Path to the JSON file of paper IDs (``--fn`` / ``-f`` on the
            command line, defaulting to ``ids.json``).
    """
    missing_ids = load_missing_ids(fn)
    # One single-column row per ID; `paper_id` avoids shadowing the builtin `id`.
    # NOTE(review): write_csv is presumably provided by `from util import *`.
    write_csv('./missing.csv', keys=None, rows=[[paper_id] for paper_id in missing_ids])


def load_missing_ids(fn):
    """Return the IDs listed in *fn* that exist in neither paper directory.

    An ID counts as found when either its DB path or its raw path exists on
    disk. Each distinct ID is checked once, so the printed totals describe
    unique IDs and the "must be fetched" number matches the size of the
    returned collection.

    Args:
        fn: Path to the JSON file of paper IDs, loaded with ``read_json``
            (presumably provided by ``from util import *``).

    Returns:
        A keys view over the missing IDs, in first-seen order.
    """
    found = set()
    # A dict (rather than a set) de-duplicates while preserving input order.
    missing = {}
    for paper_id in read_json(fn):
        if paper_id in found or paper_id in missing:
            # Already classified; skip the repeated filesystem checks.
            continue
        db_paper_path = make_db_paper_path(paper_id)
        raw_paper_path = make_raw_paper_path(paper_id)
        if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
            found.add(paper_id)
        else:
            missing[paper_id] = True
    print("{} papers found, {} must be fetched".format(len(found), len(missing)))
    return missing.keys()


def _make_paper_path(base_dir, paper_id):
    """Build ``<base_dir>/<first two chars of paper_id>/<paper_id>``."""
    return '{}/{}/{}'.format(base_dir, paper_id[0:2], paper_id)


def make_db_paper_path(paper_id):
    """Return the path for *paper_id* under ``DB_PAPER_DIR``."""
    return _make_paper_path(DB_PAPER_DIR, paper_id)


def make_raw_paper_path(paper_id):
    """Return the path for *paper_id* under ``RAW_PAPER_DIR``."""
    return _make_paper_path(RAW_PAPER_DIR, paper_id)


if __name__ == '__main__':
    fetch_missing_entries()
