import os

import click

from util import read_json, write_csv
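
# read_json and write_csv are assumed to be small helpers exported by the
# project's util module: read_json parses a JSON file into Python objects
# (here, a list of paper-ID strings) and write_csv writes rows out as CSV,
# with keys=None taken to mean "no header row".

# Papers are sharded into per-prefix subdirectories named after the first
# two characters of each paper ID (see the path helpers below).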
DB_PAPER_DIR = './datasets/s2/db_papers'
RAW_PAPER_DIR = './datasets/s2/raw_papers'


@click.command()
@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
def fetch_missing_entries(fn):
    missing_ids = load_missing_ids(fn)
    write_csv('./missing.csv', keys=None, rows=[[paper_id] for paper_id in missing_ids])
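

# load_missing_ids checks each paper ID against both the db_papers and
# raw_papers trees; an ID counts as found if a file exists in either place.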
def load_missing_ids(fn):
    missing_lookup = {}
    ids = read_json(fn)
    found_count = 0
    missing_count = 0
    for paper_id in ids:
        db_paper_path = make_db_paper_path(paper_id)
        raw_paper_path = make_raw_paper_path(paper_id)
        if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
            found_count += 1
        else:
            # A dict keeps one entry per ID, so duplicate IDs in the input
            # are only reported once.
            missing_lookup[paper_id] = True
            missing_count += 1
    print("{} papers found, {} must be fetched".format(found_count, missing_count))
    return list(missing_lookup.keys())


def make_db_paper_path(paper_id):
    return '{}/{}/{}'.format(DB_PAPER_DIR, paper_id[0:2], paper_id)


def make_raw_paper_path(paper_id):
    return '{}/{}/{}'.format(RAW_PAPER_DIR, paper_id[0:2], paper_id)
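

# For example, a paper ID '0a1b2c...' (truncated here for illustration)
# maps to './datasets/s2/db_papers/0a/0a1b2c...'.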


if __name__ == '__main__':
    fetch_missing_entries()
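
# Example invocation, assuming ids.json is a JSON array of paper-ID strings
# and this file is saved as, say, fetch_missing.py (filename hypothetical):
#
#   python fetch_missing.py --fn ids.json
#
# The script prints found/missing counts and writes ./missing.csv with one
# missing ID per row, ready to drive a follow-up fetch step.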
|