summaryrefslogtreecommitdiff
path: root/scraper/s2-dump-missing-paper-ids.py
blob: 6f7eb8baf10ed758adc76365c5b55987f59bb967 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import gzip
import glob
import click
from util import *

# Root directories of the on-disk paper store.  The DB paper directory is
# currently disabled; only the raw paper directory is checked for existence.
# DB_PAPER_DIR = './datasets/s2/db_papers'
RAW_PAPER_DIR = './datasets/s2/raw_papers'

@click.command()
@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
def fetch_missing_entries(fn):
  """Write the IDs of papers that are missing on disk to ./missing.csv.

  `fn` is the path of the JSON file of paper IDs handed to load_missing_ids().
  Each missing ID becomes a single-column row; no header keys are passed.
  """
  single_column_rows = [[paper_id] for paper_id in load_missing_ids(fn)]
  write_csv('./missing.csv', keys=None, rows=single_column_rows)

def load_missing_ids(fn):
  """Collect the paper IDs that have no raw paper on disk.

  Two sources are checked: the IDs read from the JSON file `fn`, and every
  paper ID in the per-dataset lookups returned by
  fetch_verified_paper_lookup().  An ID counts as missing when the path
  built by make_raw_paper_path() does not exist.

  Prints one line per missing verified paper, then a summary line whose
  counts are de-duplicated so that the "must be fetched" number matches the
  number of IDs returned.

  Returns the keys view of the dict of missing paper IDs.
  """
  found_ids = set()
  missing_lookup = {}
  for paper_id in read_json(fn):
    # db papers are no longer consulted; only the raw paper store is checked.
    if os.path.exists(make_raw_paper_path(paper_id)):
      found_ids.add(paper_id)
    else:
      missing_lookup[paper_id] = True

  # The second return value (totals) is not needed here.
  verified_lookup, _ = fetch_verified_paper_lookup()
  for dataset, dataset_papers in verified_lookup.items():
    for paper_id in dataset_papers.keys():
      # NOTE(review): looks like leftover debug output for one dataset; kept
      # so the script's stdout is unchanged -- confirm whether still wanted.
      if dataset == 'brainwash':
        print('>> {} {}'.format(dataset, paper_id))
      # Cheap membership test first so known-missing IDs skip the stat call.
      if paper_id in missing_lookup:
        continue
      if not os.path.exists(make_raw_paper_path(paper_id)):
        print(">> {} {}".format(dataset, paper_id))
        missing_lookup[paper_id] = True

  # Derive the counts from the collections themselves so duplicate IDs in the
  # input file cannot inflate them relative to what is actually returned.
  print("{} papers found, {} must be fetched".format(len(found_ids), len(missing_lookup)))
  return missing_lookup.keys()

def make_db_paper_path(paper_id):
  """Return the DB paper path for `paper_id`.

  The path has the form <db dir>/<first two characters of the id>/<id>.
  """
  # The module-level DB_PAPER_DIR assignment in this file is commented out, so
  # referencing the bare name can raise NameError.  Use the module global when
  # one exists (e.g. supplied by a star-import), otherwise fall back to the
  # value from the commented-out definition.
  db_paper_dir = globals().get('DB_PAPER_DIR', './datasets/s2/db_papers')
  return '{}/{}/{}'.format(db_paper_dir, paper_id[0:2], paper_id)
def make_raw_paper_path(paper_id):
  """Return the raw paper path: RAW_PAPER_DIR/<first two chars of id>/<id>."""
  shard = paper_id[0:2]
  return '{root}/{shard}/{paper}'.format(root=RAW_PAPER_DIR, shard=shard, paper=paper_id)
  
# Invoke the click command only when this file is executed as a script.
if __name__ == '__main__':
  fetch_missing_entries()