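"""scraper/s2-extract-papers.py

Extract individual paper records from a Semantic Scholar corpus dump.
Loads a list of paper ids, scans every gzipped shard in S2_DIR, and writes
each matching record to its own paper.json under DATA_DIR, sharded by the
first two characters of the paper id.
"""
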
import os
import gzip
import glob
import click
from util import read_json  # read_json is the only util helper this script uses

S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'

@click.command()
@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
def fetch_entries(fn):
  ids = load_id_lookup(fn)
  for shard_fn in glob.iglob('{}/*.gz'.format(S2_DIR)):
    if not ids:
      break  # every requested paper has been found
    search_dataset_shard(shard_fn, ids)

def search_dataset_shard(fn, ids):
  print(fn)
  i = 0
  with gzip.open(fn, 'r') as f:
    # Stream the shard line by line; each line is one JSON paper record.
    for line in f:
      i += 1
      if (i % 1000) == 0:
        print("{}...".format(i))
      process_paper(line.decode('UTF-8'), ids)

def process_paper(line, ids):
  # Pull the paper id with a plain string split rather than parsing the full
  # JSON record; this keeps the scan over millions of lines fast.
  if '"id":"' not in line:
    return
  paper_id = line.split('"id":"', 1)[1].split('"', 1)[0]
  if paper_id in ids:
    print(paper_id)
    del ids[paper_id]
    write_paper(paper_id, line)

def load_id_lookup(fn):
  lookup = {}
  ids = read_json(fn)
  skip_count = 0
  save_count = 0
  for paper_id in ids:
    path = paper_path(paper_id)
    if not os.path.exists(path):
      lookup[paper_id] = True
      save_count += 1
    else:
      skip_count += 1
  print("finding {} ids ({} already pulled)".format(save_count, skip_count))
  return lookup
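
# Note: read_json is imported from the local util module. A minimal sketch of
# the assumed helper (an assumption for reference, not the actual util code):
#
#   def read_json(fn):
#       import json
#       with open(fn) as f:
#           return json.load(f)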

def paper_path(paper_id):
  # Shard output directories by the first two characters of the paper id.
  return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)

def write_paper(paper_id, data):
  paper_dir = paper_path(paper_id)  # avoid shadowing the built-in dir()
  fn = paper_dir + '/paper.json'
  if os.path.exists(fn):
    return
  os.makedirs(paper_dir, exist_ok=True)
  with open(fn, 'w') as f:
    f.write(data)

if __name__ == '__main__':
  fetch_entries()
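
# Example usage (the default ids.json id list is assumed to be produced by an
# earlier stage of the scraper):
#   python s2-extract-papers.py --fn ids.json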