# scraper/s2-search-deep.py
import os
import sys
import csv
import subprocess
import time
import random
import re
import simplejson as json
import click
from s2 import SemanticScholarAPI
from util import *
from importlib import import_module
raw_paper_module = import_module('s2-raw-papers')

'''
s2 search API response fields:
matchedAuthors
matchedPresentations
query
querySuggestions
results
stats
totalPages
totalResults
'''
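
# Illustrative shape of one search page (a sketch -- field values are made up;
# the code below only relies on 'results', each result's 'id', and 'totalResults'):
#
#   {
#     "totalResults": 137,
#     "totalPages": 14,
#     "results": [{"id": "<s2 paper id>", ...}, ...],
#     ...
#   }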

s2 = SemanticScholarAPI()

MAX_PAGES = 20
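# At pageSize=10 (see fetch_query below), this caps each query at ~200 results.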

def fetch_query(query, since=None, refresh=False):
  '''Page through the s2 search API for an exact-phrase query, caching each
  page on disk, and return a dict keyed by the unique paper ids found.'''
  # Strip everything but letters, digits, hyphens and spaces so the query can
  # double as a directory name and a quoted search phrase.
  clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
  yearFilter = {'min': since, 'max': 2020} if since else None
  results_path = './datasets/s2/search/{}'.format(clean_title)
  os.makedirs(results_path, exist_ok=True)
  page = 1
  total = 0
  paper_ids = {}

  while True:
    dump_fn = '{}/{}.json'.format(results_path, page)
    #print(dump_fn)
    if not refresh and os.path.exists(dump_fn):
      # Reuse the cached page unless a refresh was requested.
      results = read_json(dump_fn)
    else:
      q = '"{}"'.format(clean_title.replace(' ', '+'))
      results = s2.search(q=q, page=page, pageSize=10, yearFilter=yearFilter)
      write_json(dump_fn, results)
      time.sleep(5)  # be polite to the API between uncached requests

    #print(results)
    if not results or not results.get('results'):
      break

    total += len(results['results'])

    print("+ {} page {}".format(query, page))

    for result in results['results']:
      paper_id = result['id']
      if paper_id not in paper_ids:
        paper_ids[paper_id] = True
    page += 1
    if page > MAX_PAGES:
      break
    # Stop once we are within one page (pageSize 10) of the reported total.
    if total >= results['totalResults'] - 9:
      break
  return paper_ids
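
# Example (hypothetical phrase; same cache-then-fetch flow the CLI below uses):
#   ids = fetch_query('attention is all you need', since=2017)
#   ids is a dict keyed by s2 paper id, e.g. {'<paper id>': True, ...}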

@click.command()
@click.option('--refresh/--no-refresh', '-f', help='Force re-querying the search API instead of using cached pages')
def search_deep(refresh):
  '''For each row of the s2_queries sheet, run its query1..query5 searches,
  write the combined paper ids to disk and fetch the raw paper records.'''
  s2_queries = fetch_google_sheet_objects("s2_queries")
  os.makedirs('./datasets/s2/search_papers', exist_ok=True)
  for row in s2_queries:
    # Rows without a 'since' year are skipped entirely.
    since = row['since']
    if not since:
      continue
    since = int(since)
    row_paper_ids = {}
    for i in range(1, 6):
      query_key = 'query{}'.format(i)
      query = row[query_key]
      if query:
        paper_ids = fetch_query(query, since, refresh)
        for paper_id in paper_ids:
          row_paper_ids[paper_id] = True

    paper_ids = list(row_paper_ids.keys())
    if len(paper_ids):
      print("Writing {} paper ids".format(len(paper_ids)))
      row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
      write_json(row_fn, paper_ids)
      # Fetch the raw paper records for every id found for this row.
      parallelize(raw_paper_module.fetch_raw_paper, [(paper_id,) for paper_id in paper_ids])

if __name__ == '__main__':
  search_deep()
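
# Example invocation (a sketch; assumes it is run from the scraper/ directory so
# the s2, util and s2-raw-papers modules are importable, and that the
# "s2_queries" sheet exposes key, since and query1..query5 columns):
#   python s2-search-deep.py              # reuse cached search pages
#   python s2-search-deep.py --refresh    # re-query the search API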