path: root/scraper/s2-search-deep.py
import os
import sys
import csv
import subprocess
import time
import random
import re
import simplejson as json
import click
from s2 import SemanticScholarAPI
from util import *
from importlib import import_module
# The module filename contains a hyphen, so it cannot be imported with a normal
# `import` statement; load it dynamically instead.
raw_paper_module = import_module('s2-raw-papers')

'''
Top-level keys in an S2 search API response:
  matchedAuthors
  matchedPresentations
  query
  querySuggestions
  results
  stats
  totalPages
  totalResults
'''
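
# A minimal sketch of the response shape this script actually relies on (the
# real payload carries more fields per result than shown here):
#
#   {
#     'results': [{'id': '<paper id>', ...}, ...],
#     'totalResults': 1234
#   }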

s2 = SemanticScholarAPI()

MAX_PAGES = 20  # cap each query at 20 pages (pageSize=10, so at most 200 results)

def fetch_query(query, since=None, refresh=False):
  '''Page through the S2 search API for `query`, caching each page to disk,
  and return a dict whose keys are the unique paper ids found.'''
  # Keep only letters, digits, hyphens and spaces so the query is safe to use
  # as a directory name and as a quoted search string.
  clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
  if since:
    # Restrict results to papers published from `since` through 2020 (hard-coded upper bound).
    yearFilter = {'min': since, 'max': 2020}
  else:
    yearFilter = None
  results_path = './datasets/s2/search/{}'.format(clean_title)
  os.makedirs(results_path, exist_ok=True)
  page = 1
  total = 0
  paper_ids = {}

  while True:
    dump_fn = '{}/{}.json'.format(results_path, page)
    if not refresh and os.path.exists(dump_fn):
      # Reuse the page cached by a previous run.
      results = read_json(dump_fn)
    else:
      # Quote the query (with '+' for spaces), fetch the page, and cache it to disk.
      q = '"{}"'.format(clean_title.replace(' ', '+'))
      results = s2.search(q=q, page=page, pageSize=10, yearFilter=yearFilter)
      write_json(dump_fn, results)
      time.sleep(5)  # throttle uncached requests

    # Stop once the API returns no results for this page.
    if not results or len(results['results']) == 0:
      break

    total += len(results['results'])

    print("+ {} page {}".format(query, page))

    # Collect unique paper ids across pages.
    for result in results['results']:
      paper_id = result['id']
      if paper_id not in paper_ids:
        paper_ids[paper_id] = True
    page += 1
    if page > MAX_PAGES:
      break
    # With pageSize=10, being within 9 of totalResults means this was the last page.
    if total >= results['totalResults'] - 9:
      break
  return paper_ids
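
# Example (hypothetical query string) of calling fetch_query() directly:
#
#   ids = fetch_query('semantic scholar api', since=2015)
#   print('{} unique paper ids'.format(len(ids)))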

@click.command()
@click.option('--refresh/--no-refresh', '-f', help='Force fresh queries against the search API even when cached pages exist')
def search_deep(refresh):
  '''For each row of the "s2_queries" Google sheet, run its search queries
  against S2, collect the unique paper ids, and fetch the raw paper records.'''
  s2_queries = fetch_google_sheet_objects("s2_queries")
  os.makedirs('./datasets/s2/search_papers', exist_ok=True)
  for row in s2_queries:
    # Skip rows without a 'since' value; a '#N/A' cell means "no year filter".
    since = row['since']
    if not since:
      continue
    if since == '#N/A':
      since = None
    else:
      since = int(since)
    # Each row can carry up to five queries in columns query1..query5.
    row_paper_ids = {}
    for i in range(1, 6):
      query_key = 'query{}'.format(i)
      query = row[query_key]
      if query:
        paper_ids = fetch_query(query, since, refresh)
        for paper_id in paper_ids:
          row_paper_ids[paper_id] = True

    paper_ids = list(row_paper_ids.keys())
    if len(paper_ids):
      # Save the deduplicated ids for this row, then fetch the full paper
      # records in parallel via s2-raw-papers.fetch_raw_paper.
      print("Writing {} paper ids".format(len(paper_ids)))
      row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
      write_json(row_fn, paper_ids)
      parallelize(raw_paper_module.fetch_raw_paper, [(id,) for id in paper_ids])

if __name__ == '__main__':
  search_deep()
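
# Usage (paths as laid out in this repository; the --refresh flag is defined above):
#   python scraper/s2-search-deep.py              # reuse cached search pages where available
#   python scraper/s2-search-deep.py --refresh    # force fresh queries against the S2 search API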