import os
import time
import re
import click
from importlib import import_module

from s2 import SemanticScholarAPI
from util import *

# The sibling module's file name contains hyphens, so it cannot be imported
# with a plain `import` statement.
raw_paper_module = import_module('s2-raw-papers')

'''
s2 search API format:

results
    matchedAuthors
    matchedPresentations
    query
    querySuggestions
    results
    stats
    totalPages
    totalResults
'''
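
# Illustrative sketch of one response page, inferred from the key list above
# and the fields this script actually reads (results[].id, totalResults); the
# rest of the shape and all values are assumptions, not documented schema:
#
#   {
#       "query": "deep learning",
#       "totalResults": 137,
#       "totalPages": 14,
#       "results": [{"id": "0123abcd...", ...}, ...],
#       "stats": {...},
#       "matchedAuthors": [...],
#       "matchedPresentations": [...],
#       "querySuggestions": [...]
#   }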

s2 = SemanticScholarAPI()
MAX_PAGES = 20  # safety cap: with pageSize=10, at most ~200 results per query


def fetch_query(query, since=None, refresh=False):
    '''Page through the s2 search API for `query`, caching each raw page to
    disk, and return a dict keyed by the de-duplicated paper ids found.'''
    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
    yearFilter = {'min': since, 'max': 2020} if since else None
    results_path = './datasets/s2/search/{}'.format(clean_title)
    os.makedirs(results_path, exist_ok=True)
    page = 1
    total = 0
    paper_ids = {}
    while True:
        dump_fn = '{}/{}.json'.format(results_path, page)
        if not refresh and os.path.exists(dump_fn):
            # Reuse the page cached on disk instead of hitting the API.
            results = read_json(dump_fn)
        else:
            results = s2.search(q=clean_title.replace(' ', '+'), page=page,
                                pageSize=10, yearFilter=yearFilter)
            write_json(dump_fn, results)
            time.sleep(5)  # throttle uncached requests
        if not results or len(results['results']) == 0:
            break
        total += len(results['results'])
        print("+ {} page {}".format(query, page))
        for result in results['results']:
            paper_ids[result['id']] = True
        page += 1
        if page > MAX_PAGES:
            break
        # With pageSize=10, being within 9 of totalResults means this was the
        # last page.
        if total >= results['totalResults'] - 9:
            break
    return paper_ids
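
# Example use (hypothetical query string), assuming the cache layout above:
#
#   ids = fetch_query('neural architecture search', since=2017)
#   # ids -> {'<paper id>': True, ...}, de-duplicated across up to MAX_PAGES
#   # pages, with each raw page cached under ./datasets/s2/search/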

@click.command()
@click.option('--refresh/--no-refresh', '-f', help='Force it to query the search API again')
def search_deep(refresh):
    s2_queries = fetch_google_sheet_objects("s2_queries")
    os.makedirs('./datasets/s2/search_papers', exist_ok=True)
    for row in s2_queries:
        since = row['since']
        if not since:
            continue
        since = int(since)
        # Collect de-duplicated paper ids across up to five queries per row.
        row_paper_ids = {}
        for i in range(1, 6):
            query = row['query{}'.format(i)]
            if query:
                for paper_id in fetch_query(query, since, refresh):
                    row_paper_ids[paper_id] = True
        paper_ids = list(row_paper_ids.keys())
        if len(paper_ids):
            print("Writing {} paper ids".format(len(paper_ids)))
            row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
            write_json(row_fn, paper_ids)
            # Fetch the full raw-paper record for each id in parallel.
            parallelize(raw_paper_module.fetch_raw_paper, [(paper_id,) for paper_id in paper_ids])


if __name__ == '__main__':
    search_deep()
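
# Typical invocation (script name assumed; not confirmed by the source):
#
#   python s2-search.py              # reuse cached search pages where present
#   python s2-search.py --refresh    # force fresh queries against the API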