import os
import sys
import csv
import subprocess
import time
import random
import re

import simplejson as json
import click

from s2 import SemanticScholarAPI
from util import *
from importlib import import_module

# The raw-paper fetcher lives in a module whose filename contains dashes,
# so it cannot be imported with a plain `import` statement.
raw_paper_module = import_module('s2-raw-papers')

'''
s2 search API format:
    results
        matchedAuthors
        matchedPresentations
        query
        querySuggestions
        results
        stats
        totalPages
        totalResults
'''

s2 = SemanticScholarAPI()


def fetch_query(query, since=None, refresh=False):
    """Page through the S2 search API for *query* and collect paper ids.

    Each page of results is cached as JSON under ./datasets/s2/search/<title>/,
    so repeated runs are served from disk unless *refresh* is true.

    Args:
        query: Free-text search string; non-alphanumeric characters are stripped.
        since: Optional minimum publication year; when set, results are
            filtered to the range [since, 2020].
        refresh: When true, ignore the on-disk cache and re-query the API.

    Returns:
        Dict mapping paper id -> True for every unique paper id found.
    """
    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
    # NOTE(review): upper bound of 2020 is hard-coded; presumably the dataset
    # cutoff — confirm before reusing beyond that year.
    yearFilter = {'min': since, 'max': 2020} if since else None

    results_path = './datasets/s2/search/{}'.format(clean_title)
    os.makedirs(results_path, exist_ok=True)

    page = 1
    total = 0
    paper_ids = {}
    while True:
        dump_fn = '{}/{}.json'.format(results_path, page)
        if not refresh and os.path.exists(dump_fn):
            results = read_json(dump_fn)
        else:
            results = s2.search(q=clean_title.replace(' ', '+'), page=page,
                                pageSize=10, yearFilter=yearFilter)
            write_json(dump_fn, results)
            # Throttle live API calls to stay polite with the service.
            time.sleep(5)

        if not results or len(results['results']) == 0:
            break

        total += len(results['results'])
        print("+ {} page {}".format(query, page))
        for result in results['results']:
            paper_id = result['id']
            if paper_id not in paper_ids:
                paper_ids[paper_id] = True

        page += 1
        # Stop once the last (possibly partial) page of 10 has been consumed.
        if total >= results['totalResults'] - 9:
            break

    return paper_ids


@click.command()
@click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
def search_deep(refresh):
    """Run every configured S2 query and fetch the raw papers found.

    Reads rows from the "s2_queries" Google sheet; each row supplies up to
    five queries (query1..query5) plus a 'since' year and a 'key'. The union
    of paper ids per row is written to ./datasets/s2/search_papers/<key>.json
    and each paper is fetched in parallel.
    """
    s2_queries = fetch_google_sheet_objects("s2_queries")
    os.makedirs('./datasets/s2/search_papers', exist_ok=True)

    for row in s2_queries:
        since = row['since']
        if not since:
            # Rows without a 'since' year are treated as disabled.
            continue
        since = int(since)

        # Union of paper ids across this row's queries (dict used as a set).
        row_paper_ids = {}
        for i in range(1, 6):
            query = row['query{}'.format(i)]
            if query:
                for paper_id in fetch_query(query, since, refresh):
                    row_paper_ids[paper_id] = True

        paper_ids = list(row_paper_ids.keys())
        if len(paper_ids):
            print("Writing {} paper ids".format(len(paper_ids)))
            row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
            write_json(row_fn, paper_ids)
            parallelize(raw_paper_module.fetch_raw_paper,
                        [(paper_id,) for paper_id in paper_ids])


if __name__ == '__main__':
    search_deep()