import os
import re
from importlib import import_module

import click

from s2 import SemanticScholarAPI
from util import *

raw_paper_module = import_module('s2-raw-papers')

'''
s2 search API response fields:
    matchedAuthors
    matchedPresentations
    query
    querySuggestions
    results
    stats
    totalPages
    totalResults
'''

s2 = SemanticScholarAPI()


def fetch_query(query, since=None, refresh=False):
    # Strip characters the search endpoint doesn't accept.
    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
    yearFilter = {'min': since, 'max': 2020} if since else None

    results_path = './datasets/s2/search/{}'.format(clean_title)
    os.makedirs(results_path, exist_ok=True)

    page = 1
    total = 0
    paper_ids = {}
    while True:
        dump_fn = '{}/{}.json'.format(results_path, page)
        if not refresh and os.path.exists(dump_fn):
            # Reuse the cached response instead of hitting the API again.
            results = read_json(dump_fn)
        else:
            results = s2.search(q=clean_title, page=page, pageSize=10,
                                yearFilter=yearFilter)
            write_json(dump_fn, results)

        total += len(results['results'])
        if len(results['results']) == 0:
            break

        print("+ {} page {}".format(query, page))
        for result in results['results']:
            paper_id = result['id']
            if paper_id not in paper_ids:
                paper_ids[paper_id] = True

        if total >= results['totalResults']:
            break
        page += 1

    return paper_ids


@click.command()
@click.option('--refresh/--no-refresh', '-f', default=False,
              help='Force it to query the paper API again')
def search_deep(refresh):
    s2_queries = fetch_google_sheet_objects("s2_queries")
    os.makedirs('./datasets/s2/search_papers', exist_ok=True)

    for row in s2_queries:
        since = row['since']
        row_paper_ids = {}
        # Each sheet row can hold up to five queries: query1 .. query5.
        for i in range(1, 6):
            query = row['query{}'.format(i)]
            if query:
                paper_ids = fetch_query(query, since, refresh)
                for paper_id in paper_ids:
                    row_paper_ids[paper_id] = True

        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
        # parallelize(raw_paper_module.fetch_raw_paper, row_paper_ids.keys())


if __name__ == '__main__':
    search_deep()
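
# ---------------------------------------------------------------------------
# Reference sketch (assumptions, not part of this module): the helpers pulled
# in via `from util import *` are not defined in this file. Judging only from
# how they are called above, their shapes are roughly:
#
#   read_json(path)                    -> dict parsed from a cached JSON file
#   write_json(path, obj)              -> dump a JSON-serializable obj to disk
#   write_csv(path, keys, rows)        -> write an iterable of rows to a CSV
#   fetch_google_sheet_objects(name)   -> list of dicts, one per sheet row,
#                                         with keys like 'key', 'since',
#                                         'query1' .. 'query5'
#   parallelize(fn, items)             -> run fn over items concurrently
#
# Example invocation, assuming this file is saved as s2-search.py:
#   python s2-search.py --refresh     # re-query the API, ignoring the cache
#   python s2-search.py --no-refresh  # reuse cached pages where they exist
# ---------------------------------------------------------------------------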