import os
import re
from importlib import import_module

import click

from s2 import SemanticScholarAPI
from util import *

raw_paper_module = import_module('s2-raw-papers')

'''
s2 search API response fields:
    matchedAuthors
    matchedPresentations
    query
    querySuggestions
    results
    stats
    totalPages
    totalResults
'''

s2 = SemanticScholarAPI()


def fetch_query(query, since=None, refresh=False):
    # Strip characters the search endpoint doesn't accept.
    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
    yearFilter = {'min': since, 'max': 2020} if since else None

    results_path = './datasets/s2/search/{}'.format(clean_title)
    os.makedirs(results_path, exist_ok=True)

    page = 1
    total = 0
    paper_ids = {}
    while True:
        dump_fn = '{}/{}.json'.format(results_path, page)
        if not refresh and os.path.exists(dump_fn):
            # Reuse the cached response instead of hitting the API again.
            results = read_json(dump_fn)
        else:
            results = s2.search(q=clean_title, page=page, pageSize=10,
                                yearFilter=yearFilter)
            write_json(dump_fn, results)

        total += len(results['results'])
        if len(results['results']) == 0:
            break

        print("+ {} page {}".format(query, page))
        for result in results['results']:
            paper_id = result['id']
            if paper_id not in paper_ids:
                paper_ids[paper_id] = True

        if total >= results['totalResults']:
            break
        page += 1

    return paper_ids


@click.command()
@click.option('--refresh/--no-refresh', '-f', default=False,
              help='Force it to query the paper API again')
def search_deep(refresh):
    s2_queries = fetch_google_sheet_objects("s2_queries")
    os.makedirs('./datasets/s2/search_papers', exist_ok=True)

    for row in s2_queries:
        since = row['since']
        row_paper_ids = {}
        # Each sheet row can hold up to five queries: query1 .. query5.
        for i in range(1, 6):
            query = row['query{}'.format(i)]
            if query:
                paper_ids = fetch_query(query, since, refresh)
                for paper_id in paper_ids:
                    row_paper_ids[paper_id] = True

        row_fn = './datasets/s2/search_papers/{}'.format(row['key'])
        write_csv(row_fn, keys=False, rows=row_paper_ids.keys())
        # parallelize(raw_paper_module.fetch_raw_paper, row_paper_ids.keys())


if __name__ == '__main__':
    search_deep()
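
# ---------------------------------------------------------------------------
# Reference sketch (assumptions, not part of this module): the helpers pulled
# in via `from util import *` are not defined in this file. Judging only from
# how they are called above, their shapes are roughly:
#
#   read_json(path)                    -> dict parsed from a cached JSON file
#   write_json(path, obj)              -> dump a JSON-serializable obj to disk
#   write_csv(path, keys, rows)        -> write an iterable of rows to a CSV
#   fetch_google_sheet_objects(name)   -> list of dicts, one per sheet row,
#                                         with keys like 'key', 'since',
#                                         'query1' .. 'query5'
#   parallelize(fn, items)             -> run fn over items concurrently
#
# Example invocation, assuming this file is saved as s2-search.py:
#   python s2-search.py --refresh     # re-query the API, ignoring the cache
#   python s2-search.py --no-refresh  # reuse cached pages where they exist
# ---------------------------------------------------------------------------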