import os
import sys
import csv
import subprocess
import time
import random
import re

import simplejson as json
import click

from s2 import SemanticScholarAPI
from util import *
from importlib import import_module

# The raw-paper fetcher lives in a module whose filename contains dashes,
# so it cannot be imported with a plain `import` statement.
raw_paper_module = import_module('s2-raw-papers')

'''
s2 search API format:
    results
        matchedAuthors
        matchedPresentations
        query
        querySuggestions
        results
        stats
        totalPages
        totalResults
'''

s2 = SemanticScholarAPI()


def fetch_query(query, since=None, refresh=False):
    """Page through the S2 search API for *query* and collect paper ids.

    Each page of results is cached as JSON under ./datasets/s2/search/<title>/,
    so repeated runs are served from disk unless *refresh* is true.

    Args:
        query: Free-text search string; non-alphanumeric characters are stripped.
        since: Optional minimum publication year; when set, results are
            filtered to the range [since, 2020].
        refresh: When true, ignore the on-disk cache and re-query the API.

    Returns:
        Dict mapping paper id -> True for every unique paper id found.
    """
    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', query)
    # NOTE(review): upper bound of 2020 is hard-coded; presumably the dataset
    # cutoff — confirm before reusing beyond that year.
    yearFilter = {'min': since, 'max': 2020} if since else None

    results_path = './datasets/s2/search/{}'.format(clean_title)
    os.makedirs(results_path, exist_ok=True)

    page = 1
    total = 0
    paper_ids = {}
    while True:
        dump_fn = '{}/{}.json'.format(results_path, page)
        if not refresh and os.path.exists(dump_fn):
            results = read_json(dump_fn)
        else:
            results = s2.search(q=clean_title.replace(' ', '+'), page=page,
                                pageSize=10, yearFilter=yearFilter)
            write_json(dump_fn, results)
            # Throttle live API calls to stay polite with the service.
            time.sleep(5)

        if not results or len(results['results']) == 0:
            break

        total += len(results['results'])
        print("+ {} page {}".format(query, page))
        for result in results['results']:
            paper_id = result['id']
            if paper_id not in paper_ids:
                paper_ids[paper_id] = True

        page += 1
        # Stop once the last (possibly partial) page of 10 has been consumed.
        if total >= results['totalResults'] - 9:
            break

    return paper_ids


@click.command()
@click.option('--refresh/--no-refresh', '-f', help='Force it to query the paper API again')
def search_deep(refresh):
    """Run every configured S2 query and fetch the raw papers found.

    Reads rows from the "s2_queries" Google sheet; each row supplies up to
    five queries (query1..query5) plus a 'since' year and a 'key'. The union
    of paper ids per row is written to ./datasets/s2/search_papers/<key>.json
    and each paper is fetched in parallel.
    """
    s2_queries = fetch_google_sheet_objects("s2_queries")
    os.makedirs('./datasets/s2/search_papers', exist_ok=True)

    for row in s2_queries:
        since = row['since']
        if not since:
            # Rows without a 'since' year are treated as disabled.
            continue
        since = int(since)

        # Union of paper ids across this row's queries (dict used as a set).
        row_paper_ids = {}
        for i in range(1, 6):
            query = row['query{}'.format(i)]
            if query:
                for paper_id in fetch_query(query, since, refresh):
                    row_paper_ids[paper_id] = True

        paper_ids = list(row_paper_ids.keys())
        if len(paper_ids):
            print("Writing {} paper ids".format(len(paper_ids)))
            row_fn = './datasets/s2/search_papers/{}.json'.format(row['key'])
            write_json(row_fn, paper_ids)
            parallelize(raw_paper_module.fetch_raw_paper,
                        [(paper_id,) for paper_id in paper_ids])


if __name__ == '__main__':
    search_deep()