import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess
from util import *

DIR_PUBLIC_CITATIONS = "../site/datasets/citations"

paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_id')


@click.command()
def s2_citation_report():
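    """Build HTML citation reports for every MegaPixels paper, write the
    master datasets CSV, and sync the public citation data to S3."""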
    addresses = AddressBook()
    megapixels = load_megapixels_queries()
    successful_geocodes = {}
    papers = []
    for row in megapixels:
        paper_data = process_paper(row, addresses, successful_geocodes)
        if paper_data is not None:
            papers.append(paper_data)
    write_papers_report('reports/report_index.html', 'All Papers', papers, 'title')
    write_papers_report('reports/report_coverage.html', 'Coverage', papers, 'citations_geocoded', reverse=True)
    # tally per-citation geocoding success across all papers
    paper_count = 0
    geocode_count = 0
    for key, value in successful_geocodes.items():
        if value:
            geocode_count += 1
        paper_count += 1
    print("citations: {}".format(paper_count))
    print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
    write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
    # publish the generated citation data to the public S3 bucket
    subprocess.call([
        "s3cmd", "put", "-P", "--recursive",
        DIR_PUBLIC_CITATIONS + '/',
        "s3://megapixels/v1/citations/",
    ])


def write_master_report(fn, papers):
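    """Write the public datasets.csv: one row per dataset key, combining
    the datasets Google Sheet with citation counts from the papers."""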
    # first make a lookup of the keys that have papers
    paper_key_lookup = {}
    for paper in papers:
        if paper['key'] not in paper_key_lookup:
            paper_key_lookup[paper['key']] = paper
    # then fetch the datasets csv which has things like "year"
    fields, rows = fetch_google_sheet('datasets')
    master_papers = []
    statistics = {}

    def clean(n):
        """Coerce a spreadsheet cell value to an int where possible."""
        if type(n) is int:
            return n
        if type(n) is str and n:
            s = str(n).replace(',', '').replace('.', '').replace('?', '').strip()
            try:
                return int(s)
            except ValueError:
                return s
        if n:
            return n
        return None
    for row in rows:
        key = row[0]
        if key not in paper_key_lookup:
            continue
        paper = paper_key_lookup[key]
        stats = {}
        for index, field in enumerate(fields):
            stats[field] = row[index]
        report_fn = '../site/content/datasets/{}/index.md'.format(key)
        has_report = os.path.exists(report_fn)
        statistics[key] = stats
        # search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
        image_count = stats['images']
        if type(image_count) is str:
            if len(image_count):
                image_count = clean(image_count)
            else:
                image_count = None
        master_papers.append([
            key,
            row[1],
            '/datasets/{}/'.format(key) if has_report else '',
            image_count,
            # clean(stats['faces_unique']) or None,
            # stats['year_published'],
            clean(paper['citation_count']) or 0,
            # clean(search_result['citationStats']['numKeyCitations']) or 0,
            # origin
        ])
    master_paper_keys = [
        'key',
        'title',
        'link',
        'images',
        # 'people',
        # 'year',
        'citations',
        # 'influenced',
        # 'origin'
    ]
    write_csv(fn, keys=master_paper_keys, rows=master_papers)


def write_papers_report(fn, title, papers, key, reverse=False):
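    """Write an HTML report of all papers, sorted by the given column key."""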
    sorted_papers = []
    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
        sorted_papers.append([
            paper['paperId'],
            paper['key'],
            paper['name'],
            LinkLine(paper['report_link'], paper['title']),
            LinkLine(paper['pdf_link'], '[pdf]'),
            paper['journal'],
            paper['address_type'],
            paper['address'],
            paper['country'],
            paper['lat'],
            paper['lng'],
            str(percent(paper['citations_geocoded'], paper['citation_count'])) + '%',
            paper['citation_count'],
            paper['citations_geocoded'],
            paper['citations_unknown'],
            paper['citations_empty'],
            paper['citations_pdf'],
            paper['citations_doi'],
        ])
    sorted_paper_keys = [
        'Paper ID',
        'Megapixels Key',
        'Megapixels Name',
        'Report Link',
        'PDF Link',
        'Journal',
        'Type',
        'Address',
        'Country',
        'Lat',
        'Lng',
        'Coverage',
        'Total Citations',
        'Geocoded Citations',
        'Unknown Citations',
        'Empty Citations',
        'With PDF',
        'With DOI',
    ]
    write_report(fn, title=title, keys=sorted_paper_keys, rows=sorted_papers)


def process_paper(row, addresses, success):
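    """Geocode the institutions behind one paper and each of its citing
    papers, write the per-paper HTML report, and return summary stats."""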
    res = {
        'paperId': '',
        'key': '',
        'title': '',
        'journal': '',
        'address': '',
        'country': '',
        'address_type': '',
        'lat': '',
        'lng': '',
        'pdf_link': '',
        'report_link': '',
        'citation_count': 0,
        'citations_geocoded': 0,
        'citations_unknown': 0,
        'citations_empty': 0,
        'citations_pdf': 0,
        'citations_doi': 0,
    }
    geocoded_citations = []
    unknown_citations = []
    display_geocoded_citations = []
    empty_citations = []
    pdf_count = 0
    doi_count = 0
    address_count = 0
    fn = file_path('papers', row['paper_id'], 'paper.json')
    if not os.path.exists(fn):
        print("not found: {}".format(fn))
        print(row)
        return
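    # load the cached Semantic Scholar (S2) record, which includes this
    # paper's citation list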
    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {} {}'.format(data['paperId'], row['key']))
    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
        return
    res['key'] = row['key']
    res['name'] = row['name']
    res['paperId'] = paper.paper_id
    res['title'] = paper.title
    res['journal'] = paper.journal
    res['report_link'] = 'papers/{}.html'.format(paper.paper_id)
    res['pdf_link'] = paper.pdf_link
    # res['authors'] = ', '.join(paper.authors)
    # res['citations'] = []
    paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        # print(inst[1])
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.find(institution)
    if paper_address:
        # print(paper_address)
        res['address'] = paper_address[0]
        res['lat'] = paper_address[3]
        res['lng'] = paper_address[4]
        res['address_type'] = paper_address[5]
        res['country'] = paper_address[7]
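    # examine every citing paper: count PDFs/DOIs and try to geocode each
    # citation's known institutions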
    for cite in data['citations']:
        citationId = cite['paperId']
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation is None or citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue
        institutions = load_institutions(citationId, paper_location_lookup)
        geocoded_institutions = []
        unknown_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            # print(inst[1])
            address_count += 1
            institution = inst[1]
            next_address = addresses.find(institution)
            if next_address:
                address = next_address
                geocoded_institutions.append(institution)
            else:
                unknown_institutions.append(institution)
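        # fall back to scanning the citation's PDF headings for an address
        # when none of its known institutions geocoded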
        if not address:
            if has_pdf:
                headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
                heading_string = '\n'.join(headings[0:20])
                found_addresses = []
                if len(headings):
                    for heading in headings:
                        l = heading.lower().strip()
                        if l:
                            next_address = addresses.find(l)
                            if next_address:
                                address = next_address
                                geocoded_institutions.append(heading)
                            else:
                                unknown_institutions.append(heading)
                else:
                    empty_citations.append([
                        citationId,
                        citation.title,
                    ])
        # res['citations'].append({
        #     'title': citation.title,
        #     'journal': citation.journal,
        #     'authors': citation.authors,
        #     'institutions': [inst[1] for inst in institutions],
        #     'geocoded': geocoded_institutions,
        # })
        if address:
            success[citationId] = True
            geocoded_citations.append([
                citation.title,
                institution,
            ] + address + [
                citation.year,
            ])
            display_geocoded_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
            ] + address[0:5])
        else:
            success[citationId] = False
            unknown_citations.append([
                citationId,
                LinkLine(citation.pdf_link, '[pdf]'),
                citation.title,
                '\n'.join(unknown_institutions),
            ])
    res['citation_count'] = len(data['citations'])
    res['citations_geocoded'] = len(geocoded_citations)
    res['citations_unknown'] = len(unknown_citations)
    res['citations_empty'] = len(empty_citations)
    res['citations_pdf'] = pdf_count
    res['citations_doi'] = doi_count
    total_citations = len(geocoded_citations) + len(unknown_citations)
    os.makedirs('reports/papers/', exist_ok=True)
    with open('reports/papers/{}.html'.format(paper.paper_id), 'w') as f:
        f.write("")
        f.write("")
        f.write("