import os
import re
import glob
import simplejson as json
import math
import operator
import click
import subprocess

from util import *

DIR_PUBLIC_CITATIONS = "../site/datasets/final"


@click.command()
def s2_final_report():
    # Build one citations JSON file per dataset for the public site
    addresses = AddressBook()
    megapixels = load_megapixels_lookup()
    ft_lookup = load_ft_lookup()
    for key, row in megapixels.items():
        print(key)
        # Only export datasets flagged for sharing in the spreadsheet
        if ft_lookup[key]:
            process_paper(row, addresses)


def process_paper(row, addresses):
    # Process every paper linked to this dataset, aggregating citations across them
    aggregate_citations = {}
    papers = []
    for paper_id in row['paper_ids']:
        res = process_single_paper(row, paper_id, addresses, aggregate_citations)
        if res:
            papers.append(res)
    if not papers:
        return
    # The first paper is the primary record; any others are listed separately
    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': papers[0]['paper_id'],
            'paper': papers[0],
            'address': papers[0]['address'],
            'additional_papers': papers[1:],
            'citations': list(aggregate_citations.values()),
        }, f)


def process_single_paper(row, paper_id, addresses, aggregate_citations):
    res = {
        'paper_id': '',
        'key': '',
        'title': '',
        # 'journal': '',
        'year': '',
        'pdf': '',
        'address': '',
        # 'citation_count': 0,
        # 'citations_geocoded': 0,
        # 'citations_unknown': 0,
        # 'citations_empty': 0,
        # 'citations_pdf': 0,
        # 'citations_doi': 0,
    }
    geocoded_citations = []
    unknown_citations = []
    empty_citations = []
    pdf_count = 0
    doi_count = 0
    address_count = 0

    fn = file_path('papers', paper_id, 'paper.json')
    with open(fn, 'r') as f:
        data = json.load(f)
    print('>> {} {}'.format(data['paperId'], row['key']))

    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res['key'] = row['key']
    res['name'] = row['name']
    res['paper_id'] = paper.paper_id
    res['title'] = paper.title
    # res['journal'] = paper.journal
    res['year'] = paper.year
    res['pdf'] = paper.pdf_link
    # res['authors'] = ', '.join(paper.authors)
    # res['citations'] = []

    # Geocode the paper's own institutions; keep the first address that resolves
    paper_institutions = load_institutions(paper.paper_id)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        # print(inst[1])
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.findObject(institution)
    if paper_address:
        # print(paper_address)
        res['address'] = paper_address

    # Geocode each citing paper, deduplicating across papers via aggregate_citations
    for cite in data['citations']:
        citationId = cite['paperId']
        if citationId in aggregate_citations:
            continue
        seen_here = {}
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        if has_pdf:
            pdf_count += 1
        if has_doi:
            doi_count += 1
        if citation is None or citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue

        # First try the institutions extracted from the PDF or DOI metadata
        institutions = load_institutions(citationId)
        geocoded_addresses = []
        geocoded_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            address_count += 1
            institution = inst[1]
            next_address = addresses.findObject(institution)
            if next_address and next_address['address'] not in seen_here:
                seen_here[next_address['address']] = True
                address = next_address
                geocoded_addresses.append(next_address)

        # Fall back to scanning the PDF's headings for an institution match
        if not address and has_pdf:
            headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
            heading_string = '\n'.join(headings[0:20])
            found_addresses = []
            for heading in headings:
                text = heading.lower().strip()
                if text:
                    next_address = addresses.findObject(text)
                    if next_address and next_address['address'] not in seen_here:
                        seen_here[next_address['address']] = True
                        address = next_address
                        geocoded_addresses.append(next_address)

        if address and citationId not in aggregate_citations:
            aggregate_citations[citationId] = {
                'id': citationId,
                'title': citation.title,
                'addresses': geocoded_addresses,
                'year': citation.year,
                'pdf': citation.pdf_link,
            }

    # res['citation_count'] = len(data['citations'])
    # res['citations_geocoded'] = len(geocoded_citations)
    # res['citations_unknown'] = len(unknown_citations)
    # res['citations_empty'] = len(empty_citations)
    # res['citations_pdf'] = pdf_count
    # res['citations_doi'] = doi_count
    return res


def load_ft_lookup():
    # Map dataset key -> whether the 'ft_share' column is set in the Google Sheet
    keys, rows = fetch_google_sheet('datasets')
    lookup = {}
    for row in rows:
        rec = {}
        for index, key in enumerate(keys):
            rec[key] = row[index]
        lookup[rec['key']] = rec['ft_share'] in ('1', 1)
    return lookup


def load_megapixels_lookup():
    # Map dataset key -> record carrying the list of associated paper ids
    keys, rows = read_csv('datasets/citation_lookup.csv')
    lookup = {}
    for row in rows:
        rec = {}
        for index, key in enumerate(keys):
            rec[key] = row[index]
        paper_key = rec['key']
        if paper_key not in lookup:
            rec['paper_ids'] = []
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
        # recs.append(rec)
    return lookup


def load_institutions(paperId):
    # Prefer institutions extracted from the PDF, then fall back to DOI metadata
    if os.path.exists(file_path('pdf', paperId, 'institutions.json')):
        return read_json(file_path('pdf', paperId, 'institutions.json'))['institutions']
    elif os.path.exists(file_path('doi', paperId, 'institutions.json')):
        return read_json(file_path('doi', paperId, 'institutions.json'))['institutions']
    else:
        return []


def data_path(key, paper_id):
    # Papers are sharded on disk by the first two characters of their id
    return 'datasets/s2/{}/{}/{}'.format(key, paper_id[0:2], paper_id)


def file_path(key, paper_id, fn):
    return os.path.join(data_path(key, paper_id), fn)


if __name__ == '__main__':
    s2_final_report()
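
# Usage sketch: run this module directly to regenerate the public citation files,
# e.g. `python s2_final_report.py` (the script's filename is an assumption here).
# One <dataset key>.json file is written per shared dataset into
# DIR_PUBLIC_CITATIONS ("../site/datasets/final"); that directory must already
# exist, since the output file is opened with a plain 'w'.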