import os
import re
import glob
import math
import operator
import subprocess

import simplejson as json
import click

from util import *

DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
DIR_FINAL_CITATIONS = "../site/datasets/final"
DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
DIR_VERIFIED_CITATIONS = "../site/datasets/verified"

addresses = AddressBook()
paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_id')


@click.command()
def s2_final_report():
    """Build citation reports for the shareable datasets and upload the verified reports to S3."""
    megapixels = load_megapixels_lookup()
    verified_lookup = fetch_verified_paper_lookup()
    items = []
    for key, item in megapixels.items():
        ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
        nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
        if ft_share or nyt_share:
            if key in verified_lookup:
                lookup = verified_lookup[key]
            else:
                lookup = {}
            items.append((item, lookup,))
    parallelize(process_paper, items)

    # dataset sheet columns, for reference:
    # key name_short name_full purpose url
    # wild indoor outdoor campus cyberspace parent
    # child source usernames names year_start year_end year_published
    # ongoing images videos identities img_per_person num_cameras
    # faces_persons female male landmarks width height color gray
    # derivative_of tags size_gb agreement
    # citations_count

    # subprocess.call([
    #     "s3cmd", "put", "-P", "--recursive",
    #     DIR_PUBLIC_CITATIONS + '/',
    #     "s3://megapixels/v1/citations/",
    # ])
    subprocess.call([
        "s3cmd", "put", "-P", "--recursive",
        DIR_VERIFIED_CITATIONS + '/',
        "s3://megapixels/v1/citations/verified/",
    ])


def process_paper(row, verified_lookup):
    """Process all papers for one dataset row and write its citation report files."""
    aggregate_citations = {}
    unknown_citations = {}
    address_list = []
    papers = []
    # print(row['paper_ids'])
    for paper_id in row['paper_ids']:
        res = process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations)
        if res:
            papers.append(res)
            if res['address']:
                address_list.append(res['address'])
    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)

    if not len(papers):
        return

    paper = papers[0]

    # final citations - a report of all geocoded citations
    with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': paper['paper_id'],
            'dataset': row['dataset'],
            'paper': paper,
            'addresses': address_list,
            'additional_papers': papers[1:],
            'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
        }, f)

    # unknown citations - a report of all non-geocoded citations
    with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': papers[0]['paper_id'],
            'citations': [unknown_citations[key] for key in unknown_citations.keys()],
        }, f)

    # "public" citations - initial citation reports digested by the geocoding frontend (bad name, I know)
    # this might not need to get built...
    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': paper['paper_id'],
            'paper': {
                'key': row['key'],
                'name': row['name'],
                'title': paper['title'],
                'year': paper['year'],
                'addresses': address_list,
            },
            'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
        }, f)

    # verified citations - the final public reports
    with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': paper['paper_id'],
            'paper': {
                'key': row['key'],
                'name': row['name'],
                'title': paper['title'],
                'year': paper['year'],
                'addresses': address_list,
            },
            'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
        }, f)
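# For reference, the approximate shape of one verified report written above
# (a sketch; the field values are placeholders, not taken from real data):
#
#   {
#     "id": "<paper_id>",
#     "paper": {"key": "<dataset key>", "name": "...", "title": "...",
#               "year": "...", "addresses": [...]},
#     "citations": [
#       {"id": "<citation id>", "title": "...", "addresses": [...],
#        "year": "...", "pdf": [...], "doi": [...]}
#     ]
#   }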
def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
    """Geocode one paper's citations, filling aggregate_citations (geocoded) and unknown_citations."""
    res = {
        'paper_id': '',
        'key': '',
        'title': '',
        # 'journal': '',
        'year': '',
        'pdf': '',
        'address': '',
        # 'citation_count': 0,
        # 'citations_geocoded': 0,
        # 'citations_unknown': 0,
        # 'citations_empty': 0,
        # 'citations_pdf': 0,
        # 'citations_doi': 0,
    }

    if paper_id == 'search':
        dataset = row['key']
        fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
        if not os.path.exists(fn):
            return
        with open(fn, 'r') as f:
            citations = json.load(f)
        # The search-results file is a plain list of paper ids and carries no paperId
        # of its own; fall back to the row's primary paper id so the shared code path
        # below can still resolve a paper (an assumption about the original intent).
        data = {
            'paperId': row['paper_ids'][0],
            'citations': [{'paperId': paperId} for paperId in citations],
        }
        print('>> {} {}'.format(data['paperId'], 'search results'))
    else:
        fn = file_path('papers', paper_id, 'paper.json')
        with open(fn, 'r') as f:
            data = json.load(f)
        print('>> {} {}'.format(data['paperId'], row['key']))

    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res['key'] = row['key']
    res['name'] = row['name']
    res['paper_id'] = paper.paper_id
    res['title'] = paper.title
    # res['journal'] = paper.journal
    res['year'] = paper.year
    res['pdf'] = paper.pdf_links()
    res['doi'] = paper.doi_links()
    # res['authors'] = ', '.join(paper.authors)
    # res['citations'] = []

    # geocode the paper itself via its institutions
    paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        # print(inst[1])
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.findObject(institution)
    if paper_address:
        # print(paper_address)
        res['address'] = paper_address

    # geocode each citing paper
    for cite in data['citations']:
        citationId = cite['paperId']
        if citationId in aggregate_citations:
            continue
        elif citationId in unknown_citations:
            continue
        seen_here = {}
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        # if has_pdf:
        #     pdf_count += 1
        # if has_doi:
        #     doi_count += 1
        if citation is None or citation.data is None:
            print("Citation missing! {}".format(citationId))
            continue

        institutions = load_institutions(citationId, paper_location_lookup)
        geocoded_addresses = []
        geocoded_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            # address_count += 1
            institution = inst[1]
            next_address = addresses.findObject(institution)
            if next_address and next_address['name'] not in seen_here:
                seen_here[next_address['name']] = True
                address = next_address
                geocoded_addresses.append(next_address)

        # fall back to scanning the extracted PDF headings when no institution geocoded
        if not address:
            if has_pdf:
                headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
                heading_string = '\n'.join(headings[0:20])
                found_addresses = []
                if len(headings):
                    for heading in headings:
                        possible_address = heading.lower().strip()
                        if possible_address:
                            next_address = addresses.findObject(possible_address)
                            if next_address and next_address['name'] not in seen_here:
                                seen_here[next_address['name']] = True
                                address = next_address
                                geocoded_addresses.append(next_address)

        if address:
            aggregate_citations[citationId] = {
                'id': citationId,
                'title': citation.title,
                'addresses': geocoded_addresses,
                'year': citation.year,
                'pdf': citation.pdf_links(),
                'doi': citation.doi_links(),
            }
        else:
            unknown_citations[citationId] = {
                'id': citationId,
                'title': citation.title,
                'year': citation.year,
                'pdf': citation.pdf_links(),
                'doi': citation.doi_links(),
            }

    return res
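# Note on the address objects used above: the results of addresses.findObject()
# are treated as dicts with at least a 'name' key (used for de-duplication via
# seen_here) and are embedded verbatim in the report JSON; their full schema
# lives in util.AddressBook and is assumed here, not defined in this module.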
def load_megapixels_lookup():
    keys, rows = fetch_google_sheet('citation_lookup')
    dataset_lookup = fetch_google_lookup('datasets')
    lookup = {}
    for row in rows:
        rec = {}
        for index, key in enumerate(keys):
            rec[key] = row[index]
        if rec['paper_id'] == "" or (rec['verified'] != 1 and rec['verified'] != '1'):
            continue
        paper_key = rec['key']
        if paper_key not in lookup:
            rec['paper_ids'] = []
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
        if paper_key in dataset_lookup:
            lookup[paper_key]['dataset'] = dataset_lookup[paper_key]
        else:
            print("not in datasets lookup:", paper_key)
            lookup[paper_key]['dataset'] = {}
    return lookup


if __name__ == '__main__':
    s2_final_report()
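# Usage sketch (assumes the util helpers -- AddressBook, fetch_google_sheet,
# fetch_google_lookup, load_paper, load_institutions, parallelize, etc. -- plus
# the Google Sheets credentials and s3cmd are configured; the filename below is
# hypothetical):
#
#   python s2_final_report.py
#
# The command writes per-dataset JSON reports into ../site/datasets/{final,unknown,
# citations,verified} and syncs the verified reports to
# s3://megapixels/v1/citations/verified/.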