from __future__ import with_statement

import calendar
import collections
import datetime
import functools
import itertools
import math
import psycopg2
import re
import sys

from writer import ConsoleWriter, HtmlWriter

# Populated from the database in __main__; defaults to an empty set so that
# profile classification degrades gracefully when used as a library.
Nickset = set()
SlowReqThreshold = 1000
WindowInterval = datetime.timedelta(minutes=5)
ConnStr = "dbname=dumpfm user=postgres password=root"


def fetch_nicks():
    try:
        print "fetching user nicknames from db"
        conn = psycopg2.connect(ConnStr)
        cur = conn.cursor()
        cur.execute("select nick from users")
        return set(n for (n,) in cur.fetchall())
    except Exception, e:
        print "Error fetching user nicknames: %s" % e
        print "Profile urls will be incorrectly classified as 'Other'"
        return set()


def comma_format(x):
    # http://code.activestate.com/recipes/498181-add-thousands-separator-commas-to-formatted-number/
    return re.sub(r'(\d{3})(?=\d)', r'\1,', str(x)[::-1])[::-1]


def format_unit(val, units, factors):
    for unit, factor in zip(units, factors):
        if factor in (0, 1):
            return "%s%s" % (val, unit)
        elif val >= factor:
            return "%0.2f%s" % (float(val) / factor, unit)
    raise ValueError('Cannot format %s with any of %s' % (val, units))


def format_time(t):
    return t.strftime('%Y/%m/%d %H:%M')


def format_ms(ms):
    return format_unit(ms, ['m', 's', 'ms'], [60 * 1000, 1000, 0])


def format_bytes(bytes):
    return format_unit(bytes, ['GB', 'MB', 'KB', 'B'],
                       [1024 ** i for i in range(3, -1, -1)])


def format_pct(n):
    return '%.02f%%' % (100 * n)


def median(pool):
    copy = sorted(pool)
    size = len(copy)
    if size % 2 == 1:
        return copy[(size - 1) / 2]
    else:
        # Float division so the median of two ints isn't truncated.
        return (copy[size / 2 - 1] + copy[size / 2]) / 2.0


def mean(pool):
    # Float division: response times are ints and would otherwise truncate.
    return sum(pool) / float(len(pool))


def percentile(N, percent, key=lambda x: x):
    """
    From http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
    Find the percentile of a list of values.

    @parameter N - a list of values. Note N MUST BE already sorted.
    @parameter percent - a float value from 0.0 to 1.0.
    @parameter key - optional key function to compute value from each element of N.
    @return - the percentile of the values
    """
    if not N:
        return None
    k = (len(N) - 1) * percent
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return key(N[int(k)])
    # Linear interpolation between the two neighboring values: the lower
    # value is weighted by (c - k) and the upper by (k - f), as in the
    # recipe cited above.
    d0 = key(N[int(f)]) * (c - k)
    d1 = key(N[int(c)]) * (k - f)
    return d0 + d1


def ninety_percentile(N):
    return percentile(N, .9)
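
# Illustrative examples for the helpers above (worked by hand, not executed
# at import time):
#
#   comma_format(1234567)  -> '1,234,567'
#   format_ms(2500)        -> '2.50s'
#   format_bytes(1536)     -> '1.50KB'
#   median([1, 2, 3, 4])   -> 2.5
#   percentile([10, 20, 30, 40], .9)
#       k = 2.7, so the result interpolates between N[2] and N[3]:
#       30 * 0.3 + 40 * 0.7 = 37.0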

def gather_by_key(rs, key):
    d = collections.defaultdict(list)
    for r in rs:
        group = r[key]
        d[group].append(r)
    return d


def group_by_time_interval(xs, interval, timekey):
    if len(xs) == 0:
        raise ValueError("Cannot group empty list")
    start = xs[0][timekey]
    end = xs[-1][timekey]
    # timedelta.seconds wraps at one day, so include the days component in
    # case the log spans more than 24 hours.
    interval_seconds = interval.days * 86400 + interval.seconds
    span_delta = end - start
    span = (span_delta.days * 86400 + span_delta.seconds) / interval_seconds + 1
    res = [[] for i in range(span)]
    for x in xs:
        delta = x[timekey] - start
        idx = (delta.days * 86400 + delta.seconds) / interval_seconds
        res[idx].append(x)
    return res


def get_report_title(entries):
    start_time = entries[0]['ts']
    end_time = entries[-1]['ts']
    return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time)


def build_js_array(pairs):
    s_list = []
    for ts, val in pairs:
        # Javascript timestamps are milliseconds since the epoch.
        unix_time = calendar.timegm(ts.timetuple()) * 1000
        s_list.append('[%s, %s]' % (unix_time, val))
    return '[%s]' % ','.join(s_list)
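
# build_js_array produces the nested-array literal the graphing code expects.
# With made-up values, two consecutive five-minute windows might render as:
#   '[[1278103200000, 3.5],[1278103500000, 4.1]]'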

def summarize_window(ts, window, slow_threshold):
    # Counters are floats so later per-second rates divide cleanly.
    reqs = 0.0
    dumps = 0.0
    images = 0.0
    tags = 0.0
    chat_users = 0.0
    slow = 0
    for r in window:
        reqs += 1
        group = r['urlgroup']
        if group in ('Images', 'Avatars'):
            images += 1
        elif group in ('Post', 'Upload'):
            dumps += 1
        elif group in ('TagAdd', 'TagRemove'):
            tags += 1
        elif group == 'ChatUpdate':
            chat_users += 1
        if r['time'] > slow_threshold:
            slow += 1
    # percentile() requires a pre-sorted list, so sort once here.
    times = sorted(r['time'] for r in window)
    return {
        'ts': ts,
        'reqs': reqs,
        'dumps': dumps,
        'images': images,
        'tags': tags,
        'chat_users': chat_users,
        'slow': slow,
        'mean_resp': mean(times),
        'median_resp': median(times),
        'ninety_resp': ninety_percentile(times),
        'max_resp': max(times),
    }


ReportSetLimit = 100


class DumpReport(object):
    def __init__(self, interval, slow_thresh=SlowReqThreshold):
        self.windows = []
        self.interval = interval
        self.bytes_sent = 0
        self.requests = 0
        self.dumps = 0
        self.slow_requests = 0
        self.new_users = 0
        self.unclassified_urls = set()
        self.error_urls = set()
        self.start_time = None
        self.end_time = None
        self.urlgroup_stats = {}
        self.slow_request_threshold = slow_thresh
        self.printed_dates = set()

    def record_window(self, ts, window):
        date_str = ts.strftime('%Y/%m/%d')
        if date_str not in self.printed_dates:
            print "processing %s" % date_str
            self.printed_dates.add(date_str)
        if len(window) > 0:
            if self.start_time is None:
                self.start_time = window[0]['ts']
            window_end_time = window[-1]['ts']
            if self.end_time:
                self.end_time = max(self.end_time, window_end_time)
            else:
                self.end_time = window_end_time
        # TODO: consolidate summary logic
        window_summary = summarize_window(ts, window, self.slow_request_threshold)
        self.windows.append(window_summary)
        for record in window:
            self.bytes_sent += record['total_bytes']
            self.requests += 1
            if record['urlgroup'] == 'Other' and len(self.unclassified_urls) < ReportSetLimit:
                self.unclassified_urls.add(record['url'])
            if record['urlgroup'] == 'Error' and len(self.error_urls) < ReportSetLimit:
                self.error_urls.add(record['url'])
            r_time = record['time']
            group = record['urlgroup']
            if group in ('Post', 'Upload'):
                self.dumps += 1
            elif group == 'Register':
                self.new_users += 1
            if group not in self.urlgroup_stats:
                stats = self.urlgroup_stats[group] = {
                    'min': r_time, 'max': r_time, 'time': 0,
                    'slow': 0, 'requests': 0, 'bytes': 0
                }
            else:
                stats = self.urlgroup_stats[group]
                stats['min'] = min(stats['min'], r_time)
                stats['max'] = max(stats['max'], r_time)
            stats['requests'] += 1
            stats['bytes'] += record['total_bytes']
            stats['time'] += r_time
            if r_time > self.slow_request_threshold:
                self.slow_requests += 1
                stats['slow'] += 1

    def _output_errors_and_unclassified(self, writer):
        if len(self.error_urls) > 0:
            writer.section('Error URLs (%s)' % len(self.error_urls))
            writer.paragraph(', '.join(sorted(self.error_urls)))
        if len(self.unclassified_urls) > 0:
            writer.section('Unclassified URLs (%s)' % len(self.unclassified_urls))
            writer.paragraph(', '.join(sorted(self.unclassified_urls)))

    def _output_urlgroups(self, writer):
        # One row per url group; the first column is the group name itself.
        headers = ['group', '#', 'min', 'mean', 'max', '# slow', '% slow',
                   'bytes', 'bytes / req']
        data = []
        for group in sorted(self.urlgroup_stats.keys()):
            stats = self.urlgroup_stats[group]
            avg_time = stats['time'] / stats['requests']
            data.append([group,
                         comma_format(stats['requests']),
                         format_ms(stats['min']),
                         format_ms(avg_time),
                         format_ms(stats['max']),
                         stats['slow'],
                         format_pct(float(stats['slow']) / stats['requests']),
                         format_bytes(stats['bytes']),
                         format_bytes(stats['bytes'] / stats['requests'])])
        writer.table(headers, data)

    def _output_usage_charts(self, writer):
        writer.section('Activity Rates')
        writer.graph('Reqs/sec', 'reqs_per_sec',
                     build_js_array((r['ts'], r['reqs'] / self.interval.seconds)
                                    for r in self.windows))
        writer.graph('Images/sec', 'images_per_sec',
                     build_js_array((r['ts'], r['images'] / self.interval.seconds)
                                    for r in self.windows))
        writer.graph('Dumps/sec', 'dumps_per_sec',
                     build_js_array((r['ts'], r['dumps'] / self.interval.seconds)
                                    for r in self.windows))
        writer.graph('Tags/sec', 'tags_per_sec',
                     build_js_array((r['ts'], r['tags'] / self.interval.seconds)
                                    for r in self.windows))
        writer.graph('Chat users/sec', 'users_per_sec',
                     build_js_array((r['ts'], r['chat_users'] / self.interval.seconds)
                                    for r in self.windows))
        writer.section('Response Rates')
        writer.graph('Mean response time', 'mean_resp',
                     build_js_array((r['ts'], r['mean_resp']) for r in self.windows))
        writer.graph('Median response time', 'median_resp',
                     build_js_array((r['ts'], r['median_resp']) for r in self.windows))
        writer.graph('90% response time', 'ninety_resp',
                     build_js_array((r['ts'], r['ninety_resp']) for r in self.windows))
        writer.graph('Max response time', 'max_resp',
                     build_js_array((r['ts'], r['max_resp']) for r in self.windows))

    def _output_report(self, writer):
        writer.section('Overview')
        duration = self.end_time - self.start_time
        # Include days: timedelta.seconds alone wraps at one day.
        duration_seconds = duration.days * 86400 + duration.seconds
        summary = [
            '%s EST to %s EST (%s)' % (format_time(self.start_time),
                                       format_time(self.end_time), duration),
            '%s requests (%.02f r/s)' % (comma_format(self.requests),
                                         float(self.requests) / duration_seconds),
            '%s slow requests (%s)' % (comma_format(self.slow_requests),
                                       format_pct(float(self.slow_requests) / self.requests)),
            '%s bytes sent' % format_bytes(self.bytes_sent)]
        writer.list(summary)
        self._output_usage_charts(writer)
        self._output_urlgroups(writer)
        self._output_errors_and_unclassified(writer)

    def write_report(self, path):
        writer = HtmlWriter(path, 'Dump Report')
        self._output_report(writer)
        writer.close()

    def print_report(self):
        writer = ConsoleWriter()
        self._output_report(writer)


def peek(iterable):
    try:
        first = iterable.next()
    except StopIteration:
        raise ValueError('cannot peek on empty iterator')
    return first, itertools.chain([first], iterable)


def yield_log_windows(raw_iter, interval, parse_func, timekey):
    # Parse lazily; drop lines that fail to parse (parse_func returns None).
    wrapped_iter = itertools.ifilter(None, itertools.imap(parse_func,
                                                          itertools.count(1),
                                                          raw_iter))
    first, wrapped_iter = peek(wrapped_iter)
    start_time = first[timekey]
    interval_seconds = interval.days * 86400 + interval.seconds

    def keyfunc(record):
        # Bucket index of the record relative to the first record; include
        # days since timedelta.seconds wraps at one day.
        delta = record[timekey] - start_time
        return (delta.days * 86400 + delta.seconds) / interval_seconds

    for i, g in itertools.groupby(wrapped_iter, keyfunc):
        yield start_time + (interval * i), list(g)
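
# yield_log_windows relies on the log being in timestamp order: groupby only
# merges *consecutive* records sharing a bucket index, so an out-of-order log
# would yield the same window index more than once. A sketch of the intended
# behavior, assuming three records at 0, 2, and 7 minutes past the start with
# the default 5-minute interval: the first two land in bucket 0 and the third
# in bucket 1, yielded as (start, [r0, r1]) then (start + 5min, [r2]).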

# Dump log parsing

def parse_time(s):
    # Parse times such as 02/Jul/2010:22:21:25 -0400. Only the first 17
    # characters are kept, which truncates the seconds and the timezone.
    return datetime.datetime.strptime(s[:17], '%d/%b/%Y:%H:%M')


InfoUrls = ['/about_us', '/privacy', '/help', '/search', '/terms', '/hall',
            '/register', '/reset-request']
StaticUrls = ['/favicon.ico', '/robots.txt', '/']


def _classify_profile_url(url, nickset):
    if url.startswith('/u/'):
        url = url[2:]
    # Drop empty path components ('/nick/' splits with empty strings).
    comps = [c for c in url.split('/') if c]
    if len(comps) == 0 or comps[0] not in nickset:
        return None
    if len(comps) == 1:
        return 'Profile'
    elif comps[1] == 'popular':
        return 'Popular'
    elif comps[1] == 'favorites':
        return 'Favorites'
    elif comps[1] == 'log':
        return 'Log'
    else:
        return 'Profile'


def classify_url(method, url):
    profile_classification = _classify_profile_url(url, Nickset)
    if profile_classification:
        return profile_classification
    elif url.startswith('/avatars/'):
        return 'Avatars'
    elif url.startswith('/refresh'):
        return 'ChatUpdate'
    elif url.startswith('/chat') or url.startswith('/browser'):
        return 'ChatLoad'
    elif url.startswith('/json/') and url.endswith('/favorites'):
        return 'JsonFavs'
    elif url.startswith('/log'):
        return 'RoomLog'
    elif url.startswith('/login') or url.startswith('/logout'):
        return 'Login'
    elif url in InfoUrls or url.startswith('/m/') or url.startswith('/directory'):
        return 'InfoPages'
    elif url == '/msg':
        return 'Post'
    elif url == '/submit-registration':
        return 'Register'
    elif url.startswith('/cmd/ghettosearch/'):
        return 'Search'
    elif url.startswith('/static/') or url in StaticUrls:
        return 'Static'
    elif url.startswith('/images/'):
        return 'Images'
    elif url == '/cmd/tag/add':
        return 'TagAdd'
    elif url == '/cmd/tag/rm':
        return 'TagRemove'
    elif url == '/upload/message':
        return 'Upload'
    elif url.startswith('/p/'):
        return 'ViewDump'
    else:
        return 'Other'


def parse_ms(ms):
    # Log times are in seconds; convert to integer milliseconds.
    return int(float(ms) * 1000)


def parse_request(s):
    method, url, httpversion = s.split(' ')
    url_group = classify_url(method, url)
    return {'method': method, 'httpversion': httpversion,
            'urlgroup': url_group, 'url': url}


def parse_status(s):
    code = int(s)
    # Overriding 'urlgroup' here works because statuses are parsed after
    # urls (see the field order in LineFormat below).
    if code == 302:
        return {'code': 302, 'urlgroup': 'Redirect'}
    elif code == 404:
        return {'code': 404, 'urlgroup': '404'}
    elif code >= 500:
        return {'code': code, 'urlgroup': 'Error'}
    else:
        return {'code': code}
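
# The access log is pipe-delimited, one request per line, with the ten fields
# listed in LineFormat below. A hypothetical line (field values invented for
# illustration) would look like:
#   1.2.3.4 | 02/Jul/2010:22:21:25 -0400 | GET /chat HTTP/1.1 | . | 200 | 0.042 | 1523 | 1100 | - | Mozilla/5.0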

LineFormat = [('addr', str),
              ('ts', parse_time),
              ('request', parse_request),
              ('pipe', str),
              ('status', parse_status),
              ('time', parse_ms),
              ('total_bytes', int),
              ('body_bytes', int),
              ('refer', str),
              ('agent', str)]


def parse_dump_log_entry(i, l, error_func=None):
    comps = [c.strip() for c in l.split('|')]
    if len(comps) != len(LineFormat):
        if error_func:
            error_func(i, l, "Invalid number of log components (%s instead of %s)"
                       % (len(comps), len(LineFormat)))
        return None
    r = {}
    for (label, f), v in zip(LineFormat, comps):
        try:
            res = f(v)
            # Parsers may return a dict of several fields (e.g. parse_request)
            # rather than a single value.
            if isinstance(res, dict):
                r.update(res)
            else:
                r[label] = res
        except Exception, e:
            if error_func:
                error_func(i, l, "error converting %s field '%s': %s" % (label, v, e))
            return None
    return r


def default_error_handler(i, line, err):
    # print "line %s: %s" % (i, err)
    pass


default_line_parser = functools.partial(parse_dump_log_entry,
                                        error_func=default_error_handler)


def analyze_dump_log(path, interval=WindowInterval,
                     line_parser=default_line_parser, timekey='ts'):
    report = DumpReport(interval)
    with open(path, 'r') as f:
        for ts, window in yield_log_windows(f, interval, line_parser, timekey):
            report.record_window(ts, window)
    return report


if __name__ == "__main__":
    if len(sys.argv) == 1:
        print "usage: logreport.py logfile [outfile]"
        sys.exit(1)
    logpath = sys.argv[1]
    outpath = sys.argv[2] if len(sys.argv) > 2 else None
    Nickset = fetch_nicks()
    report = analyze_dump_log(logpath)
    if outpath:
        report.write_report(outpath)
    else:
        report.print_report()
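
# Example invocations (file paths are placeholders):
#   python logreport.py access.log report.html   # write an HTML report
#   python logreport.py access.log               # print the report to stdout
# Or from another module, after populating Nickset:
#   report = analyze_dump_log('access.log'); report.print_report()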