From 88fb87e7c2d72db2c5f7b3e7a1ee41f021e5c40d Mon Sep 17 00:00:00 2001
From: Scott Ostler
Date: Thu, 8 Jul 2010 19:37:38 -0400
Subject: logreport.py rewritten to use generators

---
 scripts/logreport.py | 509 ++++++++++++++++++++++++---------------------------
 1 file changed, 238 insertions(+), 271 deletions(-)

diff --git a/scripts/logreport.py b/scripts/logreport.py
index 6f4799c..c2d7712 100644
--- a/scripts/logreport.py
+++ b/scripts/logreport.py
@@ -11,11 +11,13 @@ import psycopg2
 import sys
 import time
 
+from writer import ConsoleWriter, HtmlWriter
+
 locale.setlocale(locale.LC_ALL, "")
 
 Nickset = None
 SlowReqThreshold = 1000
-DefaultTimeBucket = datetime.timedelta(minutes=5)
+WindowInterval = datetime.timedelta(minutes=5)
 ConnStr = "dbname=dumpfm user=postgres password=root"
 
 def fetch_nicks():
@@ -52,6 +54,9 @@ def format_bytes(bytes):
                         ['GB', 'MB', 'KB', 'B'],
                         [1024 ** i for i in range(3, -1, -1)])
 
+def format_pct(n):
+    return '%.02f%%' % (100 * n)
+
 def median(pool):
     copy = sorted(pool)
     size = len(copy)
@@ -115,266 +120,223 @@ def get_report_title(entries):
     end_time = entries[-1]['ts']
     return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time)
 
-def build_js_array(start_time, interval, values):
-    pairs = []
-    for i, val in enumerate(values):
-        t = start_time + interval * i
-        unix_time = calendar.timegm(t.timetuple()) * 1000
-        pairs.append('[%s, %s]' % (unix_time, val))
-    return '[%s]' % ','.join(pairs)
+def build_js_array(pairs):
+    s_list = []
+    for ts, val in pairs:
+        unix_time = calendar.timegm(ts.timetuple()) * 1000
+        s_list.append('[%s, %s]' % (unix_time, val))
+    return '[%s]' % ','.join(s_list)
+
+def summarize_window(ts, window, slow_threshold):
+    reqs = 0.0
+    dumps = 0.0
+    images = 0.0
+    tags = 0.0
+    chat_users = 0.0
+    slow = 0
+
+    for r in window:
+        reqs += 1
+        group = r['urlgroup']
+        if group in ('Images', 'Avatars'):
+            images += 1
+        elif group in ('Post', 'Upload'):
+            dumps += 1
+        elif group in ('TagAdd', 'TagRemove'):
+            tags += 1
+        elif group == 'ChatUpdate':
+            chat_users += 1
+        if r['time'] > slow_threshold:
+            slow += 1
+
+    times = [r['time'] for r in window]
+    return { 'ts': ts,
+             'reqs': reqs,
+             'dumps': dumps,
+             'images': images,
+             'tags': tags,
+             'chat_users': chat_users,
+             'slow': slow,
+             'mean_resp': mean(times),
+             'median_resp': median(times),
+             'ninety_resp': ninety_percentile(times),
+             'max_resp': max(times)
+             }
+
+ReportSetLimit = 100
+
+class DumpReport(object):
+
+    def __init__(self, interval, slow_thresh=SlowReqThreshold):
+        self.windows = []
+        self.interval = interval
+        self.bytes_sent = 0
+        self.requests = 0
+        self.dumps = 0
+        self.slow_requests = 0
+        self.new_users = 0
+        self.unclassified_urls = set()
+        self.error_urls = set()
+        self.start_time = None
+        self.end_time = None
+        self.urlgroup_stats = {}
+        self.slow_request_threshold = slow_thresh
+
+    def record_window(self, ts, window):
+        if len(window) > 0:
+            if self.start_time is None:
+                self.start_time = window[0]['ts']
+            window_end_time = window[-1]['ts']
+            if self.end_time:
+                self.end_time = max(self.end_time, window_end_time)
+            else:
+                self.end_time = window_end_time
 
-SlowReportCount = 10
-IpReportCount = 10
+        # TODO: consolidate summary logic
+        window_summary = summarize_window(ts, window, self.slow_request_threshold)
+        self.windows.append(window_summary)
 
-class ConsoleWriter(object):
+        for record in window:
+            self.bytes_sent += record['total_bytes']
+            self.requests += 1
+
+            if record['urlgroup'] == 'Other' and len(self.unclassified_urls) < ReportSetLimit:
+                self.unclassified_urls.add(record['url'])
 
-    def section(self, label):
-        print "\n----- %s -----" % label
-
-    def paragraph(self, text):
-        print text
-
-    def list(self, entries):
-        for e in entries:
-            print "- %s" % e
-
-    def table(self, headers, data):
-        for row in data:
-            label = row[0]
-            if label:
-                print "\n%s:" % label
-            for h, d in zip(headers, row[1:]):
-                print "  - %s: %s" % (h, d)
-
-    def graph(self, *args, **kwds):
-        pass
-
-    def close(self):
-        pass
-
-HtmlTemplate = """
-<html>
-<head>
-<title>%s</title>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="jquery.flot.js"></script>
-</head>
-<body>
-%s
-</body></html>
-"""
-
-JqueryTemplate = """
-$(function () {
-    $.plot($("#%s"),
-           [ { data: %s, label: "%s" } ],
-           { xaxis: { mode: 'time' },
-             yaxis: { min: 0 },
-             legend: { position: 'sw' } });
-});
-
-"""
-
-class HtmlWriter(object):
-
-    def __init__(self, path, title):
-        self.path = path
-        self.title = title
-        self.content = ""
-
-    def section(self, label):
-        self.content += "<h2>%s</h2>\n" % label
-
-    def paragraph(self, text):
-        self.content += "<p>%s</p>\n" % text
-
-    def list(self, entries):
-        self.content += "<ul>"
-        for e in entries:
-            self.content += "<li>%s</li>" % e
-        self.content += "</ul>\n"
-
-    def table(self, headers, data):
-        self.content += "<table>"
-        for h in headers:
-            self.content += "<th>%s</th>" % h
-        self.content += "\n"
-        for row in data:
-            self.content += "<tr>"
+            if record['urlgroup'] == 'Error' and len(self.error_urls) < ReportSetLimit:
+                self.error_urls.add(record['url'])
+
+            r_time = record['time']
+            group = record['urlgroup']
 
-            for i, val in enumerate(row):
-                if i == 0:
-                    self.content += "<td><b>%s</b></td>" % val
-                else:
-                    self.content += "<td>%s</td>" % val
-            self.content += "</tr>\n"
-        self.content += "</table>\n"
-
-    def graph(self, label, graph_id, data):
-        self.content += '<div id="%s" style="width:600px;height:300px;"></div>' % graph_id
-        js_array = "var %s = %s;" % (graph_id, data)
-        graph_command = JqueryTemplate % (graph_id, graph_id, label)
-        self.content += '<script type="text/javascript">%s %s</script>' % (js_array, graph_command)
-
-    def _assemble(self):
-        return HtmlTemplate % (self.title or 'Untitled', self.content)
-
-    def close(self):
-        print "writing report to %s" % self.path
-        text = self._assemble()
-        with open(self.path, 'w') as f:
-            f.write(text)
-
+            if group in ('Post', 'Upload'):
+                self.dumps += 1
+            elif group == 'Register':
+                self.new_users += 1
+
+            if group not in self.urlgroup_stats:
+                stats = self.urlgroup_stats[group] = { 'min': r_time,
+                                                       'max': r_time,
+                                                       'time': 0,
+                                                       'slow': 0,
+                                                       'requests': 0,
+                                                       'bytes': 0 }
+            else:
+                stats = self.urlgroup_stats[group]
+                stats['min'] = min(stats['min'], r_time)
+                stats['max'] = max(stats['max'], r_time)
 
-class Report(object):
+            stats['requests'] += 1
+            stats['bytes'] += record['total_bytes']
+            stats['time'] += r_time
+            if r_time > self.slow_request_threshold:
+                self.slow_requests += 1
+                stats['slow'] += 1
 
-    def __init__(self, writer, good, bad):
-        self.writer = writer
-        self.good = good
-        self.bad = bad
-        self.count = len(good)
-        self.start = self.good[0]['ts']
-        self.end = self.good[-1]['ts']
-
-    def output_group_stats(self):
-        self.writer.section('URL group stats')
-        grouped = gather_by_key(self.good, 'urlgroup')
+    def _output_errors_and_unclassified(self, writer):
+        if len(self.error_urls) > 0:
+            writer.section('Error URLs (%s)' % len(self.error_urls))
+            writer.paragraph(', '.join(sorted(self.error_urls)))
 
-        headers = ['#', 'min', 'median', 'mean', '90%', 'max', '# slow', '% slow',
-                   'bytes sent', 'bytes/req']
-        data = []
+        if len(self.unclassified_urls) > 0:
+            writer.section('Unclassified URLs (%s)' % len(self.unclassified_urls))
+            writer.paragraph(', '.join(sorted(self.unclassified_urls)))
 
-        for url_group in sorted(grouped.keys()):
-            vals = sorted(r['time'] for r in grouped[url_group])
-            slow_count = sum(1 for v in vals if v > SlowReqThreshold)
-            bytes_sent = sum(r['total_bytes'] for r in grouped[url_group])
-            data.append([url_group,
-                         comma_format(len(vals)),
-                         format_ms(min(vals)),
-                         format_ms(median(vals)),
-                         format_ms(mean(vals)),
-                         format_ms(ninety_percentile(vals)),
-                         format_ms(max(vals)),
-                         slow_count,
-                         '%.01f%%' % (100 * slow_count / len(vals)),
-                         format_bytes(bytes_sent),
-                         format_bytes(bytes_sent / len(vals))])
-        self.writer.table(headers, data)
-
-    def output_activity_graphs(self, interval=DefaultTimeBucket):
-        self.writer.section('Activity Rates')
-        buckets = group_by_time_interval(self.good, interval, 'ts')
-        def second_avg(count): return float(count) / interval.seconds
-
-        def avg_bucket_rate(bs, cond):
-            return [second_avg(sum(1 for r in b if cond(r))) for b in bs]
-
-        reqs_per_sec = avg_bucket_rate(buckets, lambda r: True)
-        images_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Images', 'Avatars'))
-        dumps_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Post', 'Upload'))
-        tags_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('TagAdd', 'TagRemove'))
-        chat_users_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] == 'ChatUpdate')
-
-
-        self.writer.graph('Reqs / sec', 'reqs_per_sec',
-                          build_js_array(self.start, interval, reqs_per_sec))
-        self.writer.graph('Images / sec', 'images_per_sec',
-                          build_js_array(self.start, interval, images_per_sec))
-        self.writer.graph('Dumps / sec', 'dumps_per_sec',
-                          build_js_array(self.start, interval, dumps_per_sec))
-        self.writer.graph('Tags / sec', 'tags_per_sec',
-                          build_js_array(self.start, interval, tags_per_sec))
-        self.writer.graph('Chat users / sec', 'chat_users_per_sec',
-                          build_js_array(self.start, interval, chat_users_per_sec))
-
-        self.writer.section('Response Time')
+
+    def _output_urlgroups(self, writer):
+        headers = ['#', 'min', 'mean', 'max', '# slow', '% slow', 'bytes', 'bytes/req']
+        data = []
+        for group in sorted(self.urlgroup_stats.keys()):
+            stats = self.urlgroup_stats[group]
+            avg_time = stats['time'] / stats['requests']
+            data.append([group,
+                         comma_format(stats['requests']),
+                         format_ms(stats['min']),
+                         format_ms(avg_time),
+                         format_ms(stats['max']),
+                         stats['slow'],
+                         format_pct(float(stats['slow']) / stats['requests']),
+                         format_bytes(stats['bytes']),
+                         format_bytes(stats['bytes'] / stats['requests'])])
+        writer.table(headers, data)
 
-        def calc_bucket_resp_time(bs, func):
-            return [func([r['time'] for r in b]) for b in bs]
-
-        mean_resp_time = calc_bucket_resp_time(buckets, mean)
-        median_resp_time = calc_bucket_resp_time(buckets, median)
-        ninety_pct_time = calc_bucket_resp_time(buckets, ninety_percentile)
-        max_resp_time = calc_bucket_resp_time(buckets, max)
-
-        self.writer.graph('Mean response time (ms)', 'mean_resp_time',
-                          build_js_array(self.start, interval, mean_resp_time))
-        self.writer.graph('Median response time (ms)', 'median_per_sec',
-                          build_js_array(self.start, interval, median_resp_time))
-        self.writer.graph('90% response time (ms)', 'ninety_pct_time',
-                          build_js_array(self.start, interval, ninety_pct_time))
-        self.writer.graph('Max response time (ms)', 'max_resp_time',
-                          build_js_array(self.start, interval, max_resp_time))
-
-
-    def output_slow_requests(self):
-        self.writer.section('Slow URLs')
-        slow = sorted((r for r in self.good if r['time'] > SlowReqThreshold),
-                      key=lambda r: r['time'] * -1)
-        entries = []
-        for r in slow[:SlowReportCount]:
-            e = "%s: %s, %s sent" % (r['url'],
-                                     format_ms(r['time']),
-                                     format_bytes(r['total_bytes']))
-            entries.append(e)
-        self.writer.list(entries)
-
-    def output_unclassified_urls(self):
-        unclassified = set(g['url'] for g in self.good if g['urlgroup'] == 'Other')
-        unclassified_str = ', '.join(sorted(unclassified)[:100])
-        if len(unclassified) > 100:
-            unclassified_str += '...'
-        self.writer.section('Unclassified URLs (%s)' % len(unclassified))
-        self.writer.paragraph(unclassified_str)
-
-    def output_errors(self):
-        errors = set(g['url'] for g in self.good if g['urlgroup'] == 'Error')
-        error_str = ', '.join(sorted(errors)[:100])
-        self.writer.section('Error Urls (%s)' % len(errors))
-        self.writer.paragraph(error_str)
-
-    def output_ip_usage(self):
-        grouped = gather_by_key(self.good, 'addr')
-        sum_bytes = lambda reqs: sum(r['total_bytes'] for r in reqs)
-        addr_bytes_gen = reversed(sorted((sum_bytes(reqs), addr) for addr, reqs in grouped.iteritems()))
-        self.writer.section('Heaviest Users by IP')
+
+    def _output_usage_charts(self, writer):
+        writer.section('Activity Rates')
+        writer.graph('Reqs/sec', 'reqs_per_sec',
+                     build_js_array((r['ts'], r['reqs'] / self.interval.seconds) for r in self.windows))
+        writer.graph('Images/sec', 'images_per_sec',
+                     build_js_array((r['ts'], r['images'] / self.interval.seconds) for r in self.windows))
+        writer.graph('Dumps/sec', 'dumps_per_sec',
+                     build_js_array((r['ts'], r['dumps'] / self.interval.seconds) for r in self.windows))
+        writer.graph('Tags/sec', 'tags_per_sec',
+                     build_js_array((r['ts'], r['tags'] / self.interval.seconds) for r in self.windows))
+        writer.graph('Chat users/sec', 'users_per_sec',
+                     build_js_array((r['ts'], r['chat_users'] / self.interval.seconds) for r in self.windows))
+
+        writer.section('Response Rates')
+        writer.graph('Mean response time', 'mean_resp',
+                     build_js_array((r['ts'], r['mean_resp']) for r in self.windows))
+        writer.graph('Median response time', 'median_resp',
+                     build_js_array((r['ts'], r['median_resp']) for r in self.windows))
+        writer.graph('90% response time', 'ninety_resp',
+                     build_js_array((r['ts'], r['ninety_resp']) for r in self.windows))
+        writer.graph('Max response time', 'max_resp',
+                     build_js_array((r['ts'], r['max_resp']) for r in self.windows))
+
+
+    def _output_report(self, writer):
+        writer.section('Overview')
+        duration = self.end_time - self.start_time
+        summary = [
+            '%s EST to %s EST (%s)' % (format_time(self.start_time),
+                                       format_time(self.end_time),
+                                       duration),
+            '%s requests (%.02f r/s)' % (comma_format(self.requests),
+                                         float(self.requests) / duration.seconds),
+            '%s slow requests (%s)' % (self.slow_requests,
+                                       format_pct(float(self.slow_requests) / self.requests)),
+            '%s bytes sent' % format_bytes(self.bytes_sent)]
+        writer.list(summary)
+        self._output_usage_charts(writer)
+        self._output_urlgroups(writer)
+        self._output_errors_and_unclassified(writer)
+
+    def write_report(self, path):
+        writer = HtmlWriter(path, 'Dump Report')
+        self._output_report(writer)
+        writer.close()
 
-        l = []
-        for count, addr in itertools.islice(addr_bytes_gen, IpReportCount):
-            bytes = sum(r['total_bytes'] for r in grouped[addr])
-            l.append("%s: %s reqs, %s sent" % (addr,
-                                               comma_format(count),
-                                               format_bytes(bytes)))
-        self.writer.list(l)
-
-    def output_report(self):
-        l = ["Interval: %s EST to %s EST (%s)" % (format_time(self.start),
-                                                  format_time(self.end),
-                                                  self.end - self.start),
-             "%s Total Requests (%.02f r/s)" \
-                 % (comma_format(self.count), self.count / (self.end - self.start).seconds),
-             "Bytes sent %s" % format_bytes(sum(r['total_bytes'] for r in self.good))]
-        self.writer.section('Overview')
-        self.writer.list(l)
-        self.output_activity_graphs()
-        self.output_group_stats()
-        self.output_slow_requests()
-        self.output_ip_usage()
-        self.output_errors()
-        self.output_unclassified_urls()
+    def print_report(self):
+        writer = ConsoleWriter()
+        self._output_report(writer)
+
+def peek(iterable):
+    first = None
+    try:
+        first = iterable.next()
+    except StopIteration:
+        raise ValueError('cannot peek on empty iterator')
+    return first, itertools.chain([first], iterable)
+
+
+def yield_log_windows(raw_iter, interval, parse_func, timekey):
+    wrapped_iter = itertools.ifilter(None, itertools.imap(parse_func, itertools.count(1), raw_iter))
+    first, wrapped_iter = peek(wrapped_iter)
+
+    start_time = first[timekey]
+
+    def keyfunc(record):
+        return int((record[timekey] - start_time).seconds / interval.seconds)
+
+    for i, g in itertools.groupby(wrapped_iter, keyfunc):
+        yield start_time + (interval * i), list(g)
 
+# Dump log parsing
+
 def parse_time(s):
     # Parse times such as 02/Jul/2010:22:21:25 -0400
     return datetime.datetime.strptime(s[:17], '%d/%b/%Y:%H:%M')
@@ -475,11 +437,13 @@ LineFormat = [('addr', str),
               ('refer', str),
               ('agent', str)]
 
-def parse_line(l):
+def parse_dump_log_entry(i, l, error_func=None):
     comps = [c.strip() for c in l.split('|')]
     if len(comps) != len(LineFormat):
-        raise ValueError("Invalid number of log components (%s instead of %s): %s"
-                         % (len(comps), len(LineFormat), l))
+        if error_func:
+            error_func(i, l, "Invalid number of log components (%s instead of %s)" \
+                       % (len(comps), len(LineFormat)))
+        return None
     r = {}
     for (label, f), v in zip(LineFormat, comps):
         try:
@@ -489,36 +453,39 @@ def parse_line(l):
             else:
                 r[label] = f(v)
         except Exception, e:
-            return "Unable to convert %s field '%s': %s (%s)" % (label, v, e, l.strip())
+            if error_func:
+                error_func(i, l, "error converting %s field '%s': %s" % (label, v, e))
+            return None
     return r
 
-def parse_log(path):
-    print 'reading %s' % path
-    lines = [parse_line(l) for l in open(path, 'r')]
-    good_lines = [l for l in lines if isinstance(l, dict)]
-    bad_lines = [l for l in lines if not isinstance(l, dict)]
-    return good_lines, bad_lines
+def default_error_handler(i, line, err):
+    # print "line %s: %s" % (i, err)
+    pass
 
+default_line_parser = functools.partial(parse_dump_log_entry,
+                                        error_func=default_error_handler)
+
+def analyze_dump_log(path,
+                     interval=WindowInterval,
+                     line_parser=default_line_parser,
+                     timekey='ts'):
+    report = DumpReport(interval)
+    with open(path, 'r') as f:
+        for ts, window in yield_log_windows(f, interval, line_parser, timekey):
+            report.record_window(ts, window)
+    return report
 
 if __name__ == "__main__":
-    if len(sys.argv) < 1:
+    if len(sys.argv) == 1:
        print "usage: logreport.py logfile [outfile]"
        sys.exit(1)
 
     logpath = sys.argv[1]
-    outpath = sys.argv[2] if len(sys.argv) > 2 else None
+    outpath = sys.argv[2] if len(sys.argv) > 2 else None
 
     Nickset = fetch_nicks()
-    good_lines, bad_lines = parse_log(logpath)
-    if len(good_lines) == 0:
-        print "Log file is empty or cannot be read!"
-        sys.exit(1)
+    report = analyze_dump_log(logpath)
 
     if outpath:
-        report_title = get_report_title(good_lines)
-        writer = HtmlWriter(outpath, report_title)
+        report.write_report(outpath)
     else:
-        writer = ConsoleWriter()
-
-    report = Report(writer, good_lines, bad_lines)
-    report.output_report()
-    writer.close()
--
cgit v1.2.3-70-g09d2
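
Notes on the rewrite

The core change is yield_log_windows: log lines are parsed lazily
(itertools.imap + ifilter) and the resulting record stream is sliced into
fixed-width time windows with itertools.groupby, so a report is built in one
pass without holding the whole log in memory. A minimal, self-contained
sketch of the same peek/groupby idea (the sample records and five-minute
interval are made up, and this sketch lets StopIteration escape on empty
input where the patch raises ValueError):

    import datetime
    import itertools

    def peek(iterable):
        # Pull off the first item, then rebuild an equivalent iterator.
        first = iterable.next()
        return first, itertools.chain([first], iterable)

    def windows(records, interval):
        first, records = peek(records)
        start = first['ts']

        def bucket(r):
            # Window index relative to the first record. Like the patch, this
            # uses timedelta.seconds, which wraps after 24 hours: fine for a
            # daily log, wrong for a multi-day one.
            return int((r['ts'] - start).seconds / interval.seconds)

        for i, group in itertools.groupby(records, bucket):
            yield start + interval * i, list(group)

    base = datetime.datetime(2010, 7, 2, 22, 0)
    recs = ({'ts': base + datetime.timedelta(minutes=m)} for m in (0, 1, 7, 12))
    for ts, w in windows(recs, datetime.timedelta(minutes=5)):
        print ts, len(w)
    # 2010-07-02 22:00:00 2
    # 2010-07-02 22:05:00 1
    # 2010-07-02 22:10:00 1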
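
DumpReport talks to its writer through a small duck-typed interface, which is
what lets ConsoleWriter and HtmlWriter move out to the writer module. A sketch
of the implied contract (NullWriter is an illustrative name, not something in
the patch):

    class NullWriter(object):
        # The six methods the report code calls on a writer.
        def section(self, label):                # start a named report section
            pass
        def paragraph(self, text):               # free-form text
            pass
        def list(self, entries):                 # sequence of display strings
            pass
        def table(self, headers, data):          # each row: [label, col1, ...]
            pass
        def graph(self, label, graph_id, data):  # data: flot-ready JS array string
            pass
        def close(self):                         # flush output, if any
            pass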
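
build_js_array produces the input flot expects for a time-mode x axis:
Javascript timestamps, i.e. Unix epoch milliseconds. A worked example using
the function as it appears in the patch (the timestamp and value are made up):

    import calendar
    import datetime

    def build_js_array(pairs):
        s_list = []
        for ts, val in pairs:
            # timegm treats the tuple as UTC; * 1000 converts seconds to ms.
            unix_time = calendar.timegm(ts.timetuple()) * 1000
            s_list.append('[%s, %s]' % (unix_time, val))
        return '[%s]' % ','.join(s_list)

    print build_js_array([(datetime.datetime(2010, 7, 8, 0, 0), 3.5)])
    # [[1278547200000, 3.5]]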
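
analyze_dump_log is the new programmatic entry point: it streams the file,
feeds each window to a DumpReport, and returns the report, mirroring what
__main__ now does. Typical use (the log path is hypothetical):

    report = analyze_dump_log('/var/log/dump/access.log')
    report.print_report()               # console output via ConsoleWriter
    report.write_report('report.html')  # HTML report with flot graphs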