Initial log analyzer

author: Scott Ostler <scottbot9000@gmail.com> 2010-07-04 17:51:48 -0400
committer: Scott Ostler <scottbot9000@gmail.com> 2010-07-04 17:51:48 -0400
commit: cb0e566d9cccadc4e19ef654f366e0159688242b (patch)
tree: a03d8f2ddc69ed28337595b15cf52901fc0bd5d8 /scripts
parent: 41caa07a16baf32ed9f82bc28b4c3a63825ab472 (diff)
1 files changed, 512 insertions, 0 deletions
diff --git a/scripts/logreport.py b/scripts/logreport.py
new file mode 100644
index 0000000..3977105
--- /dev/null
+++ b/scripts/logreport.py
@@ -0,0 +1,512 @@
+from __future__ import with_statement
+import calendar
+import collections
+import datetime
+import functools
+import itertools
+import locale
+import math
+import operator
+import psycopg2
+import sys
+import time
+
+locale.setlocale(locale.LC_ALL, "")
+
+Nickset = None
+SlowReqThreshold = 1000
+DefaultTimeBucket = datetime.timedelta(minutes=5)
+ConnStr = "dbname=dumpfm user=postgres password=root"
+
+def fetch_nicks():
+    try:
+        print "fetching user nicknames from db"
+        conn = psycopg2.connect(ConnStr)
+        cur = conn.cursor()
+        cur.execute("select nick from users")
+        return set(n for (n,) in cur.fetchall())
+    except Exception, e:
+        print "Error fetching user nicknames: %s" % e
+        print "Profile urls will be incorrectly classified as 'Other'"
+        return set()
+
+def comma_format(i):
+    return locale.format('%d', i, True)
+
+def format_unit(val, units, factors):
+    for unit, factor in zip(units, factors):
+        if factor in (0, 1):
+            return "%s%s" % (val, unit)
+        elif val >= factor:
+            return "%0.2f%s" % (float(val) / factor, unit)
+    raise ValueError('Cannot format %s with any of %s' % (val, unit))
+
+def format_time(t):
+    return t.strftime('%Y/%m/%d %H:%M')
+
+def format_ms(ms):
+    return format_unit(ms, ['m', 's', 'ms'], [60 * 1000, 1000, 0])
+
+def format_bytes(bytes):
+    return format_unit(bytes,
+                       ['GB', 'MB', 'KB', 'B'],
+                       [1024 ** i for i in range(3, -1, -1)])
+
+def median(pool):
+    copy = sorted(pool)
+    size = len(copy)
+    if size % 2 == 1:
+        return copy[(size - 1) / 2]
+    else:
+        return (copy[size/2 - 1] + copy[size/2]) / 2
+
+def mean(pool):
+    return sum(pool) / len(pool)
+    
+
+def percentile(N, percent, key=lambda x:x):
+    """
+    From http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
+    Find the percentile of a list of values.
+
+    @parameter N - is a list of values. Note N MUST BE already sorted.
+    @parameter percent - a float value from 0.0 to 1.0.
+    @parameter key - optional key function to compute value from each element of N.
+
+    @return - the percentile of the values
+    """
+    if not N:
+        return None
+    k = (len(N)-1) * percent
+    f = math.floor(k)
+    c = math.ceil(k)
+    if f == c:
+        return key(N[int(k)])
+    d0 = key(N[int(f)]) * (k-f)
+    d1 = key(N[int(c)]) * (c-k)
+    return d0+d1
+
+def ninety_percentile(N):
+    return percentile(N, .9)
+
+def gather_by_key(rs, key):
+    d = collections.defaultdict(list)
+    for r in rs:
+        group = r[key]
+        d[group].append(r)
+    return d
+
+def group_by_time_interval(xs, interval, timekey):
+    if len(xs) == 0:
+        raise ValueError("Cannot group empty list")
+    start = xs[0][timekey]
+    end = xs[-1][timekey]
+    interval_seconds = int(interval.seconds)
+    span = int((end - start).seconds) / interval_seconds + 1
+    res = [[] for i in range(span)]
+    
+    for x in xs:
+        idx = int((x[timekey] - start).seconds) / interval_seconds
+        res[idx].append(x)
+    return res
+
+def get_report_title(entries):
+    start_time = entries[0]['ts']
+    end_time = entries[-1]['ts']
+    return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time)
+
+def build_js_array(start_time, interval, values):
+    pairs = []
+    for i, val in enumerate(values):
+        t = start_time + interval * i
+        unix_time = calendar.timegm(t.timetuple()) * 1000
+        pairs.append('[%s, %s]' % (unix_time, val))
+    return '[%s]' % ','.join(pairs)
+
+SlowReportCount = 10
+IpReportCount = 10
+
+class ConsoleWriter(object):
+            
+    def section(self, label):
+        print "\n----- %s -----" % label
+    
+    def paragraph(self, text):
+        print text
+
+    def list(self, entries):
+        for e in entries:
+            print "- %s" % e
+
+    def table(self, headers, data):
+        for row in data:
+            label = row[0]
+            if label:
+                print "\n%s:" % label
+            for h, d in zip(headers, row[1:]):
+                print "  - %s: %s" % (h, d)
+
+    def graph(self, *args, **kwds):
+        pass
+
+    def close(self):
+        pass
+
+HtmlTemplate = """
+<html>
+<head>
+<title>%s</title>
+<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>
+<script type="text/javascript" src="http://dump.fm/static/js/jquery.flot.js"></script>
+<style>
+table {
+    margin: 0;
+    cellpadding: 1px;
+}
+
+th { text-align: left; }
+
+tr:hover { background-color: #AFF; }
+
+</style>
+</head>
+<body>%s</body>
+</html>
+"""
+
+JqueryTemplate = """
+$(function () {
+    $.plot($("#%s"),
+           [ { data: %s, label: "%s" } ],
+           { xaxis: { mode: 'time' },
+             yaxis: { min: 0 },
+             legend: { position: 'sw' } });
+});
+
+"""
+
+class HtmlWriter(object):
+    
+    def __init__(self, path, title):
+        self.path = path
+        self.title = title
+        self.content = ""
+        
+    def section(self, label):
+        self.content += "<h2>%s</h2>\n" % label
+
+    def paragraph(self, text):
+        self.content += "<p>%s</p>\n" % text
+
+    def list(self, entries):
+        self.content += "<ul>\n"
+        for e in entries:
+            self.content += "<li>%s</li>\n" % e
+        self.content += "</ul>\n"
+
+    def table(self, headers, data):
+        self.content += "<table><tr><th></th>"
+        for h in headers:
+            self.content += "<th>%s</th>" % h
+        self.content += "</tr>\n"
+        for row in data:
+            self.content += "<tr>"
+            
+            for i, val in enumerate(row):
+                if i == 0:
+                    self.content += "<th>%s</th>" % val
+                else:
+                    self.content += "<td>%s</td>" % val
+            self.content += "</tr>\n"
+        self.content += "</table>\n"
+
+    def graph(self, label, graph_id, data):
+        self.content += '<div id="%s" style="width: 100%%; height: 350px;"></div>' % graph_id
+        js_array = "var %s = %s;" % (graph_id, data)
+        graph_command = JqueryTemplate % (graph_id, graph_id, label)
+        self.content += '<script>%s\n%s</script>' % (js_array, graph_command)
+        
+    def _assemble(self):
+        return HtmlTemplate % (self.title or 'Untitled', self.content)
+
+    def close(self):
+        print "writing report to %s" % self.path
+        text = self._assemble()
+        with open(self.path, 'w') as f:
+            f.write(text)
+        
+
+class Report(object):
+
+    def __init__(self, writer, good, bad):
+        self.writer = writer
+        self.good = good
+        self.bad = bad
+        self.count = len(good)
+        self.start = self.good[0]['ts']
+        self.end = self.good[-1]['ts']
+    
+    def output_group_stats(self):
+        self.writer.section('URL group stats')
+        grouped = gather_by_key(self.good, 'urlgroup')
+
+        headers = ['#', 'min', 'median', 'mean', '90%', 'max', '# slow', 'bytes sent', 'bytes/req']
+        data = []
+
+        for url_group in sorted(grouped.keys()):
+            vals = sorted(r['time'] for r in grouped[url_group])
+            slow_count = sum(1 for v in vals if v > SlowReqThreshold)
+            bytes_sent = sum(r['total_bytes'] for r in grouped[url_group])
+            data.append([url_group,
+                         comma_format(len(vals)),
+                         format_ms(min(vals)),
+                         format_ms(median(vals)),
+                         format_ms(mean(vals)),
+                         format_ms(ninety_percentile(vals)),
+                         format_ms(max(vals)),
+                         slow_count,
+                         format_bytes(bytes_sent),
+                         format_bytes(bytes_sent / len(vals))])
+        self.writer.table(headers, data)
+        
+    def output_activity_graphs(self, interval=DefaultTimeBucket):
+        self.writer.section('Activity Rates')
+        buckets = group_by_time_interval(self.good, interval, 'ts')
+
+        def second_avg(count): return float(count) / interval.seconds
+        
+        def avg_bucket_rate(bs, cond):
+            return [second_avg(sum(1 for r in b if cond(r))) for b in bs]
+
+        reqs_per_sec = avg_bucket_rate(buckets, lambda r: True)
+        images_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Images', 'Avatars'))
+        dumps_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Post', 'Upload'))
+        chat_users_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] == 'ChatUpdate')
+
+
+        self.writer.graph('Reqs / sec', 'reqs_per_sec',
+                          build_js_array(self.start, interval, reqs_per_sec))
+        self.writer.graph('Images / sec', 'images_per_sec',
+                          build_js_array(self.start, interval, images_per_sec))
+        self.writer.graph('Dumps / sec', 'dumps_per_sec',
+                          build_js_array(self.start, interval, dumps_per_sec))
+        self.writer.graph('Chat users / sec', 'chat_users_per_sec',
+                          build_js_array(self.start, interval, chat_users_per_sec))
+
+        self.writer.section('Response Time')
+        
+        def calc_bucket_resp_time(bs, func):
+            return [func([r['time'] for r in b]) for b in bs]
+
+        mean_resp_time = calc_bucket_resp_time(buckets, mean)
+        median_resp_time = calc_bucket_resp_time(buckets, median)
+        ninety_pct_time = calc_bucket_resp_time(buckets, ninety_percentile)
+        max_resp_time = calc_bucket_resp_time(buckets, max)
+
+        self.writer.graph('Mean response time (ms)', 'mean_resp_time',
+                          build_js_array(self.start, interval, mean_resp_time))
+        self.writer.graph('Median response time (ms)', 'median_per_sec',
+                          build_js_array(self.start, interval, median_resp_time))
+        self.writer.graph('90% response time (ms)', 'ninety_pct_time',
+                          build_js_array(self.start, interval, ninety_pct_time))
+        self.writer.graph('Max response time (ms)', 'max_resp_time',
+                          build_js_array(self.start, interval, max_resp_time))
+                          
+
+    def output_slow_requests(self):
+        self.writer.section('Slow URLs')
+        slow = sorted((r for r in self.good if r['time'] > SlowReqThreshold),
+                      key=lambda r: r['time'] * -1)
+        entries = []
+        for r in slow[:SlowReportCount]:
+            e = "%s: %s, %s sent" % (r['url'],
+                                     format_ms(r['time']),
+                                     format_bytes(r['total_bytes']))
+            entries.append(e)
+        self.writer.list(entries)
+
+    def output_unclassified_urls(self):
+        unclassified = set(g['url'] for g in self.good if g['urlgroup'] == 'Other')
+        unclassified_str = ', '.join(sorted(unclassified)[:100])
+        if len(unclassified) > 100:
+            unclassified_str += '...'
+        self.writer.section('Unclassified URLs (%s)' % len(unclassified))
+        self.writer.paragraph(unclassified_str)
+
+    def output_ip_usage(self):
+        grouped = gather_by_key(self.good, 'addr')
+        sum_bytes = lambda reqs: sum(r['total_bytes'] for r in reqs)
+        addr_bytes_gen = reversed(sorted((sum_bytes(reqs), addr) for addr, reqs in grouped.iteritems()))
+        self.writer.section('Heaviest Users by IP')
+        
+        l = []
+        for count, addr in itertools.islice(addr_bytes_gen, IpReportCount):
+            bytes = sum(r['total_bytes'] for r in grouped[addr])
+            l.append("%s: %s reqs, %s sent" % (addr,
+                                               comma_format(count),
+                                               format_bytes(bytes)))
+        self.writer.list(l)
+
+    def output_report(self):
+        l = ["Interval: %s EST to %s EST (%s)" % (format_time(self.start),
+                                          format_time(self.end),
+                                          self.end - self.start),
+             "%s Total Requests (%.02f r/s)" \
+                 % (comma_format(self.count), self.count / (self.end - self.start).seconds),
+             "Bytes sent %s" % format_bytes(sum(r['total_bytes'] for r in self.good))]
+        self.writer.section('Overview')
+        self.writer.list(l)
+        self.output_activity_graphs()
+        self.output_group_stats()
+        self.output_slow_requests()
+        self.output_ip_usage()
+        self.output_unclassified_urls()
+
+
+def parse_time(s):
+    # Parse times such as 02/Jul/2010:22:21:25 -0400
+    return datetime.datetime.strptime(s[:17], '%d/%b/%Y:%H:%M')
+    
+InfoUrls = ['/about_us', '/privacy', '/help', '/search', '/privacy', '/terms', '/hall',
+            '/register', '/reset-request']
+StaticUrls = ['/favicon.ico', '/robots.txt', '/']
+
+def _classify_profile_url(url, nickset):
+    if url.startswith('/u/'):
+        url = url[2:]
+    comps = url.split('/')
+    comps.remove('')
+    if len(comps) == 0 or comps[0] not in nickset:
+        return None
+    else:
+        if len(comps) == 1:
+            return 'Profile'
+        elif comps[1] == 'popular':
+            return 'Popular'
+        elif comps[1] == 'favorites':
+            return 'Favorites'
+        elif comps[1] == 'log':
+            return 'Log'
+        else:
+            return 'Profile'
+
+def classify_url(method, url):
+    profile_classification = _classify_profile_url(url, Nickset)
+    if profile_classification:
+        return profile_classification
+    elif url.startswith('/avatars/'):
+        return 'Avatars'
+    elif url.startswith('/refresh'):
+        return 'ChatUpdate'
+    elif url.startswith('/chat') or url.startswith('/browser'):
+        return 'ChatLoad'
+    elif url.startswith('/json/') and url.endswith('/favorites'):
+        return 'JsonFavs'
+    elif url.startswith('/log'):
+        return 'RoomLog'
+    elif url.startswith('/login') or url.startswith('/logout'):
+        return 'Login'
+    elif url in InfoUrls or url.startswith('/m/') or url.startswith('/directory'):
+        return 'InfoPages'
+    elif url == '/msg':
+        return 'Post'
+    elif url == '/submit-registration':
+        return 'Register'
+    elif url.startswith('/cmd/ghettosearch/'):
+        return 'Search'
+    elif url.startswith('/static/') or url in StaticUrls:
+        return 'Static'
+    elif url.startswith('/images/'):
+        return 'Images'
+    elif url == '/cmd/tag/add':
+        return 'TagAdd'
+    elif url == '/cmd/tag/rm':
+        return 'TagRemove'
+    elif url == '/upload/message':
+        return 'Upload'
+    elif url.startswith('/p/'):
+        return 'ViewDump'
+    else:
+        return 'Other'
+
+def parse_ms(ms):
+    return int(float(ms) * 1000)
+
+def parse_request(s):
+    method, url, httpversion = s.split(' ')
+    url_group = classify_url(method, url)
+    return {'method': method,
+            'httpversion': httpversion,
+            'urlgroup': url_group,
+            'url': url }
+
+def parse_status(s):
+    code = int(s)
+    # This works because statuses are parsed after urls.
+    if code == 302:
+        return {'code': 302, 'urlgroup': 'Redirect'}
+    elif code == 404:
+        return {'code': 404, 'urlgroup': '404'}
+    elif code >= 500:
+        return {'code': code, 'urlgroup': 'Error'}
+    else:
+        return code
+
+LineFormat = [('addr', str),
+              ('ts', parse_time),
+              ('request', parse_request),
+              ('pipe', str),
+              ('status', parse_status),
+              ('time', parse_ms),
+              ('total_bytes', int),
+              ('body_bytes', int),
+              ('refer', str),
+              ('agent', str)]
+
+def parse_line(l):
+    comps = [c.strip() for c in l.split('|')]
+    if len(comps) != len(LineFormat):
+        raise ValueError("Invalid number of log components (%s instead of %s): %s"
+                         % (len(comps), len(LineFormat), l))
+    r = {}
+    for (label, f), v in zip(LineFormat, comps):
+        try:
+            res = f(v)
+            if isinstance(res, dict):
+                r.update(res)
+            else:
+                r[label] = f(v)
+        except Exception, e:
+            return "Unable to convert %s field '%s': %s (%s)" % (label, v, e, l.strip())
+    return r
+
+def parse_log(path):
+    print 'reading %s' % path
+    lines = [parse_line(l) for l in open(path, 'r')]
+    good_lines = [l for l in lines if isinstance(l, dict)]
+    bad_lines = [l for l in lines if not isinstance(l, dict)]
+    return good_lines, bad_lines
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 1:
+        print "usage: logreport.py logfile [outfile]"
+        sys.exit(1)
+    logpath = sys.argv[1]
+    outpath = sys.argv[2] if len(sys.argv) > 2 else None        
+
+    Nickset = fetch_nicks()
+    good_lines, bad_lines = parse_log(logpath)
+    if len(good_lines) == 0:
+        print "Log file is empty or cannot be read!"
+        sys.exit(1)
+
+    if outpath:
+        report_title = get_report_title(good_lines)
+        writer = HtmlWriter(outpath, report_title)
+    else:
+        writer = ConsoleWriter()
+
+    report = Report(writer, good_lines, bad_lines)
+    report.output_report()
+    writer.close()
author	Scott Ostler <scottbot9000@gmail.com>	2010-07-04 17:51:48 -0400
committer	Scott Ostler <scottbot9000@gmail.com>	2010-07-04 17:51:48 -0400
commit	cb0e566d9cccadc4e19ef654f366e0159688242b (patch)
tree	a03d8f2ddc69ed28337595b15cf52901fc0bd5d8 /scripts
parent	41caa07a16baf32ed9f82bc28b4c3a63825ab472 (diff)