diff options
| author | Scott Ostler <scottbot9000@gmail.com> | 2010-07-04 17:51:48 -0400 |
|---|---|---|
| committer | Scott Ostler <scottbot9000@gmail.com> | 2010-07-04 17:51:48 -0400 |
| commit | cb0e566d9cccadc4e19ef654f366e0159688242b (patch) | |
| tree | a03d8f2ddc69ed28337595b15cf52901fc0bd5d8 /scripts | |
| parent | 41caa07a16baf32ed9f82bc28b4c3a63825ab472 (diff) | |
Initial log analyzer
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/logreport.py | 512 |
1 files changed, 512 insertions, 0 deletions
diff --git a/scripts/logreport.py b/scripts/logreport.py new file mode 100644 index 0000000..3977105 --- /dev/null +++ b/scripts/logreport.py @@ -0,0 +1,512 @@ +from __future__ import with_statement +import calendar +import collections +import datetime +import functools +import itertools +import locale +import math +import operator +import psycopg2 +import sys +import time + +locale.setlocale(locale.LC_ALL, "") + +Nickset = None +SlowReqThreshold = 1000 +DefaultTimeBucket = datetime.timedelta(minutes=5) +ConnStr = "dbname=dumpfm user=postgres password=root" + +def fetch_nicks(): + try: + print "fetching user nicknames from db" + conn = psycopg2.connect(ConnStr) + cur = conn.cursor() + cur.execute("select nick from users") + return set(n for (n,) in cur.fetchall()) + except Exception, e: + print "Error fetching user nicknames: %s" % e + print "Profile urls will be incorrectly classified as 'Other'" + return set() + +def comma_format(i): + return locale.format('%d', i, True) + +def format_unit(val, units, factors): + for unit, factor in zip(units, factors): + if factor in (0, 1): + return "%s%s" % (val, unit) + elif val >= factor: + return "%0.2f%s" % (float(val) / factor, unit) + raise ValueError('Cannot format %s with any of %s' % (val, unit)) + +def format_time(t): + return t.strftime('%Y/%m/%d %H:%M') + +def format_ms(ms): + return format_unit(ms, ['m', 's', 'ms'], [60 * 1000, 1000, 0]) + +def format_bytes(bytes): + return format_unit(bytes, + ['GB', 'MB', 'KB', 'B'], + [1024 ** i for i in range(3, -1, -1)]) + +def median(pool): + copy = sorted(pool) + size = len(copy) + if size % 2 == 1: + return copy[(size - 1) / 2] + else: + return (copy[size/2 - 1] + copy[size/2]) / 2 + +def mean(pool): + return sum(pool) / len(pool) + + +def percentile(N, percent, key=lambda x:x): + """ + From http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/ + Find the percentile of a list of values. + + @parameter N - is a list of values. Note N MUST BE already sorted. + @parameter percent - a float value from 0.0 to 1.0. + @parameter key - optional key function to compute value from each element of N. + + @return - the percentile of the values + """ + if not N: + return None + k = (len(N)-1) * percent + f = math.floor(k) + c = math.ceil(k) + if f == c: + return key(N[int(k)]) + d0 = key(N[int(f)]) * (k-f) + d1 = key(N[int(c)]) * (c-k) + return d0+d1 + +def ninety_percentile(N): + return percentile(N, .9) + +def gather_by_key(rs, key): + d = collections.defaultdict(list) + for r in rs: + group = r[key] + d[group].append(r) + return d + +def group_by_time_interval(xs, interval, timekey): + if len(xs) == 0: + raise ValueError("Cannot group empty list") + start = xs[0][timekey] + end = xs[-1][timekey] + interval_seconds = int(interval.seconds) + span = int((end - start).seconds) / interval_seconds + 1 + res = [[] for i in range(span)] + + for x in xs: + idx = int((x[timekey] - start).seconds) / interval_seconds + res[idx].append(x) + return res + +def get_report_title(entries): + start_time = entries[0]['ts'] + end_time = entries[-1]['ts'] + return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time) + +def build_js_array(start_time, interval, values): + pairs = [] + for i, val in enumerate(values): + t = start_time + interval * i + unix_time = calendar.timegm(t.timetuple()) * 1000 + pairs.append('[%s, %s]' % (unix_time, val)) + return '[%s]' % ','.join(pairs) + +SlowReportCount = 10 +IpReportCount = 10 + +class ConsoleWriter(object): + + def section(self, label): + print "\n----- %s -----" % label + + def paragraph(self, text): + print text + + def list(self, entries): + for e in entries: + print "- %s" % e + + def table(self, headers, data): + for row in data: + label = row[0] + if label: + print "\n%s:" % label + for h, d in zip(headers, row[1:]): + print " - %s: %s" % (h, d) + + def graph(self, *args, **kwds): + pass + + def close(self): + pass + +HtmlTemplate = """ +<html> +<head> +<title>%s</title> +<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script> +<script type="text/javascript" src="http://dump.fm/static/js/jquery.flot.js"></script> +<style> +table { + margin: 0; + cellpadding: 1px; +} + +th { text-align: left; } + +tr:hover { background-color: #AFF; } + +</style> +</head> +<body>%s</body> +</html> +""" + +JqueryTemplate = """ +$(function () { + $.plot($("#%s"), + [ { data: %s, label: "%s" } ], + { xaxis: { mode: 'time' }, + yaxis: { min: 0 }, + legend: { position: 'sw' } }); +}); + +""" + +class HtmlWriter(object): + + def __init__(self, path, title): + self.path = path + self.title = title + self.content = "" + + def section(self, label): + self.content += "<h2>%s</h2>\n" % label + + def paragraph(self, text): + self.content += "<p>%s</p>\n" % text + + def list(self, entries): + self.content += "<ul>\n" + for e in entries: + self.content += "<li>%s</li>\n" % e + self.content += "</ul>\n" + + def table(self, headers, data): + self.content += "<table><tr><th></th>" + for h in headers: + self.content += "<th>%s</th>" % h + self.content += "</tr>\n" + for row in data: + self.content += "<tr>" + + for i, val in enumerate(row): + if i == 0: + self.content += "<th>%s</th>" % val + else: + self.content += "<td>%s</td>" % val + self.content += "</tr>\n" + self.content += "</table>\n" + + def graph(self, label, graph_id, data): + self.content += '<div id="%s" style="width: 100%%; height: 350px;"></div>' % graph_id + js_array = "var %s = %s;" % (graph_id, data) + graph_command = JqueryTemplate % (graph_id, graph_id, label) + self.content += '<script>%s\n%s</script>' % (js_array, graph_command) + + def _assemble(self): + return HtmlTemplate % (self.title or 'Untitled', self.content) + + def close(self): + print "writing report to %s" % self.path + text = self._assemble() + with open(self.path, 'w') as f: + f.write(text) + + +class Report(object): + + def __init__(self, writer, good, bad): + self.writer = writer + self.good = good + self.bad = bad + self.count = len(good) + self.start = self.good[0]['ts'] + self.end = self.good[-1]['ts'] + + def output_group_stats(self): + self.writer.section('URL group stats') + grouped = gather_by_key(self.good, 'urlgroup') + + headers = ['#', 'min', 'median', 'mean', '90%', 'max', '# slow', 'bytes sent', 'bytes/req'] + data = [] + + for url_group in sorted(grouped.keys()): + vals = sorted(r['time'] for r in grouped[url_group]) + slow_count = sum(1 for v in vals if v > SlowReqThreshold) + bytes_sent = sum(r['total_bytes'] for r in grouped[url_group]) + data.append([url_group, + comma_format(len(vals)), + format_ms(min(vals)), + format_ms(median(vals)), + format_ms(mean(vals)), + format_ms(ninety_percentile(vals)), + format_ms(max(vals)), + slow_count, + format_bytes(bytes_sent), + format_bytes(bytes_sent / len(vals))]) + self.writer.table(headers, data) + + def output_activity_graphs(self, interval=DefaultTimeBucket): + self.writer.section('Activity Rates') + buckets = group_by_time_interval(self.good, interval, 'ts') + + def second_avg(count): return float(count) / interval.seconds + + def avg_bucket_rate(bs, cond): + return [second_avg(sum(1 for r in b if cond(r))) for b in bs] + + reqs_per_sec = avg_bucket_rate(buckets, lambda r: True) + images_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Images', 'Avatars')) + dumps_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Post', 'Upload')) + chat_users_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] == 'ChatUpdate') + + + self.writer.graph('Reqs / sec', 'reqs_per_sec', + build_js_array(self.start, interval, reqs_per_sec)) + self.writer.graph('Images / sec', 'images_per_sec', + build_js_array(self.start, interval, images_per_sec)) + self.writer.graph('Dumps / sec', 'dumps_per_sec', + build_js_array(self.start, interval, dumps_per_sec)) + self.writer.graph('Chat users / sec', 'chat_users_per_sec', + build_js_array(self.start, interval, chat_users_per_sec)) + + self.writer.section('Response Time') + + def calc_bucket_resp_time(bs, func): + return [func([r['time'] for r in b]) for b in bs] + + mean_resp_time = calc_bucket_resp_time(buckets, mean) + median_resp_time = calc_bucket_resp_time(buckets, median) + ninety_pct_time = calc_bucket_resp_time(buckets, ninety_percentile) + max_resp_time = calc_bucket_resp_time(buckets, max) + + self.writer.graph('Mean response time (ms)', 'mean_resp_time', + build_js_array(self.start, interval, mean_resp_time)) + self.writer.graph('Median response time (ms)', 'median_per_sec', + build_js_array(self.start, interval, median_resp_time)) + self.writer.graph('90% response time (ms)', 'ninety_pct_time', + build_js_array(self.start, interval, ninety_pct_time)) + self.writer.graph('Max response time (ms)', 'max_resp_time', + build_js_array(self.start, interval, max_resp_time)) + + + def output_slow_requests(self): + self.writer.section('Slow URLs') + slow = sorted((r for r in self.good if r['time'] > SlowReqThreshold), + key=lambda r: r['time'] * -1) + entries = [] + for r in slow[:SlowReportCount]: + e = "%s: %s, %s sent" % (r['url'], + format_ms(r['time']), + format_bytes(r['total_bytes'])) + entries.append(e) + self.writer.list(entries) + + def output_unclassified_urls(self): + unclassified = set(g['url'] for g in self.good if g['urlgroup'] == 'Other') + unclassified_str = ', '.join(sorted(unclassified)[:100]) + if len(unclassified) > 100: + unclassified_str += '...' + self.writer.section('Unclassified URLs (%s)' % len(unclassified)) + self.writer.paragraph(unclassified_str) + + def output_ip_usage(self): + grouped = gather_by_key(self.good, 'addr') + sum_bytes = lambda reqs: sum(r['total_bytes'] for r in reqs) + addr_bytes_gen = reversed(sorted((sum_bytes(reqs), addr) for addr, reqs in grouped.iteritems())) + self.writer.section('Heaviest Users by IP') + + l = [] + for count, addr in itertools.islice(addr_bytes_gen, IpReportCount): + bytes = sum(r['total_bytes'] for r in grouped[addr]) + l.append("%s: %s reqs, %s sent" % (addr, + comma_format(count), + format_bytes(bytes))) + self.writer.list(l) + + def output_report(self): + l = ["Interval: %s EST to %s EST (%s)" % (format_time(self.start), + format_time(self.end), + self.end - self.start), + "%s Total Requests (%.02f r/s)" \ + % (comma_format(self.count), self.count / (self.end - self.start).seconds), + "Bytes sent %s" % format_bytes(sum(r['total_bytes'] for r in self.good))] + self.writer.section('Overview') + self.writer.list(l) + self.output_activity_graphs() + self.output_group_stats() + self.output_slow_requests() + self.output_ip_usage() + self.output_unclassified_urls() + + +def parse_time(s): + # Parse times such as 02/Jul/2010:22:21:25 -0400 + return datetime.datetime.strptime(s[:17], '%d/%b/%Y:%H:%M') + +InfoUrls = ['/about_us', '/privacy', '/help', '/search', '/privacy', '/terms', '/hall', + '/register', '/reset-request'] +StaticUrls = ['/favicon.ico', '/robots.txt', '/'] + +def _classify_profile_url(url, nickset): + if url.startswith('/u/'): + url = url[2:] + comps = url.split('/') + comps.remove('') + if len(comps) == 0 or comps[0] not in nickset: + return None + else: + if len(comps) == 1: + return 'Profile' + elif comps[1] == 'popular': + return 'Popular' + elif comps[1] == 'favorites': + return 'Favorites' + elif comps[1] == 'log': + return 'Log' + else: + return 'Profile' + +def classify_url(method, url): + profile_classification = _classify_profile_url(url, Nickset) + if profile_classification: + return profile_classification + elif url.startswith('/avatars/'): + return 'Avatars' + elif url.startswith('/refresh'): + return 'ChatUpdate' + elif url.startswith('/chat') or url.startswith('/browser'): + return 'ChatLoad' + elif url.startswith('/json/') and url.endswith('/favorites'): + return 'JsonFavs' + elif url.startswith('/log'): + return 'RoomLog' + elif url.startswith('/login') or url.startswith('/logout'): + return 'Login' + elif url in InfoUrls or url.startswith('/m/') or url.startswith('/directory'): + return 'InfoPages' + elif url == '/msg': + return 'Post' + elif url == '/submit-registration': + return 'Register' + elif url.startswith('/cmd/ghettosearch/'): + return 'Search' + elif url.startswith('/static/') or url in StaticUrls: + return 'Static' + elif url.startswith('/images/'): + return 'Images' + elif url == '/cmd/tag/add': + return 'TagAdd' + elif url == '/cmd/tag/rm': + return 'TagRemove' + elif url == '/upload/message': + return 'Upload' + elif url.startswith('/p/'): + return 'ViewDump' + else: + return 'Other' + +def parse_ms(ms): + return int(float(ms) * 1000) + +def parse_request(s): + method, url, httpversion = s.split(' ') + url_group = classify_url(method, url) + return {'method': method, + 'httpversion': httpversion, + 'urlgroup': url_group, + 'url': url } + +def parse_status(s): + code = int(s) + # This works because statuses are parsed after urls. + if code == 302: + return {'code': 302, 'urlgroup': 'Redirect'} + elif code == 404: + return {'code': 404, 'urlgroup': '404'} + elif code >= 500: + return {'code': code, 'urlgroup': 'Error'} + else: + return code + +LineFormat = [('addr', str), + ('ts', parse_time), + ('request', parse_request), + ('pipe', str), + ('status', parse_status), + ('time', parse_ms), + ('total_bytes', int), + ('body_bytes', int), + ('refer', str), + ('agent', str)] + +def parse_line(l): + comps = [c.strip() for c in l.split('|')] + if len(comps) != len(LineFormat): + raise ValueError("Invalid number of log components (%s instead of %s): %s" + % (len(comps), len(LineFormat), l)) + r = {} + for (label, f), v in zip(LineFormat, comps): + try: + res = f(v) + if isinstance(res, dict): + r.update(res) + else: + r[label] = f(v) + except Exception, e: + return "Unable to convert %s field '%s': %s (%s)" % (label, v, e, l.strip()) + return r + +def parse_log(path): + print 'reading %s' % path + lines = [parse_line(l) for l in open(path, 'r')] + good_lines = [l for l in lines if isinstance(l, dict)] + bad_lines = [l for l in lines if not isinstance(l, dict)] + return good_lines, bad_lines + + +if __name__ == "__main__": + if len(sys.argv) < 1: + print "usage: logreport.py logfile [outfile]" + sys.exit(1) + logpath = sys.argv[1] + outpath = sys.argv[2] if len(sys.argv) > 2 else None + + Nickset = fetch_nicks() + good_lines, bad_lines = parse_log(logpath) + if len(good_lines) == 0: + print "Log file is empty or cannot be read!" + sys.exit(1) + + if outpath: + report_title = get_report_title(good_lines) + writer = HtmlWriter(outpath, report_title) + else: + writer = ConsoleWriter() + + report = Report(writer, good_lines, bad_lines) + report.output_report() + writer.close() |
