from __future__ import with_statement

import calendar
import collections
import datetime
import itertools
import locale
import math
import sys

import psycopg2

locale.setlocale(locale.LC_ALL, "")

Nickset = None
SlowReqThreshold = 1000  # milliseconds
DefaultTimeBucket = datetime.timedelta(minutes=5)
ConnStr = "dbname=dumpfm user=postgres password=root"


def fetch_nicks():
    try:
        print "fetching user nicknames from db"
        conn = psycopg2.connect(ConnStr)
        cur = conn.cursor()
        cur.execute("select nick from users")
        return set(n for (n,) in cur.fetchall())
    except Exception, e:
        print "Error fetching user nicknames: %s" % e
        print "Profile urls will be incorrectly classified as 'Other'"
        return set()


def comma_format(i):
    return locale.format('%d', i, True)


def format_unit(val, units, factors):
    for unit, factor in zip(units, factors):
        if factor in (0, 1):
            return "%s%s" % (val, unit)
        elif val >= factor:
            return "%0.2f%s" % (float(val) / factor, unit)
    raise ValueError('Cannot format %s with any of %s' % (val, units))


def format_time(t):
    return t.strftime('%Y/%m/%d %H:%M')


def format_ms(ms):
    return format_unit(ms, ['m', 's', 'ms'], [60 * 1000, 1000, 0])


def format_bytes(bytes):
    return format_unit(bytes, ['GB', 'MB', 'KB', 'B'],
                       [1024 ** i for i in range(3, -1, -1)])


def median(pool):
    copy = sorted(pool)
    size = len(copy)
    if size % 2 == 1:
        return copy[(size - 1) / 2]
    else:
        return (copy[size / 2 - 1] + copy[size / 2]) / 2.0


def mean(pool):
    return float(sum(pool)) / len(pool)
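
# Illustrative only -- hand-checked examples of the formatting helpers:
def _formatter_examples():
    assert format_ms(432) == '432ms'       # below 1s falls through to the 'ms' unit
    assert format_ms(61000) == '1.02m'     # 61000 / 60000 minutes, two decimals
    assert format_bytes(1536) == '1.50KB'  # 1536 / 1024 KB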
def percentile(N, percent, key=lambda x: x):
    """
    From http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/

    Find the percentile of a list of values.

    @parameter N - is a list of values. Note N MUST BE already sorted.
    @parameter percent - a float value from 0.0 to 1.0.
    @parameter key - optional key function to compute value from each element of N.

    @return - the percentile of the values
    """
    if not N:
        return None
    k = (len(N) - 1) * percent
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return key(N[int(k)])
    # Linearly interpolate between the two bracketing values.
    d0 = key(N[int(f)]) * (c - k)
    d1 = key(N[int(c)]) * (k - f)
    return d0 + d1


def ninety_percentile(N):
    return percentile(N, .9)


def gather_by_key(rs, key):
    d = collections.defaultdict(list)
    for r in rs:
        group = r[key]
        d[group].append(r)
    return d


def total_seconds(td):
    # timedelta.total_seconds() is not available before Python 2.7; note that
    # td.seconds alone discards whole days.
    return td.days * 86400 + td.seconds


def group_by_time_interval(xs, interval, timekey):
    if len(xs) == 0:
        raise ValueError("Cannot group empty list")
    start = xs[0][timekey]
    end = xs[-1][timekey]
    interval_seconds = total_seconds(interval)
    span = total_seconds(end - start) / interval_seconds + 1
    res = [[] for i in range(span)]
    for x in xs:
        idx = total_seconds(x[timekey] - start) / interval_seconds
        res[idx].append(x)
    return res


def get_report_title(entries):
    start_time = entries[0]['ts']
    end_time = entries[-1]['ts']
    return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time)


def build_js_array(start_time, interval, values):
    # Flot wants [unix-milliseconds, value] pairs.
    pairs = []
    for i, val in enumerate(values):
        t = start_time + interval * i
        unix_time = calendar.timegm(t.timetuple()) * 1000
        pairs.append('[%s, %s]' % (unix_time, val))
    return '[%s]' % ','.join(pairs)
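
# Illustrative only: two five-minute buckets starting 2010-07-02 00:00 UTC
# serialize as '[[1278028800000, 1],[1278029100000, 2]]':
#
#   build_js_array(datetime.datetime(2010, 7, 2), DefaultTimeBucket, [1, 2])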
SlowReportCount = 10
IpReportCount = 10


class ConsoleWriter(object):
    def section(self, label):
        print "\n----- %s -----" % label

    def paragraph(self, text):
        print text

    def list(self, entries):
        for e in entries:
            print "- %s" % e

    def table(self, headers, data):
        for row in data:
            label = row[0]
            if label:
                print "\n%s:" % label
            for h, d in zip(headers, row[1:]):
                print "  - %s: %s" % (h, d)

    def graph(self, *args, **kwds):
        pass

    def close(self):
        pass


# Page shell for the HTML report; adjust the jQuery/Flot script paths to
# wherever those files are served from.
HtmlTemplate = """<html>
<head>
<title>%s</title>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="jquery.flot.js"></script>
</head>
<body>
%s
</body>
</html>"""

JqueryTemplate = """
$(function () {
    $.plot($("#%s"), [ { data: %s, label: "%s" } ],
           { xaxis: { mode: 'time' },
             yaxis: { min: 0 },
             legend: { position: 'sw' } });
});
"""


class HtmlWriter(object):
    def __init__(self, path, title):
        self.path = path
        self.title = title
        self.content = ""

    def section(self, label):
        self.content += "<h2>%s</h2>\n" % label

    def paragraph(self, text):
        self.content += "<p>%s</p>\n" % text

    def list(self, entries):
        self.content += "<ul>"
        for e in entries:
            self.content += "<li>%s</li>" % e
        self.content += "</ul>\n"

    def table(self, headers, data):
        # Leading empty header cell lines up with the row-label column.
        self.content += "<table><tr><th></th>"
        for h in headers:
            self.content += "<th>%s</th>" % h
        self.content += "</tr>\n"
        for row in data:
            self.content += "<tr>"
            for i, val in enumerate(row):
                if i == 0:
                    self.content += "<th>%s</th>" % val
                else:
                    self.content += "<td>%s</td>" % val
            self.content += "</tr>\n"
        self.content += "</table>\n"

    def graph(self, label, graph_id, data):
        # Flot needs an explicitly sized container to draw into.
        self.content += '<div id="%s" style="width:600px;height:300px;"></div>' % graph_id
        js_array = "var %s = %s;" % (graph_id, data)
        graph_command = JqueryTemplate % (graph_id, graph_id, label)
        self.content += '<script type="text/javascript">%s %s</script>' % (js_array, graph_command)

    def _assemble(self):
        return HtmlTemplate % (self.title or 'Untitled', self.content)

    def close(self):
        print "writing report to %s" % self.path
        text = self._assemble()
        with open(self.path, 'w') as f:
            f.write(text)
class Report(object):
    def __init__(self, writer, good, bad):
        self.writer = writer
        self.good = good
        self.bad = bad
        self.count = len(good)
        self.start = self.good[0]['ts']
        self.end = self.good[-1]['ts']

    def output_group_stats(self):
        self.writer.section('URL group stats')
        grouped = gather_by_key(self.good, 'urlgroup')
        headers = ['#', 'min', 'median', 'mean', '90%', 'max',
                   '# slow', '% slow', 'bytes sent', 'bytes/req']
        data = []
        for url_group in sorted(grouped.keys()):
            vals = sorted(r['time'] for r in grouped[url_group])
            slow_count = sum(1 for v in vals if v > SlowReqThreshold)
            bytes_sent = sum(r['total_bytes'] for r in grouped[url_group])
            data.append([url_group,
                         comma_format(len(vals)),
                         format_ms(min(vals)),
                         format_ms(int(median(vals))),
                         format_ms(int(mean(vals))),
                         format_ms(int(ninety_percentile(vals))),
                         format_ms(max(vals)),
                         slow_count,
                         '%.01f%%' % (100.0 * slow_count / len(vals)),
                         format_bytes(bytes_sent),
                         format_bytes(bytes_sent / len(vals))])
        self.writer.table(headers, data)

    def output_activity_graphs(self, interval=DefaultTimeBucket):
        self.writer.section('Activity Rates')
        buckets = group_by_time_interval(self.good, interval, 'ts')

        def second_avg(count):
            return float(count) / total_seconds(interval)

        def avg_bucket_rate(bs, cond):
            return [second_avg(sum(1 for r in b if cond(r))) for b in bs]

        reqs_per_sec = avg_bucket_rate(buckets, lambda r: True)
        images_per_sec = avg_bucket_rate(
            buckets, lambda r: r['urlgroup'] in ('Images', 'Avatars'))
        dumps_per_sec = avg_bucket_rate(
            buckets, lambda r: r['urlgroup'] in ('Post', 'Upload'))
        tags_per_sec = avg_bucket_rate(
            buckets, lambda r: r['urlgroup'] in ('TagAdd', 'TagRemove'))
        chat_users_per_sec = avg_bucket_rate(
            buckets, lambda r: r['urlgroup'] == 'ChatUpdate')
        self.writer.graph('Reqs / sec', 'reqs_per_sec',
                          build_js_array(self.start, interval, reqs_per_sec))
        self.writer.graph('Images / sec', 'images_per_sec',
                          build_js_array(self.start, interval, images_per_sec))
        self.writer.graph('Dumps / sec', 'dumps_per_sec',
                          build_js_array(self.start, interval, dumps_per_sec))
        self.writer.graph('Tags / sec', 'tags_per_sec',
                          build_js_array(self.start, interval, tags_per_sec))
        self.writer.graph('Chat users / sec', 'chat_users_per_sec',
                          build_js_array(self.start, interval, chat_users_per_sec))

        self.writer.section('Response Time')

        def calc_bucket_resp_time(bs, func):
            # Empty buckets (gaps in the log) would crash mean/max; report 0.
            return [func([r['time'] for r in b]) if b else 0 for b in bs]

        mean_resp_time = calc_bucket_resp_time(buckets, mean)
        median_resp_time = calc_bucket_resp_time(buckets, median)
        ninety_pct_time = calc_bucket_resp_time(buckets, ninety_percentile)
        max_resp_time = calc_bucket_resp_time(buckets, max)
        self.writer.graph('Mean response time (ms)', 'mean_resp_time',
                          build_js_array(self.start, interval, mean_resp_time))
        self.writer.graph('Median response time (ms)', 'median_resp_time',
                          build_js_array(self.start, interval, median_resp_time))
        self.writer.graph('90% response time (ms)', 'ninety_pct_time',
                          build_js_array(self.start, interval, ninety_pct_time))
        self.writer.graph('Max response time (ms)', 'max_resp_time',
                          build_js_array(self.start, interval, max_resp_time))
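
    # Worked example of the rate math: with the default 5 minute bucket, a
    # bucket holding 900 matching requests reports 900 / 300 = 3.0 per second.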
    def output_slow_requests(self):
        self.writer.section('Slow URLs')
        slow = sorted((r for r in self.good if r['time'] > SlowReqThreshold),
                      key=lambda r: r['time'], reverse=True)
        entries = []
        for r in slow[:SlowReportCount]:
            e = "%s: %s, %s sent" % (r['url'], format_ms(r['time']),
                                     format_bytes(r['total_bytes']))
            entries.append(e)
        self.writer.list(entries)

    def output_unclassified_urls(self):
        unclassified = set(g['url'] for g in self.good if g['urlgroup'] == 'Other')
        unclassified_str = ', '.join(sorted(unclassified)[:100])
        if len(unclassified) > 100:
            unclassified_str += '...'
        self.writer.section('Unclassified URLs (%s)' % len(unclassified))
        self.writer.paragraph(unclassified_str)

    def output_errors(self):
        errors = set(g['url'] for g in self.good if g['urlgroup'] == 'Error')
        error_str = ', '.join(sorted(errors)[:100])
        self.writer.section('Error Urls (%s)' % len(errors))
        self.writer.paragraph(error_str)

    def output_ip_usage(self):
        grouped = gather_by_key(self.good, 'addr')
        sum_bytes = lambda reqs: sum(r['total_bytes'] for r in reqs)
        addr_bytes_gen = reversed(sorted((sum_bytes(reqs), addr)
                                         for addr, reqs in grouped.iteritems()))
        self.writer.section('Heaviest Users by IP')
        l = []
        for bytes, addr in itertools.islice(addr_bytes_gen, IpReportCount):
            reqs = len(grouped[addr])
            l.append("%s: %s reqs, %s sent"
                     % (addr, comma_format(reqs), format_bytes(bytes)))
        self.writer.list(l)

    def output_report(self):
        # parse_time truncates to the minute, so guard against a zero span.
        span_seconds = total_seconds(self.end - self.start) or 1
        l = ["Interval: %s EST to %s EST (%s)"
             % (format_time(self.start), format_time(self.end), self.end - self.start),
             "%s Total Requests (%.02f r/s)"
             % (comma_format(self.count), self.count / float(span_seconds)),
             "Bytes sent %s" % format_bytes(sum(r['total_bytes'] for r in self.good))]
        self.writer.section('Overview')
        self.writer.list(l)
        self.output_activity_graphs()
        self.output_group_stats()
        self.output_slow_requests()
        self.output_ip_usage()
        self.output_errors()
        self.output_unclassified_urls()


def parse_time(s):
    # Parse times such as 02/Jul/2010:22:21:25 -0400, dropping the seconds
    # and the timezone offset.
    return datetime.datetime.strptime(s[:17], '%d/%b/%Y:%H:%M')


InfoUrls = ['/about_us', '/privacy', '/help', '/search', '/terms',
            '/hall', '/register', '/reset-request']
StaticUrls = ['/favicon.ico', '/robots.txt', '/']


def _classify_profile_url(url, nickset):
    if url.startswith('/u/'):
        url = url[2:]
    comps = [c for c in url.split('/') if c]
    if len(comps) == 0 or comps[0] not in nickset:
        return None
    if len(comps) == 1:
        return 'Profile'
    elif comps[1] == 'popular':
        return 'Popular'
    elif comps[1] == 'favorites':
        return 'Favorites'
    elif comps[1] == 'log':
        return 'Log'
    else:
        return 'Profile'


def classify_url(method, url):
    profile_classification = _classify_profile_url(url, Nickset)
    if profile_classification:
        return profile_classification
    elif url.startswith('/avatars/'):
        return 'Avatars'
    elif url.startswith('/refresh'):
        return 'ChatUpdate'
    elif url.startswith('/chat') or url.startswith('/browser'):
        return 'ChatLoad'
    elif url.startswith('/json/') and url.endswith('/favorites'):
        return 'JsonFavs'
    # Check /login and /logout before the /log prefix, which would shadow them.
    elif url.startswith('/login') or url.startswith('/logout'):
        return 'Login'
    elif url.startswith('/log'):
        return 'RoomLog'
    elif url in InfoUrls or url.startswith('/m/') or url.startswith('/directory'):
        return 'InfoPages'
    elif url == '/msg':
        return 'Post'
    elif url == '/submit-registration':
        return 'Register'
    elif url.startswith('/cmd/ghettosearch/'):
        return 'Search'
    elif url.startswith('/static/') or url in StaticUrls:
        return 'Static'
    elif url.startswith('/images/'):
        return 'Images'
    elif url == '/cmd/tag/add':
        return 'TagAdd'
    elif url == '/cmd/tag/rm':
        return 'TagRemove'
    elif url == '/upload/message':
        return 'Upload'
    elif url.startswith('/p/'):
        return 'ViewDump'
    else:
        return 'Other'
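
# Illustrative only: how a few paths fall through the classifier, assuming
# 'alice' is in Nickset and 'images' is not:
#   classify_url('GET', '/alice')             -> 'Profile'
#   classify_url('GET', '/u/alice/favorites') -> 'Favorites'
#   classify_url('GET', '/images/x.png')      -> 'Images'
#   classify_url('GET', '/no-such-page')      -> 'Other'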
def parse_ms(ms):
    return int(float(ms) * 1000)


def parse_request(s):
    method, url, httpversion = s.split(' ')
    url_group = classify_url(method, url)
    return {'method': method,
            'httpversion': httpversion,
            'urlgroup': url_group,
            'url': url}


def parse_status(s):
    code = int(s)
    # Overriding urlgroup here works because statuses are parsed after urls.
    if code == 302:
        return {'code': 302, 'urlgroup': 'Redirect'}
    elif code == 404:
        return {'code': 404, 'urlgroup': '404'}
    elif code >= 500:
        return {'code': code, 'urlgroup': 'Error'}
    else:
        return {'code': code}


LineFormat = [('addr', str),
              ('ts', parse_time),
              ('request', parse_request),
              ('pipe', str),
              ('status', parse_status),
              ('time', parse_ms),
              ('total_bytes', int),
              ('body_bytes', int),
              ('refer', str),
              ('agent', str)]


def parse_line(l):
    comps = [c.strip() for c in l.split('|')]
    if len(comps) != len(LineFormat):
        # Return (not raise) so one malformed line lands in bad_lines
        # instead of killing the whole report.
        return ("Invalid number of log components (%s instead of %s): %s"
                % (len(comps), len(LineFormat), l.strip()))
    r = {}
    for (label, f), v in zip(LineFormat, comps):
        try:
            res = f(v)
            if isinstance(res, dict):
                r.update(res)
            else:
                r[label] = res
        except Exception, e:
            return "Unable to convert %s field '%s': %s (%s)" % (label, v, e, l.strip())
    return r


def parse_log(path):
    print 'reading %s' % path
    with open(path, 'r') as f:
        lines = [parse_line(l) for l in f]
    good_lines = [l for l in lines if isinstance(l, dict)]
    bad_lines = [l for l in lines if not isinstance(l, dict)]
    return good_lines, bad_lines


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "usage: logreport.py logfile [outfile]"
        sys.exit(1)
    logpath = sys.argv[1]
    outpath = sys.argv[2] if len(sys.argv) > 2 else None
    Nickset = fetch_nicks()
    good_lines, bad_lines = parse_log(logpath)
    if len(good_lines) == 0:
        print "Log file is empty or cannot be read!"
        sys.exit(1)
    if outpath:
        report_title = get_report_title(good_lines)
        writer = HtmlWriter(outpath, report_title)
    else:
        writer = ConsoleWriter()
    report = Report(writer, good_lines, bad_lines)
    report.output_report()
    writer.close()
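
# Illustrative only: a hypothetical log line in the expected pipe-separated
# layout (field order per LineFormat above; the 'pipe' field's contents are
# a guess):
#
#   1.2.3.4 | 02/Jul/2010:22:21:25 -0400 | GET /chat HTTP/1.1 | . | 200 | 0.042 | 5120 | 4800 | - | Mozilla/5.0
#
# Assuming no user is nicknamed 'chat', parse_line turns it into a dict with
# urlgroup='ChatLoad', time=42 (ms), total_bytes=5120, and so on.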