from __future__ import with_statement

import calendar
import collections
import datetime
import functools
import itertools
import math
import psycopg2
import re
import sys

from writer import ConsoleWriter, HtmlWriter

# Populated from the database in __main__; defaults to an empty set so that
# profile classification degrades gracefully when used as a library.
Nickset = set()
SlowReqThreshold = 1000
WindowInterval = datetime.timedelta(minutes=5)
ConnStr = "dbname=dumpfm user=postgres password=root"


def fetch_nicks():
    try:
        print "fetching user nicknames from db"
        conn = psycopg2.connect(ConnStr)
        cur = conn.cursor()
        cur.execute("select nick from users")
        return set(n for (n,) in cur.fetchall())
    except Exception, e:
        print "Error fetching user nicknames: %s" % e
        print "Profile urls will be incorrectly classified as 'Other'"
        return set()


def comma_format(x):
    # http://code.activestate.com/recipes/498181-add-thousands-separator-commas-to-formatted-number/
    return re.sub(r'(\d{3})(?=\d)', r'\1,', str(x)[::-1])[::-1]


def format_unit(val, units, factors):
    for unit, factor in zip(units, factors):
        if factor in (0, 1):
            return "%s%s" % (val, unit)
        elif val >= factor:
            return "%0.2f%s" % (float(val) / factor, unit)
    raise ValueError('Cannot format %s with any of %s' % (val, units))


def format_time(t):
    return t.strftime('%Y/%m/%d %H:%M')


def format_ms(ms):
    return format_unit(ms, ['m', 's', 'ms'], [60 * 1000, 1000, 0])


def format_bytes(bytes):
    return format_unit(bytes, ['GB', 'MB', 'KB', 'B'],
                       [1024 ** i for i in range(3, -1, -1)])


def format_pct(n):
    return '%.02f%%' % (100 * n)


def median(pool):
    copy = sorted(pool)
    size = len(copy)
    if size % 2 == 1:
        return copy[(size - 1) / 2]
    else:
        # Float division so the median of two ints isn't truncated.
        return (copy[size / 2 - 1] + copy[size / 2]) / 2.0


def mean(pool):
    # Float division: response times are ints and would otherwise truncate.
    return sum(pool) / float(len(pool))


def percentile(N, percent, key=lambda x: x):
    """
    From http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
    Find the percentile of a list of values.

    @parameter N - a list of values. Note N MUST BE already sorted.
    @parameter percent - a float value from 0.0 to 1.0.
    @parameter key - optional key function to compute value from each element of N.
    @return - the percentile of the values
    """
    if not N:
        return None
    k = (len(N) - 1) * percent
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return key(N[int(k)])
    # Linear interpolation between the two neighboring values: the lower
    # value is weighted by (c - k) and the upper by (k - f), as in the
    # recipe cited above.
    d0 = key(N[int(f)]) * (c - k)
    d1 = key(N[int(c)]) * (k - f)
    return d0 + d1


def ninety_percentile(N):
    return percentile(N, .9)
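
# Illustrative examples for the helpers above (worked by hand, not executed
# at import time):
#
#   comma_format(1234567)  -> '1,234,567'
#   format_ms(2500)        -> '2.50s'
#   format_bytes(1536)     -> '1.50KB'
#   median([1, 2, 3, 4])   -> 2.5
#   percentile([10, 20, 30, 40], .9)
#       k = 2.7, so the result interpolates between N[2] and N[3]:
#       30 * 0.3 + 40 * 0.7 = 37.0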

def gather_by_key(rs, key):
    d = collections.defaultdict(list)
    for r in rs:
        group = r[key]
        d[group].append(r)
    return d


def group_by_time_interval(xs, interval, timekey):
    if len(xs) == 0:
        raise ValueError("Cannot group empty list")
    start = xs[0][timekey]
    end = xs[-1][timekey]
    # timedelta.seconds wraps at one day, so include the days component in
    # case the log spans more than 24 hours.
    interval_seconds = interval.days * 86400 + interval.seconds
    span_delta = end - start
    span = (span_delta.days * 86400 + span_delta.seconds) / interval_seconds + 1
    res = [[] for i in range(span)]
    for x in xs:
        delta = x[timekey] - start
        idx = (delta.days * 86400 + delta.seconds) / interval_seconds
        res[idx].append(x)
    return res


def get_report_title(entries):
    start_time = entries[0]['ts']
    end_time = entries[-1]['ts']
    return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time)


def build_js_array(pairs):
    s_list = []
    for ts, val in pairs:
        # Javascript timestamps are milliseconds since the epoch.
        unix_time = calendar.timegm(ts.timetuple()) * 1000
        s_list.append('[%s, %s]' % (unix_time, val))
    return '[%s]' % ','.join(s_list)
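
# build_js_array produces the nested-array literal the graphing code expects.
# With made-up values, two consecutive five-minute windows might render as:
#   '[[1278103200000, 3.5],[1278103500000, 4.1]]'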

def summarize_window(ts, window, slow_threshold):
    # Counters are floats so later per-second rates divide cleanly.
    reqs = 0.0
    dumps = 0.0
    images = 0.0
    tags = 0.0
    chat_users = 0.0
    slow = 0
    for r in window:
        reqs += 1
        group = r['urlgroup']
        if group in ('Images', 'Avatars'):
            images += 1
        elif group in ('Post', 'Upload'):
            dumps += 1
        elif group in ('TagAdd', 'TagRemove'):
            tags += 1
        elif group == 'ChatUpdate':
            chat_users += 1
        if r['time'] > slow_threshold:
            slow += 1
    # percentile() requires a pre-sorted list, so sort once here.
    times = sorted(r['time'] for r in window)
    return {
        'ts': ts,
        'reqs': reqs,
        'dumps': dumps,
        'images': images,
        'tags': tags,
        'chat_users': chat_users,
        'slow': slow,
        'mean_resp': mean(times),
        'median_resp': median(times),
        'ninety_resp': ninety_percentile(times),
        'max_resp': max(times),
    }


ReportSetLimit = 100


class DumpReport(object):
    def __init__(self, interval, slow_thresh=SlowReqThreshold):
        self.windows = []
        self.interval = interval
        self.bytes_sent = 0
        self.requests = 0
        self.dumps = 0
        self.slow_requests = 0
        self.new_users = 0
        self.unclassified_urls = set()
        self.error_urls = set()
        self.start_time = None
        self.end_time = None
        self.urlgroup_stats = {}
        self.slow_request_threshold = slow_thresh
        self.printed_dates = set()

    def record_window(self, ts, window):
        date_str = ts.strftime('%Y/%m/%d')
        if date_str not in self.printed_dates:
            print "processing %s" % date_str
            self.printed_dates.add(date_str)
        if len(window) > 0:
            if self.start_time is None:
                self.start_time = window[0]['ts']
            window_end_time = window[-1]['ts']
            if self.end_time:
                self.end_time = max(self.end_time, window_end_time)
            else:
                self.end_time = window_end_time
        # TODO: consolidate summary logic
        window_summary = summarize_window(ts, window, self.slow_request_threshold)
        self.windows.append(window_summary)
        for record in window:
            self.bytes_sent += record['total_bytes']
            self.requests += 1
            if record['urlgroup'] == 'Other' and len(self.unclassified_urls) < ReportSetLimit:
                self.unclassified_urls.add(record['url'])
            if record['urlgroup'] == 'Error' and len(self.error_urls) < ReportSetLimit:
                self.error_urls.add(record['url'])
            r_time = record['time']
            group = record['urlgroup']
            if group in ('Post', 'Upload'):
                self.dumps += 1
            elif group == 'Register':
                self.new_users += 1
            if group not in self.urlgroup_stats:
                stats = self.urlgroup_stats[group] = {
                    'min': r_time, 'max': r_time, 'time': 0,
                    'slow': 0, 'requests': 0, 'bytes': 0
                }
            else:
                stats = self.urlgroup_stats[group]
                stats['min'] = min(stats['min'], r_time)
                stats['max'] = max(stats['max'], r_time)
            stats['requests'] += 1
            stats['bytes'] += record['total_bytes']
            stats['time'] += r_time
            if r_time > self.slow_request_threshold:
                self.slow_requests += 1
                stats['slow'] += 1

    def _output_errors_and_unclassified(self, writer):
        if len(self.error_urls) > 0:
            writer.section('Error URLs (%s)' % len(self.error_urls))
            writer.paragraph(', '.join(sorted(self.error_urls)))
        if len(self.unclassified_urls) > 0:
            writer.section('Unclassified URLs (%s)' % len(self.unclassified_urls))
            writer.paragraph(', '.join(sorted(self.unclassified_urls)))

    def _output_urlgroups(self, writer):
        # One row per url group; the first column is the group name itself.
        headers = ['group', '#', 'min', 'mean', 'max', '# slow', '% slow',
                   'bytes', 'bytes / req']
        data = []
        for group in sorted(self.urlgroup_stats.keys()):
            stats = self.urlgroup_stats[group]
            avg_time = stats['time'] / stats['requests']
            data.append([group,
                         comma_format(stats['requests']),
                         format_ms(stats['min']),
                         format_ms(avg_time),
                         format_ms(stats['max']),
                         stats['slow'],
                         format_pct(float(stats['slow']) / stats['requests']),
                         format_bytes(stats['bytes']),
                         format_bytes(stats['bytes'] / stats['requests'])])
        writer.table(headers, data)

    def _output_usage_charts(self, writer):
        writer.section('Activity Rates')
        writer.graph('Reqs/sec', 'reqs_per_sec',
                     build_js_array((r['ts'], r['reqs'] / self.interval.seconds)
                                    for r in self.windows))
        writer.graph('Images/sec', 'images_per_sec',
                     build_js_array((r['ts'], r['images'] / self.interval.seconds)
                                    for r in self.windows))
        writer.graph('Dumps/sec', 'dumps_per_sec',
                     build_js_array((r['ts'], r['dumps'] / self.interval.seconds)
                                    for r in self.windows))
        writer.graph('Tags/sec', 'tags_per_sec',
                     build_js_array((r['ts'], r['tags'] / self.interval.seconds)
                                    for r in self.windows))
        writer.graph('Chat users/sec', 'users_per_sec',
                     build_js_array((r['ts'], r['chat_users'] / self.interval.seconds)
                                    for r in self.windows))
        writer.section('Response Rates')
        writer.graph('Mean response time', 'mean_resp',
                     build_js_array((r['ts'], r['mean_resp']) for r in self.windows))
        writer.graph('Median response time', 'median_resp',
                     build_js_array((r['ts'], r['median_resp']) for r in self.windows))
        writer.graph('90% response time', 'ninety_resp',
                     build_js_array((r['ts'], r['ninety_resp']) for r in self.windows))
        writer.graph('Max response time', 'max_resp',
                     build_js_array((r['ts'], r['max_resp']) for r in self.windows))

    def _output_report(self, writer):
        writer.section('Overview')
        duration = self.end_time - self.start_time
        # Include days: timedelta.seconds alone wraps at one day.
        duration_seconds = duration.days * 86400 + duration.seconds
        summary = [
            '%s EST to %s EST (%s)' % (format_time(self.start_time),
                                       format_time(self.end_time), duration),
            '%s requests (%.02f r/s)' % (comma_format(self.requests),
                                         float(self.requests) / duration_seconds),
            '%s slow requests (%s)' % (comma_format(self.slow_requests),
                                       format_pct(float(self.slow_requests) / self.requests)),
            '%s bytes sent' % format_bytes(self.bytes_sent)]
        writer.list(summary)
        self._output_usage_charts(writer)
        self._output_urlgroups(writer)
        self._output_errors_and_unclassified(writer)

    def write_report(self, path):
        writer = HtmlWriter(path, 'Dump Report')
        self._output_report(writer)
        writer.close()

    def print_report(self):
        writer = ConsoleWriter()
        self._output_report(writer)


def peek(iterable):
    try:
        first = iterable.next()
    except StopIteration:
        raise ValueError('cannot peek on empty iterator')
    return first, itertools.chain([first], iterable)


def yield_log_windows(raw_iter, interval, parse_func, timekey):
    # Parse lazily; drop lines that fail to parse (parse_func returns None).
    wrapped_iter = itertools.ifilter(None, itertools.imap(parse_func,
                                                          itertools.count(1),
                                                          raw_iter))
    first, wrapped_iter = peek(wrapped_iter)
    start_time = first[timekey]
    interval_seconds = interval.days * 86400 + interval.seconds

    def keyfunc(record):
        # Bucket index of the record relative to the first record; include
        # days since timedelta.seconds wraps at one day.
        delta = record[timekey] - start_time
        return (delta.days * 86400 + delta.seconds) / interval_seconds

    for i, g in itertools.groupby(wrapped_iter, keyfunc):
        yield start_time + (interval * i), list(g)
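
# yield_log_windows relies on the log being in timestamp order: groupby only
# merges *consecutive* records sharing a bucket index, so an out-of-order log
# would yield the same window index more than once. A sketch of the intended
# behavior, assuming three records at 0, 2, and 7 minutes past the start with
# the default 5-minute interval: the first two land in bucket 0 and the third
# in bucket 1, yielded as (start, [r0, r1]) then (start + 5min, [r2]).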

# Dump log parsing

def parse_time(s):
    # Parse times such as 02/Jul/2010:22:21:25 -0400. Only the first 17
    # characters are kept, which truncates the seconds and the timezone.
    return datetime.datetime.strptime(s[:17], '%d/%b/%Y:%H:%M')


InfoUrls = ['/about_us', '/privacy', '/help', '/search', '/terms', '/hall',
            '/register', '/reset-request']
StaticUrls = ['/favicon.ico', '/robots.txt', '/']


def _classify_profile_url(url, nickset):
    if url.startswith('/u/'):
        url = url[2:]
    # Drop empty path components ('/nick/' splits with empty strings).
    comps = [c for c in url.split('/') if c]
    if len(comps) == 0 or comps[0] not in nickset:
        return None
    if len(comps) == 1:
        return 'Profile'
    elif comps[1] == 'popular':
        return 'Popular'
    elif comps[1] == 'favorites':
        return 'Favorites'
    elif comps[1] == 'log':
        return 'Log'
    else:
        return 'Profile'


def classify_url(method, url):
    profile_classification = _classify_profile_url(url, Nickset)
    if profile_classification:
        return profile_classification
    elif url.startswith('/avatars/'):
        return 'Avatars'
    elif url.startswith('/refresh'):
        return 'ChatUpdate'
    elif url.startswith('/chat') or url.startswith('/browser'):
        return 'ChatLoad'
    elif url.startswith('/json/') and url.endswith('/favorites'):
        return 'JsonFavs'
    elif url.startswith('/log'):
        return 'RoomLog'
    elif url.startswith('/login') or url.startswith('/logout'):
        return 'Login'
    elif url in InfoUrls or url.startswith('/m/') or url.startswith('/directory'):
        return 'InfoPages'
    elif url == '/msg':
        return 'Post'
    elif url == '/submit-registration':
        return 'Register'
    elif url.startswith('/cmd/ghettosearch/'):
        return 'Search'
    elif url.startswith('/static/') or url in StaticUrls:
        return 'Static'
    elif url.startswith('/images/'):
        return 'Images'
    elif url == '/cmd/tag/add':
        return 'TagAdd'
    elif url == '/cmd/tag/rm':
        return 'TagRemove'
    elif url == '/upload/message':
        return 'Upload'
    elif url.startswith('/p/'):
        return 'ViewDump'
    else:
        return 'Other'


def parse_ms(ms):
    # Log times are in seconds; convert to integer milliseconds.
    return int(float(ms) * 1000)


def parse_request(s):
    method, url, httpversion = s.split(' ')
    url_group = classify_url(method, url)
    return {'method': method, 'httpversion': httpversion,
            'urlgroup': url_group, 'url': url}


def parse_status(s):
    code = int(s)
    # Overriding 'urlgroup' here works because statuses are parsed after
    # urls (see the field order in LineFormat below).
    if code == 302:
        return {'code': 302, 'urlgroup': 'Redirect'}
    elif code == 404:
        return {'code': 404, 'urlgroup': '404'}
    elif code >= 500:
        return {'code': code, 'urlgroup': 'Error'}
    else:
        return {'code': code}
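
# The access log is pipe-delimited, one request per line, with the ten fields
# listed in LineFormat below. A hypothetical line (field values invented for
# illustration) would look like:
#   1.2.3.4 | 02/Jul/2010:22:21:25 -0400 | GET /chat HTTP/1.1 | . | 200 | 0.042 | 1523 | 1100 | - | Mozilla/5.0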

LineFormat = [('addr', str),
              ('ts', parse_time),
              ('request', parse_request),
              ('pipe', str),
              ('status', parse_status),
              ('time', parse_ms),
              ('total_bytes', int),
              ('body_bytes', int),
              ('refer', str),
              ('agent', str)]


def parse_dump_log_entry(i, l, error_func=None):
    comps = [c.strip() for c in l.split('|')]
    if len(comps) != len(LineFormat):
        if error_func:
            error_func(i, l, "Invalid number of log components (%s instead of %s)"
                       % (len(comps), len(LineFormat)))
        return None
    r = {}
    for (label, f), v in zip(LineFormat, comps):
        try:
            res = f(v)
            # Parsers may return a dict of several fields (e.g. parse_request)
            # rather than a single value.
            if isinstance(res, dict):
                r.update(res)
            else:
                r[label] = res
        except Exception, e:
            if error_func:
                error_func(i, l, "error converting %s field '%s': %s" % (label, v, e))
            return None
    return r


def default_error_handler(i, line, err):
    # print "line %s: %s" % (i, err)
    pass


default_line_parser = functools.partial(parse_dump_log_entry,
                                        error_func=default_error_handler)


def analyze_dump_log(path, interval=WindowInterval,
                     line_parser=default_line_parser, timekey='ts'):
    report = DumpReport(interval)
    with open(path, 'r') as f:
        for ts, window in yield_log_windows(f, interval, line_parser, timekey):
            report.record_window(ts, window)
    return report


if __name__ == "__main__":
    if len(sys.argv) == 1:
        print "usage: logreport.py logfile [outfile]"
        sys.exit(1)
    logpath = sys.argv[1]
    outpath = sys.argv[2] if len(sys.argv) > 2 else None
    Nickset = fetch_nicks()
    report = analyze_dump_log(logpath)
    if outpath:
        report.write_report(outpath)
    else:
        report.print_report()
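
# Example invocations (file paths are placeholders):
#   python logreport.py access.log report.html   # write an HTML report
#   python logreport.py access.log               # print the report to stdout
# Or from another module, after populating Nickset:
#   report = analyze_dump_log('access.log'); report.print_report()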