From 88fb87e7c2d72db2c5f7b3e7a1ee41f021e5c40d Mon Sep 17 00:00:00 2001
From: Scott Ostler
Date: Thu, 8 Jul 2010 19:37:38 -0400
Subject: logreport.py rewritten to use generators

---
 scripts/logreport.py | 509 ++++++++++++++++++++++++---------------------------
 1 file changed, 238 insertions(+), 271 deletions(-)

diff --git a/scripts/logreport.py b/scripts/logreport.py
index 6f4799c..c2d7712 100644
--- a/scripts/logreport.py
+++ b/scripts/logreport.py
@@ -11,11 +11,13 @@ import psycopg2
 import sys
 import time
 
+from writer import ConsoleWriter, HtmlWriter
+
 locale.setlocale(locale.LC_ALL, "")
 
 Nickset = None
 SlowReqThreshold = 1000
-DefaultTimeBucket = datetime.timedelta(minutes=5)
+WindowInterval = datetime.timedelta(minutes=5)
 ConnStr = "dbname=dumpfm user=postgres password=root"
 
 def fetch_nicks():
@@ -52,6 +54,9 @@ def format_bytes(bytes):
                         ['GB', 'MB', 'KB', 'B'],
                         [1024 ** i for i in range(3, -1, -1)])
 
+def format_pct(n):
+    return '%.02f%%' % (100 * n)
+
 def median(pool):
     copy = sorted(pool)
     size = len(copy)
@@ -115,266 +120,223 @@ def get_report_title(entries):
     end_time = entries[-1]['ts']
     return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time)
 
-def build_js_array(start_time, interval, values):
-    pairs = []
-    for i, val in enumerate(values):
-        t = start_time + interval * i
-        unix_time = calendar.timegm(t.timetuple()) * 1000
-        pairs.append('[%s, %s]' % (unix_time, val))
-    return '[%s]' % ','.join(pairs)
+def build_js_array(pairs):
+    s_list = []
+    for ts, val in pairs:
+        unix_time = calendar.timegm(ts.timetuple()) * 1000
+        s_list.append('[%s, %s]' % (unix_time, val))
+    return '[%s]' % ','.join(s_list)
+
+def summarize_window(ts, window, slow_threshold):
+    reqs = 0.0
+    dumps = 0.0
+    images = 0.0
+    tags = 0.0
+    chat_users = 0.0
+    slow = 0
+
+    for r in window:
+        reqs += 1
+        group = r['urlgroup']
+        if group in ('Images', 'Avatars'):
+            images += 1
+        elif group in ('Post', 'Upload'):
+            dumps += 1
+        elif group in ('TagAdd', 'TagRemove'):
+            tags += 1
+        elif group == 'ChatUpdate':
+            chat_users += 1
+        if r['time'] > slow_threshold:
+            slow += 1
+
+    times = [r['time'] for r in window]
+    return { 'ts': ts,
+             'reqs': reqs,
+             'dumps': dumps,
+             'images': images,
+             'tags': tags,
+             'chat_users': chat_users,
+             'slow': slow,
+             'mean_resp': mean(times),
+             'median_resp': median(times),
+             'ninety_resp': ninety_percentile(times),
+             'max_resp': max(times)
+             }
+
+ReportSetLimit = 100
+
+class DumpReport(object):
+
+    def __init__(self, interval, slow_thresh=SlowReqThreshold):
+        self.windows = []
+        self.interval = interval
+        self.bytes_sent = 0
+        self.requests = 0
+        self.dumps = 0
+        self.slow_requests = 0
+        self.new_users = 0
+        self.unclassified_urls = set()
+        self.error_urls = set()
+        self.start_time = None
+        self.end_time = None
+        self.urlgroup_stats = {}
+        self.slow_request_threshold = slow_thresh
+
+    def record_window(self, ts, window):
+        if len(window) > 0:
+            if self.start_time is None:
+                self.start_time = window[0]['ts']
+            window_end_time = window[-1]['ts']
+            if self.end_time:
+                self.end_time = max(self.end_time, window_end_time)
+            else:
+                self.end_time = window_end_time
 
-SlowReportCount = 10
-IpReportCount = 10
+        # TODO: consolidate summary logic
+        window_summary = summarize_window(ts, window, self.slow_request_threshold)
+        self.windows.append(window_summary)
 
-class ConsoleWriter(object):
+        for record in window:
+            self.bytes_sent += record['total_bytes']
+            self.requests += 1
+
+            if record['urlgroup'] == 'Other' and len(self.unclassified_urls) < ReportSetLimit:
+                self.unclassified_urls.add(record['url'])
 
-    def section(self, label):
-        print "\n----- %s -----" % label
-
-    def paragraph(self, text):
-        print text
-
-    def list(self, entries):
-        for e in entries:
-            print "- %s" % e
-
-    def table(self, headers, data):
-        for row in data:
-            label = row[0]
-            if label:
-                print "\n%s:" % label
-            for h, d in zip(headers, row[1:]):
-                print "  - %s: %s" % (h, d)
-
-    def graph(self, *args, **kwds):
-        pass
-
-    def close(self):
-        pass
-
-HtmlTemplate = """
-<html>
-<head>
-<title>%s</title>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="jquery.flot.js"></script>
-</head>
-<body>
-%s
-</body></html>
-"""
-
-JqueryTemplate = """
-$(function () {
-    $.plot($("#%s"),
-           [ { data: %s, label: "%s" } ],
-           { xaxis: { mode: 'time' },
-             yaxis: { min: 0 },
-             legend: { position: 'sw' } });
-});
-
-"""
-
-class HtmlWriter(object):
-
-    def __init__(self, path, title):
-        self.path = path
-        self.title = title
-        self.content = ""
-
-    def section(self, label):
-        self.content += "<h2>%s</h2>\n" % label
-
-    def paragraph(self, text):
-        self.content += "<p>%s</p>\n" % text
-
-    def list(self, entries):
-        self.content += "<ul>"
-        for e in entries:
-            self.content += "<li>%s</li>" % e
-        self.content += "</ul>\n"
-
-    def table(self, headers, data):
-        self.content += "<table>"
-        for h in headers:
-            self.content += "<th>%s</th>" % h
-        self.content += "\n"
-        for row in data:
-            self.content += "<tr>"
+            if record['urlgroup'] == 'Error' and len(self.error_urls) < ReportSetLimit:
+                self.error_urls.add(record['url'])
+
+            r_time = record['time']
+            group = record['urlgroup']
 
-            for i, val in enumerate(row):
-                if i == 0:
-                    self.content += "<td><b>%s</b></td>" % val
-                else:
-                    self.content += "<td>%s</td>" % val
-            self.content += "</tr>\n"
-        self.content += "</table>\n"
-
-    def graph(self, label, graph_id, data):
-        self.content += '<div id="%s" style="width:600px;height:300px;"></div>' % graph_id
-        js_array = "var %s = %s;" % (graph_id, data)
-        graph_command = JqueryTemplate % (graph_id, graph_id, label)
-        self.content += '<script type="text/javascript">%s %s</script>' % (js_array, graph_command)
-
-    def _assemble(self):
-        return HtmlTemplate % (self.title or 'Untitled', self.content)
-
-    def close(self):
-        print "writing report to %s" % self.path
-        text = self._assemble()
-        with open(self.path, 'w') as f:
-            f.write(text)
-
+            if group in ('Post', 'Upload'):
+                self.dumps += 1
+            elif group == 'Register':
+                self.new_users += 1
+
+            if group not in self.urlgroup_stats:
+                stats = self.urlgroup_stats[group] = { 'min': r_time,
+                                                       'max': r_time,
+                                                       'time': 0,
+                                                       'slow': 0,
+                                                       'requests': 0,
+                                                       'bytes': 0 }
+            else:
+                stats = self.urlgroup_stats[group]
+                stats['min'] = min(stats['min'], r_time)
+                stats['max'] = max(stats['max'], r_time)
 
-class Report(object):
+            stats['requests'] += 1
+            stats['bytes'] += record['total_bytes']
+            stats['time'] += r_time
+            if r_time > self.slow_request_threshold:
+                self.slow_requests += 1
+                stats['slow'] += 1
 
-    def __init__(self, writer, good, bad):
-        self.writer = writer
-        self.good = good
-        self.bad = bad
-        self.count = len(good)
-        self.start = self.good[0]['ts']
-        self.end = self.good[-1]['ts']
-
-    def output_group_stats(self):
-        self.writer.section('URL group stats')
-        grouped = gather_by_key(self.good, 'urlgroup')
+    def _output_errors_and_unclassified(self, writer):
+        if len(self.error_urls) > 0:
+            writer.section('Error URLs (%s)' % len(self.error_urls))
+            writer.paragraph(', '.join(sorted(self.error_urls)))
 
-        headers = ['#', 'min', 'median', 'mean', '90%', 'max', '# slow', '% slow',
-                   'bytes sent', 'bytes/req']
-        data = []
+        if len(self.unclassified_urls) > 0:
+            writer.section('Unclassified URLs (%s)' % len(self.unclassified_urls))
+            writer.paragraph(', '.join(sorted(self.unclassified_urls)))
 
-        for url_group in sorted(grouped.keys()):
-            vals = sorted(r['time'] for r in grouped[url_group])
-            slow_count = sum(1 for v in vals if v > SlowReqThreshold)
-            bytes_sent = sum(r['total_bytes'] for r in grouped[url_group])
-            data.append([url_group,
-                         comma_format(len(vals)),
-                         format_ms(min(vals)),
-                         format_ms(median(vals)),
-                         format_ms(mean(vals)),
-                         format_ms(ninety_percentile(vals)),
-                         format_ms(max(vals)),
-                         slow_count,
-                         '%.01f%%' % (100 * slow_count / len(vals)),
-                         format_bytes(bytes_sent),
-                         format_bytes(bytes_sent / len(vals))])
-        self.writer.table(headers, data)
-
-    def output_activity_graphs(self, interval=DefaultTimeBucket):
-        self.writer.section('Activity Rates')
-        buckets = group_by_time_interval(self.good, interval, 'ts')
-        def second_avg(count): return float(count) / interval.seconds
-
-        def avg_bucket_rate(bs, cond):
-            return [second_avg(sum(1 for r in b if cond(r))) for b in bs]
-
-        reqs_per_sec = avg_bucket_rate(buckets, lambda r: True)
-        images_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Images', 'Avatars'))
-        dumps_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Post', 'Upload'))
-        tags_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('TagAdd', 'TagRemove'))
-        chat_users_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] == 'ChatUpdate')
-
-
-        self.writer.graph('Reqs / sec', 'reqs_per_sec',
-                          build_js_array(self.start, interval, reqs_per_sec))
-        self.writer.graph('Images / sec', 'images_per_sec',
-                          build_js_array(self.start, interval, images_per_sec))
-        self.writer.graph('Dumps / sec', 'dumps_per_sec',
-                          build_js_array(self.start, interval, dumps_per_sec))
-        self.writer.graph('Tags / sec', 'tags_per_sec',
-                          build_js_array(self.start, interval, tags_per_sec))
-        self.writer.graph('Chat users / sec', 'chat_users_per_sec',
-                          build_js_array(self.start, interval, chat_users_per_sec))
-
-        self.writer.section('Response Time')
+
+    def _output_urlgroups(self, writer):
+        headers = ['#', 'min', 'mean', 'max', '# slow', '% slow', 'bytes', 'bytes/req']
+        data = []
+        for group in sorted(self.urlgroup_stats.keys()):
+            stats = self.urlgroup_stats[group]
+            avg_time = stats['time'] / stats['requests']
+            data.append([group,
+                         comma_format(stats['requests']),
+                         format_ms(stats['min']),
+                         format_ms(avg_time),
+                         format_ms(stats['max']),
+                         stats['slow'],
+                         format_pct(float(stats['slow']) / stats['requests']),
+                         format_bytes(stats['bytes']),
+                         format_bytes(stats['bytes'] / stats['requests'])])
+        writer.table(headers, data)
 
-        def calc_bucket_resp_time(bs, func):
-            return [func([r['time'] for r in b]) for b in bs]
-
-        mean_resp_time = calc_bucket_resp_time(buckets, mean)
-        median_resp_time = calc_bucket_resp_time(buckets, median)
-        ninety_pct_time = calc_bucket_resp_time(buckets, ninety_percentile)
-        max_resp_time = calc_bucket_resp_time(buckets, max)
-
-        self.writer.graph('Mean response time (ms)', 'mean_resp_time',
-                          build_js_array(self.start, interval, mean_resp_time))
-        self.writer.graph('Median response time (ms)', 'median_per_sec',
-                          build_js_array(self.start, interval, median_resp_time))
-        self.writer.graph('90% response time (ms)', 'ninety_pct_time',
-                          build_js_array(self.start, interval, ninety_pct_time))
-        self.writer.graph('Max response time (ms)', 'max_resp_time',
-                          build_js_array(self.start, interval, max_resp_time))
-
-
-    def output_slow_requests(self):
-        self.writer.section('Slow URLs')
-        slow = sorted((r for r in self.good if r['time'] > SlowReqThreshold),
-                      key=lambda r: r['time'] * -1)
-        entries = []
-        for r in slow[:SlowReportCount]:
-            e = "%s: %s, %s sent" % (r['url'],
-                                     format_ms(r['time']),
-                                     format_bytes(r['total_bytes']))
-            entries.append(e)
-        self.writer.list(entries)
-
-    def output_unclassified_urls(self):
-        unclassified = set(g['url'] for g in self.good if g['urlgroup'] == 'Other')
-        unclassified_str = ', '.join(sorted(unclassified)[:100])
-        if len(unclassified) > 100:
-            unclassified_str += '...'
-        self.writer.section('Unclassified URLs (%s)' % len(unclassified))
-        self.writer.paragraph(unclassified_str)
-
-    def output_errors(self):
-        errors = set(g['url'] for g in self.good if g['urlgroup'] == 'Error')
-        error_str = ', '.join(sorted(errors)[:100])
-        self.writer.section('Error Urls (%s)' % len(errors))
-        self.writer.paragraph(error_str)
-
-    def output_ip_usage(self):
-        grouped = gather_by_key(self.good, 'addr')
-        sum_bytes = lambda reqs: sum(r['total_bytes'] for r in reqs)
-        addr_bytes_gen = reversed(sorted((sum_bytes(reqs), addr) for addr, reqs in grouped.iteritems()))
-        self.writer.section('Heaviest Users by IP')
+
+    def _output_usage_charts(self, writer):
+        writer.section('Activity Rates')
+        writer.graph('Reqs/sec', 'reqs_per_sec',
+                     build_js_array((r['ts'], r['reqs'] / self.interval.seconds) for r in self.windows))
+        writer.graph('Images/sec', 'images_per_sec',
+                     build_js_array((r['ts'], r['images'] / self.interval.seconds) for r in self.windows))
+        writer.graph('Dumps/sec', 'dumps_per_sec',
+                     build_js_array((r['ts'], r['dumps'] / self.interval.seconds) for r in self.windows))
+        writer.graph('Tags/sec', 'tags_per_sec',
+                     build_js_array((r['ts'], r['tags'] / self.interval.seconds) for r in self.windows))
+        writer.graph('Chat users/sec', 'users_per_sec',
+                     build_js_array((r['ts'], r['chat_users'] / self.interval.seconds) for r in self.windows))
+
+        writer.section('Response Rates')
+        writer.graph('Mean response time', 'mean_resp',
+                     build_js_array((r['ts'], r['mean_resp']) for r in self.windows))
+        writer.graph('Median response time', 'median_resp',
+                     build_js_array((r['ts'], r['median_resp']) for r in self.windows))
+        writer.graph('90% response time', 'ninety_resp',
+                     build_js_array((r['ts'], r['ninety_resp']) for r in self.windows))
+        writer.graph('Max response time', 'max_resp',
+                     build_js_array((r['ts'], r['max_resp']) for r in self.windows))
+
+
+    def _output_report(self, writer):
+        writer.section('Overview')
+        duration = self.end_time - self.start_time
+        summary = [
+            '%s EST to %s EST (%s)' % (format_time(self.start_time),
+                                       format_time(self.end_time),
+                                       duration),
+            '%s requests (%.02f r/s)' % (comma_format(self.requests),
+                                         float(self.requests) / duration.seconds),
+            '%s slow requests (%s)' % (self.slow_requests,
+                                       format_pct(float(self.slow_requests) / self.requests)),
+            '%s bytes sent' % format_bytes(self.bytes_sent)]
+        writer.list(summary)
+        self._output_usage_charts(writer)
+        self._output_urlgroups(writer)
+        self._output_errors_and_unclassified(writer)
+
+    def write_report(self, path):
+        writer = HtmlWriter(path, 'Dump Report')
+        self._output_report(writer)
+        writer.close()
 
-        l = []
-        for count, addr in itertools.islice(addr_bytes_gen, IpReportCount):
-            bytes = sum(r['total_bytes'] for r in grouped[addr])
-            l.append("%s: %s reqs, %s sent" % (addr,
-                                               comma_format(count),
-                                               format_bytes(bytes)))
-        self.writer.list(l)
-
-    def output_report(self):
-        l = ["Interval: %s EST to %s EST (%s)" % (format_time(self.start),
-                                                  format_time(self.end),
-                                                  self.end - self.start),
-             "%s Total Requests (%.02f r/s)" \
-                 % (comma_format(self.count), self.count / (self.end - self.start).seconds),
-             "Bytes sent %s" % format_bytes(sum(r['total_bytes'] for r in self.good))]
-        self.writer.section('Overview')
-        self.writer.list(l)
-        self.output_activity_graphs()
-        self.output_group_stats()
-        self.output_slow_requests()
-        self.output_ip_usage()
-        self.output_errors()
-        self.output_unclassified_urls()
+    def print_report(self):
+        writer = ConsoleWriter()
+        self._output_report(writer)
+
+def peek(iterable):
+    first = None
+    try:
+        first = iterable.next()
+    except StopIteration:
+        raise ValueError('cannot peek on empty iterator')
+    return first, itertools.chain([first], iterable)
+
+
+def yield_log_windows(raw_iter, interval, parse_func, timekey):
+    wrapped_iter = itertools.ifilter(None, itertools.imap(parse_func, itertools.count(1), raw_iter))
+    first, wrapped_iter = peek(wrapped_iter)
+
+    start_time = first[timekey]
+
+    def keyfunc(record):
+        return int((record[timekey] - start_time).seconds / interval.seconds)
+
+    for i, g in itertools.groupby(wrapped_iter, keyfunc):
+        yield start_time + (interval * i), list(g)
 
+# Dump log parsing
+
 def parse_time(s):
     # Parse times such as 02/Jul/2010:22:21:25 -0400
     return datetime.datetime.strptime(s[:17], '%d/%b/%Y:%H:%M')
@@ -475,11 +437,13 @@ LineFormat = [('addr', str),
               ('refer', str),
               ('agent', str)]
 
-def parse_line(l):
+def parse_dump_log_entry(i, l, error_func=None):
     comps = [c.strip() for c in l.split('|')]
     if len(comps) != len(LineFormat):
-        raise ValueError("Invalid number of log components (%s instead of %s): %s"
-                         % (len(comps), len(LineFormat), l))
+        if error_func:
+            error_func(i, l, "Invalid number of log components (%s instead of %s)" \
+                       % (len(comps), len(LineFormat)))
+        return None
     r = {}
     for (label, f), v in zip(LineFormat, comps):
         try:
@@ -489,36 +453,39 @@ def parse_line(l):
             else:
                 r[label] = f(v)
         except Exception, e:
-            return "Unable to convert %s field '%s': %s (%s)" % (label, v, e, l.strip())
+            if error_func:
+                error_func(i, l, "error converting %s field '%s': %s" % (label, v, e))
+            return None
     return r
 
-def parse_log(path):
-    print 'reading %s' % path
-    lines = [parse_line(l) for l in open(path, 'r')]
-    good_lines = [l for l in lines if isinstance(l, dict)]
-    bad_lines = [l for l in lines if not isinstance(l, dict)]
-    return good_lines, bad_lines
+def default_error_handler(i, line, err):
+    # print "line %s: %s" % (i, err)
+    pass
 
+default_line_parser = functools.partial(parse_dump_log_entry,
+                                        error_func=default_error_handler)
+
+def analyze_dump_log(path,
+                     interval=WindowInterval,
+                     line_parser=default_line_parser,
+                     timekey='ts'):
+    report = DumpReport(interval)
+    with open(path, 'r') as f:
+        for ts, window in yield_log_windows(f, interval, line_parser, timekey):
+            report.record_window(ts, window)
+    return report
 
 if __name__ == "__main__":
-    if len(sys.argv) < 1:
+    if len(sys.argv) == 1:
        print "usage: logreport.py logfile [outfile]"
        sys.exit(1)
 
     logpath = sys.argv[1]
-    outpath = sys.argv[2] if len(sys.argv) > 2 else None
+    outpath = sys.argv[2] if len(sys.argv) > 2 else None
 
     Nickset = fetch_nicks()
-    good_lines, bad_lines = parse_log(logpath)
-    if len(good_lines) == 0:
-        print "Log file is empty or cannot be read!"
-        sys.exit(1)
+    report = analyze_dump_log(logpath)
 
     if outpath:
-        report_title = get_report_title(good_lines)
-        writer = HtmlWriter(outpath, report_title)
+        report.write_report(outpath)
     else:
-        writer = ConsoleWriter()
-
-    report = Report(writer, good_lines, bad_lines)
-    report.output_report()
-    writer.close()
--
cgit v1.2.3-70-g09d2
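
Notes on the rewrite

The core change is yield_log_windows: log lines are parsed lazily
(itertools.imap + ifilter) and the resulting record stream is sliced into
fixed-width time windows with itertools.groupby, so a report is built in one
pass without holding the whole log in memory. A minimal, self-contained
sketch of the same peek/groupby idea (the sample records and five-minute
interval are made up, and this sketch lets StopIteration escape on empty
input where the patch raises ValueError):

    import datetime
    import itertools

    def peek(iterable):
        # Pull off the first item, then rebuild an equivalent iterator.
        first = iterable.next()
        return first, itertools.chain([first], iterable)

    def windows(records, interval):
        first, records = peek(records)
        start = first['ts']

        def bucket(r):
            # Window index relative to the first record. Like the patch, this
            # uses timedelta.seconds, which wraps after 24 hours: fine for a
            # daily log, wrong for a multi-day one.
            return int((r['ts'] - start).seconds / interval.seconds)

        for i, group in itertools.groupby(records, bucket):
            yield start + interval * i, list(group)

    base = datetime.datetime(2010, 7, 2, 22, 0)
    recs = ({'ts': base + datetime.timedelta(minutes=m)} for m in (0, 1, 7, 12))
    for ts, w in windows(recs, datetime.timedelta(minutes=5)):
        print ts, len(w)
    # 2010-07-02 22:00:00 2
    # 2010-07-02 22:05:00 1
    # 2010-07-02 22:10:00 1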
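
DumpReport talks to its writer through a small duck-typed interface, which is
what lets ConsoleWriter and HtmlWriter move out to the writer module. A sketch
of the implied contract (NullWriter is an illustrative name, not something in
the patch):

    class NullWriter(object):
        # The six methods the report code calls on a writer.
        def section(self, label):                # start a named report section
            pass
        def paragraph(self, text):               # free-form text
            pass
        def list(self, entries):                 # sequence of display strings
            pass
        def table(self, headers, data):          # each row: [label, col1, ...]
            pass
        def graph(self, label, graph_id, data):  # data: flot-ready JS array string
            pass
        def close(self):                         # flush output, if any
            pass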
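
build_js_array produces the input flot expects for a time-mode x axis:
Javascript timestamps, i.e. Unix epoch milliseconds. A worked example using
the function as it appears in the patch (the timestamp and value are made up):

    import calendar
    import datetime

    def build_js_array(pairs):
        s_list = []
        for ts, val in pairs:
            # timegm treats the tuple as UTC; * 1000 converts seconds to ms.
            unix_time = calendar.timegm(ts.timetuple()) * 1000
            s_list.append('[%s, %s]' % (unix_time, val))
        return '[%s]' % ','.join(s_list)

    print build_js_array([(datetime.datetime(2010, 7, 8, 0, 0), 3.5)])
    # [[1278547200000, 3.5]]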
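
analyze_dump_log is the new programmatic entry point: it streams the file,
feeds each window to a DumpReport, and returns the report, mirroring what
__main__ now does. Typical use (the log path is hypothetical):

    report = analyze_dump_log('/var/log/dump/access.log')
    report.print_report()               # console output via ConsoleWriter
    report.write_report('report.html')  # HTML report with flot graphs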