Diffstat (limited to 'scripts')
-rw-r--r-- scripts/logreport.py | 469
-rw-r--r-- scripts/writer.py    | 108
2 files changed, 326 insertions(+), 251 deletions(-)
diff --git a/scripts/logreport.py b/scripts/logreport.py
index 6f4799c..c2d7712 100644
--- a/scripts/logreport.py
+++ b/scripts/logreport.py
@@ -11,11 +11,13 @@ import psycopg2
import sys
import time
+import functools
+from writer import ConsoleWriter, HtmlWriter
+
locale.setlocale(locale.LC_ALL, "")
Nickset = None
SlowReqThreshold = 1000
-DefaultTimeBucket = datetime.timedelta(minutes=5)
+WindowInterval = datetime.timedelta(minutes=5)
ConnStr = "dbname=dumpfm user=postgres password=root"
def fetch_nicks():
@@ -52,6 +54,9 @@ def format_bytes(bytes):
['GB', 'MB', 'KB', 'B'],
[1024 ** i for i in range(3, -1, -1)])
+def format_pct(n):
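+    # e.g. format_pct(0.125) -> '12.50%'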
+ return '%.02f%%' % (100 * n)
+
def median(pool):
copy = sorted(pool)
size = len(copy)
@@ -115,265 +120,222 @@ def get_report_title(entries):
end_time = entries[-1]['ts']
return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time)
-def build_js_array(start_time, interval, values):
- pairs = []
- for i, val in enumerate(values):
- t = start_time + interval * i
- unix_time = calendar.timegm(t.timetuple()) * 1000
- pairs.append('[%s, %s]' % (unix_time, val))
- return '[%s]' % ','.join(pairs)
-
-SlowReportCount = 10
-IpReportCount = 10
-
-class ConsoleWriter(object):
-
- def section(self, label):
- print "\n----- %s -----" % label
-
- def paragraph(self, text):
- print text
-
- def list(self, entries):
- for e in entries:
- print "- %s" % e
-
- def table(self, headers, data):
- for row in data:
- label = row[0]
- if label:
- print "\n%s:" % label
- for h, d in zip(headers, row[1:]):
- print " - %s: %s" % (h, d)
+def build_js_array(pairs):
+ s_list = []
+ for ts, val in pairs:
+ unix_time = calendar.timegm(ts.timetuple()) * 1000
+ s_list.append('[%s, %s]' % (unix_time, val))
+ return '[%s]' % ','.join(s_list)
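+# Example with made-up data (timestamps are treated as UTC):
+#   build_js_array([(datetime.datetime(2010, 7, 2, 22, 0), 3.2)])
+#   -> '[[1278108000000, 3.2]]'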
- def graph(self, *args, **kwds):
- pass
+def summarize_window(ts, window, slow_threshold):
+ reqs = 0.0
+ dumps = 0.0
+ images = 0.0
+ tags = 0.0
+ chat_users = 0.0
+ slow = 0
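+    # counters are floats so the per-second rates computed later divide
+    # correctly under Python 2 integer division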
- def close(self):
- pass
+ for r in window:
+ reqs += 1
+ group = r['urlgroup']
+ if group in ('Images', 'Avatars'):
+ images += 1
+ elif group in ('Post', 'Upload'):
+ dumps += 1
+ elif group in ('TagAdd', 'TagRemove'):
+ tags += 1
+ elif group == 'ChatUpdate':
+ chat_users += 1
+ if r['time'] > slow_threshold:
+ slow += 1
-HtmlTemplate = """
-<html>
-<head>
-<title>%s</title>
-<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>
-<script type="text/javascript" src="http://dump.fm/static/js/jquery.flot.js"></script>
-<style>
-table {
- margin: 0;
- cellpadding: 1px;
-}
+ times = [r['time'] for r in window]
+ return { 'ts': ts,
+ 'reqs': reqs,
+ 'dumps': dumps,
+ 'images': images,
+ 'tags': tags,
+ 'chat_users': chat_users,
+ 'slow': slow,
+ 'mean_resp': mean(times),
+ 'median_resp': median(times),
+ 'ninety_resp': ninety_percentile(times),
+ 'max_resp': max(times)
+ }
-th { text-align: left; }
+ReportSetLimit = 100
-tr:hover { background-color: #AFF; }
+class DumpReport(object):
+
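+    # Accumulates running totals plus one summary dict per time window, so a
+    # large log can be reported in a single streaming pass.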
+ def __init__(self, interval, slow_thresh=SlowReqThreshold):
+ self.windows = []
+ self.interval = interval
+ self.bytes_sent = 0
+ self.requests = 0
+ self.dumps = 0
+ self.slow_requests = 0
+ self.new_users = 0
+ self.unclassified_urls = set()
+ self.error_urls = set()
+ self.start_time = None
+ self.end_time = None
+ self.urlgroup_stats = {}
+ self.slow_request_threshold = slow_thresh
-</style>
-</head>
-<body>%s</body>
-</html>
-"""
+    def record_window(self, ts, window):
+        # guard: summarize_window calls max() and would fail on an empty list
+        if not window:
+            return
+        if self.start_time is None:
+            self.start_time = window[0]['ts']
+        window_end_time = window[-1]['ts']
+        if self.end_time:
+            self.end_time = max(self.end_time, window_end_time)
+        else:
+            self.end_time = window_end_time
-JqueryTemplate = """
-$(function () {
- $.plot($("#%s"),
- [ { data: %s, label: "%s" } ],
- { xaxis: { mode: 'time' },
- yaxis: { min: 0 },
- legend: { position: 'sw' } });
-});
+ # TODO: consolidate summary logic
+ window_summary = summarize_window(ts, window, self.slow_request_threshold)
+ self.windows.append(window_summary)
-"""
+ for record in window:
+ self.bytes_sent += record['total_bytes']
+ self.requests += 1
+
+ if record['urlgroup'] == 'Other' and len(self.unclassified_urls) < ReportSetLimit:
+ self.unclassified_urls.add(record['url'])
-class HtmlWriter(object):
-
- def __init__(self, path, title):
- self.path = path
- self.title = title
- self.content = ""
-
- def section(self, label):
- self.content += "<h2>%s</h2>\n" % label
+ if record['urlgroup'] == 'Error' and len(self.error_urls) < ReportSetLimit:
+ self.error_urls.add(record['url'])
- def paragraph(self, text):
- self.content += "<p>%s</p>\n" % text
+ r_time = record['time']
+ group = record['urlgroup']
+
+ if group in ('Post', 'Upload'):
+ self.dumps += 1
+ elif group == 'Register':
+ self.new_users += 1
- def list(self, entries):
- self.content += "<ul>\n"
- for e in entries:
- self.content += "<li>%s</li>\n" % e
- self.content += "</ul>\n"
+ if group not in self.urlgroup_stats:
+ stats = self.urlgroup_stats[group] = { 'min': r_time,
+ 'max': r_time,
+ 'time': 0,
+ 'slow': 0,
+ 'requests': 0,
+ 'bytes': 0 }
+ else:
+ stats = self.urlgroup_stats[group]
+ stats['min'] = min(stats['min'], r_time)
+ stats['max'] = max(stats['max'], r_time)
- def table(self, headers, data):
- self.content += "<table><tr><th></th>"
- for h in headers:
- self.content += "<th>%s</th>" % h
- self.content += "</tr>\n"
- for row in data:
- self.content += "<tr>"
-
- for i, val in enumerate(row):
- if i == 0:
- self.content += "<th>%s</th>" % val
- else:
- self.content += "<td>%s</td>" % val
- self.content += "</tr>\n"
- self.content += "</table>\n"
+ stats['requests'] += 1
+ stats['bytes'] += record['total_bytes']
+ stats['time'] += r_time
+ if r_time > self.slow_request_threshold:
+ self.slow_requests += 1
+ stats['slow'] += 1
- def graph(self, label, graph_id, data):
- self.content += '<div id="%s" style="width: 1000px; height: 350px;"></div>' % graph_id
- js_array = "var %s = %s;" % (graph_id, data)
- graph_command = JqueryTemplate % (graph_id, graph_id, label)
- self.content += '<script>%s\n%s</script>' % (js_array, graph_command)
-
- def _assemble(self):
- return HtmlTemplate % (self.title or 'Untitled', self.content)
+ def _output_errors_and_unclassified(self, writer):
+ if len(self.error_urls) > 0:
+ writer.section('Error URLs (%s)' % len(self.error_urls))
+ writer.paragraph(', '.join(sorted(self.error_urls)))
- def close(self):
- print "writing report to %s" % self.path
- text = self._assemble()
- with open(self.path, 'w') as f:
- f.write(text)
-
+ if len(self.unclassified_urls) > 0:
+ writer.section('Unclassified URLs (%s)' % len(self.unclassified_urls))
+ writer.paragraph(', '.join(sorted(self.unclassified_urls)))
-class Report(object):
- def __init__(self, writer, good, bad):
- self.writer = writer
- self.good = good
- self.bad = bad
- self.count = len(good)
- self.start = self.good[0]['ts']
- self.end = self.good[-1]['ts']
-
- def output_group_stats(self):
- self.writer.section('URL group stats')
- grouped = gather_by_key(self.good, 'urlgroup')
- headers = ['#', 'min', 'median', 'mean', '90%', 'max', '# slow', '% slow',
- 'bytes sent', 'bytes/req']
+ def _output_urlgroups(self, writer):
+        headers = ['#', 'min', 'mean', 'max', '# slow', '% slow', 'bytes', 'bytes/req']
data = []
-
- for url_group in sorted(grouped.keys()):
- vals = sorted(r['time'] for r in grouped[url_group])
- slow_count = sum(1 for v in vals if v > SlowReqThreshold)
- bytes_sent = sum(r['total_bytes'] for r in grouped[url_group])
- data.append([url_group,
- comma_format(len(vals)),
- format_ms(min(vals)),
- format_ms(median(vals)),
- format_ms(mean(vals)),
- format_ms(ninety_percentile(vals)),
- format_ms(max(vals)),
- slow_count,
- '%.01f%%' % (100 * slow_count / len(vals)),
- format_bytes(bytes_sent),
- format_bytes(bytes_sent / len(vals))])
- self.writer.table(headers, data)
+ for group in sorted(self.urlgroup_stats.keys()):
+ stats = self.urlgroup_stats[group]
+            avg_time = float(stats['time']) / stats['requests']
+ data.append([group,
+ comma_format(stats['requests']),
+ format_ms(stats['min']),
+ format_ms(avg_time),
+ format_ms(stats['max']),
+ stats['slow'],
+ format_pct(float(stats['slow']) / stats['requests']),
+ format_bytes(stats['bytes']),
+ format_bytes(stats['bytes'] / stats['requests'])])
+ writer.table(headers, data)
- def output_activity_graphs(self, interval=DefaultTimeBucket):
- self.writer.section('Activity Rates')
- buckets = group_by_time_interval(self.good, interval, 'ts')
- def second_avg(count): return float(count) / interval.seconds
-
- def avg_bucket_rate(bs, cond):
- return [second_avg(sum(1 for r in b if cond(r))) for b in bs]
+ def _output_usage_charts(self, writer):
+ writer.section('Activity Rates')
+ writer.graph('Reqs/sec', 'reqs_per_sec',
+ build_js_array((r['ts'], r['reqs'] / self.interval.seconds) for r in self.windows))
+ writer.graph('Images/sec', 'images_per_sec',
+ build_js_array((r['ts'], r['images'] / self.interval.seconds) for r in self.windows))
+ writer.graph('Dumps/sec', 'dumps_per_sec',
+ build_js_array((r['ts'], r['dumps'] / self.interval.seconds) for r in self.windows))
+ writer.graph('Tags/sec', 'tags_per_sec',
+ build_js_array((r['ts'], r['tags'] / self.interval.seconds) for r in self.windows))
+ writer.graph('Chat users/sec', 'users_per_sec',
+ build_js_array((r['ts'], r['chat_users'] / self.interval.seconds) for r in self.windows))
- reqs_per_sec = avg_bucket_rate(buckets, lambda r: True)
- images_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Images', 'Avatars'))
- dumps_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Post', 'Upload'))
- tags_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('TagAdd', 'TagRemove'))
- chat_users_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] == 'ChatUpdate')
+ writer.section('Response Rates')
+ writer.graph('Mean response time', 'mean_resp',
+ build_js_array((r['ts'], r['mean_resp']) for r in self.windows))
+ writer.graph('Median response time', 'median_resp',
+ build_js_array((r['ts'], r['median_resp']) for r in self.windows))
+ writer.graph('90% response time', 'ninety_resp',
+ build_js_array((r['ts'], r['ninety_resp']) for r in self.windows))
+ writer.graph('Max response time', 'max_resp',
+ build_js_array((r['ts'], r['max_resp']) for r in self.windows))
- self.writer.graph('Reqs / sec', 'reqs_per_sec',
- build_js_array(self.start, interval, reqs_per_sec))
- self.writer.graph('Images / sec', 'images_per_sec',
- build_js_array(self.start, interval, images_per_sec))
- self.writer.graph('Dumps / sec', 'dumps_per_sec',
- build_js_array(self.start, interval, dumps_per_sec))
- self.writer.graph('Tags / sec', 'tags_per_sec',
- build_js_array(self.start, interval, tags_per_sec))
- self.writer.graph('Chat users / sec', 'chat_users_per_sec',
- build_js_array(self.start, interval, chat_users_per_sec))
- self.writer.section('Response Time')
-
- def calc_bucket_resp_time(bs, func):
- return [func([r['time'] for r in b]) for b in bs]
+ def _output_report(self, writer):
+ writer.section('Overview')
+        duration = self.end_time - self.start_time
+        # timedelta.seconds ignores whole days; compute total seconds directly
+        duration_secs = duration.days * 86400 + duration.seconds
+ summary = [
+ '%s EST to %s EST (%s)' % (format_time(self.start_time),
+ format_time(self.end_time),
+ duration),
+ '%s requests (%.02f r/s)' % (comma_format(self.requests),
+                                         float(self.requests) / max(duration_secs, 1)),
+ '%s slow requests (%s)' % (self.slow_requests,
+ format_pct(float(self.slow_requests) / self.requests)),
+ '%s bytes sent' % format_bytes(self.bytes_sent)]
+ writer.list(summary)
+ self._output_usage_charts(writer)
+ self._output_urlgroups(writer)
+ self._output_errors_and_unclassified(writer)
- mean_resp_time = calc_bucket_resp_time(buckets, mean)
- median_resp_time = calc_bucket_resp_time(buckets, median)
- ninety_pct_time = calc_bucket_resp_time(buckets, ninety_percentile)
- max_resp_time = calc_bucket_resp_time(buckets, max)
+ def write_report(self, path):
+ writer = HtmlWriter(path, 'Dump Report')
+ self._output_report(writer)
+ writer.close()
+
+ def print_report(self):
+ writer = ConsoleWriter()
+ self._output_report(writer)
- self.writer.graph('Mean response time (ms)', 'mean_resp_time',
- build_js_array(self.start, interval, mean_resp_time))
- self.writer.graph('Median response time (ms)', 'median_per_sec',
- build_js_array(self.start, interval, median_resp_time))
- self.writer.graph('90% response time (ms)', 'ninety_pct_time',
- build_js_array(self.start, interval, ninety_pct_time))
- self.writer.graph('Max response time (ms)', 'max_resp_time',
- build_js_array(self.start, interval, max_resp_time))
-
+def peek(iterable):
+    try:
+        first = iterable.next()
+    except StopIteration:
+        raise ValueError('cannot peek on empty iterator')
+    return first, itertools.chain([first], iterable)
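+# e.g. first, stream = peek(iter([1, 2])); list(stream) -> [1, 2]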
- def output_slow_requests(self):
- self.writer.section('Slow URLs')
- slow = sorted((r for r in self.good if r['time'] > SlowReqThreshold),
- key=lambda r: r['time'] * -1)
- entries = []
- for r in slow[:SlowReportCount]:
- e = "%s: %s, %s sent" % (r['url'],
- format_ms(r['time']),
- format_bytes(r['total_bytes']))
- entries.append(e)
- self.writer.list(entries)
- def output_unclassified_urls(self):
- unclassified = set(g['url'] for g in self.good if g['urlgroup'] == 'Other')
- unclassified_str = ', '.join(sorted(unclassified)[:100])
- if len(unclassified) > 100:
- unclassified_str += '...'
- self.writer.section('Unclassified URLs (%s)' % len(unclassified))
- self.writer.paragraph(unclassified_str)
+def yield_log_windows(raw_iter, interval, parse_func, timekey):
+ wrapped_iter = itertools.ifilter(None, itertools.imap(parse_func, itertools.count(1), raw_iter))
+ first, wrapped_iter = peek(wrapped_iter)
- def output_errors(self):
- errors = set(g['url'] for g in self.good if g['urlgroup'] == 'Error')
- error_str = ', '.join(sorted(errors)[:100])
- self.writer.section('Error Urls (%s)' % len(errors))
- self.writer.paragraph(error_str)
+ start_time = first[timekey]
+
+    def keyfunc(record):
+        # include .days so logs spanning more than a day bucket correctly
+        delta = record[timekey] - start_time
+        return (delta.days * 86400 + delta.seconds) / interval.seconds
- def output_ip_usage(self):
- grouped = gather_by_key(self.good, 'addr')
- sum_bytes = lambda reqs: sum(r['total_bytes'] for r in reqs)
- addr_bytes_gen = reversed(sorted((sum_bytes(reqs), addr) for addr, reqs in grouped.iteritems()))
- self.writer.section('Heaviest Users by IP')
-
- l = []
- for count, addr in itertools.islice(addr_bytes_gen, IpReportCount):
- bytes = sum(r['total_bytes'] for r in grouped[addr])
- l.append("%s: %s reqs, %s sent" % (addr,
- comma_format(count),
- format_bytes(bytes)))
- self.writer.list(l)
+ for i, g in itertools.groupby(wrapped_iter, keyfunc):
+ yield start_time + (interval * i), list(g)
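+# Sketch of intended use (filename is illustrative):
+#   with open('access.log') as f:
+#       for ts, window in yield_log_windows(f, WindowInterval,
+#                                           default_line_parser, 'ts'):
+#           process(ts, window)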
- def output_report(self):
- l = ["Interval: %s EST to %s EST (%s)" % (format_time(self.start),
- format_time(self.end),
- self.end - self.start),
- "%s Total Requests (%.02f r/s)" \
- % (comma_format(self.count), self.count / (self.end - self.start).seconds),
- "Bytes sent %s" % format_bytes(sum(r['total_bytes'] for r in self.good))]
- self.writer.section('Overview')
- self.writer.list(l)
- self.output_activity_graphs()
- self.output_group_stats()
- self.output_slow_requests()
- self.output_ip_usage()
- self.output_errors()
- self.output_unclassified_urls()
+# Dump log parsing
def parse_time(s):
# Parse times such as 02/Jul/2010:22:21:25 -0400
@@ -475,11 +437,13 @@ LineFormat = [('addr', str),
('refer', str),
('agent', str)]
-def parse_line(l):
+def parse_dump_log_entry(i, l, error_func=None):
comps = [c.strip() for c in l.split('|')]
if len(comps) != len(LineFormat):
- raise ValueError("Invalid number of log components (%s instead of %s): %s"
- % (len(comps), len(LineFormat), l))
+ if error_func:
+ error_func(i, l, "Invalid number of log components (%s instead of %s)" \
+ % (len(comps), len(LineFormat)))
+ return None
r = {}
for (label, f), v in zip(LineFormat, comps):
try:
@@ -489,36 +453,39 @@ def parse_line(l):
else:
r[label] = f(v)
except Exception, e:
- return "Unable to convert %s field '%s': %s (%s)" % (label, v, e, l.strip())
+ if error_func:
+ error_func(i, l, "error converting %s field '%s': %s" % (label, v, e))
+ return None
return r
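+# Note: parse_dump_log_entry returns None (after calling error_func, if
+# supplied) for malformed lines, so callers can strip failures with
+# itertools.ifilter(None, ...).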
-def parse_log(path):
- print 'reading %s' % path
- lines = [parse_line(l) for l in open(path, 'r')]
- good_lines = [l for l in lines if isinstance(l, dict)]
- bad_lines = [l for l in lines if not isinstance(l, dict)]
- return good_lines, bad_lines
+def default_error_handler(i, line, err):
+    # malformed lines are dropped silently; uncomment to log them
+    # print "line %s: %s" % (i, err)
+    pass
+default_line_parser = functools.partial(parse_dump_log_entry,
+ error_func=default_error_handler)
+
+def analyze_dump_log(path,
+ interval=WindowInterval,
+ line_parser=default_line_parser,
+ timekey='ts'):
+ report = DumpReport(interval)
+ with open(path, 'r') as f:
+ for ts, window in yield_log_windows(f, interval, line_parser, timekey):
+ report.record_window(ts, window)
+ return report
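+# Typical use (paths are illustrative):
+#   report = analyze_dump_log('/var/log/dump/access.log')
+#   report.write_report('/tmp/report.html')  # or report.print_report()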
if __name__ == "__main__":
- if len(sys.argv) < 1:
+ if len(sys.argv) == 1:
print "usage: logreport.py logfile [outfile]"
sys.exit(1)
logpath = sys.argv[1]
- outpath = sys.argv[2] if len(sys.argv) > 2 else None
+ outpath = sys.argv[2] if len(sys.argv) > 2 else None
Nickset = fetch_nicks()
- good_lines, bad_lines = parse_log(logpath)
- if len(good_lines) == 0:
- print "Log file is empty or cannot be read!"
- sys.exit(1)
+    try:
+        report = analyze_dump_log(logpath)
+    except ValueError:
+        print "Log file is empty or cannot be read!"
+        sys.exit(1)
if outpath:
- report_title = get_report_title(good_lines)
- writer = HtmlWriter(outpath, report_title)
+ report.write_report(outpath)
else:
- writer = ConsoleWriter()
-
- report = Report(writer, good_lines, bad_lines)
- report.output_report()
- writer.close()
+ report.print_report()
diff --git a/scripts/writer.py b/scripts/writer.py
new file mode 100644
index 0000000..df36b50
--- /dev/null
+++ b/scripts/writer.py
@@ -0,0 +1,108 @@
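+# Output sinks for log reports.  ConsoleWriter and HtmlWriter expose the
+# same interface (section, paragraph, list, table, graph, close), so report
+# code can target either one.
+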
+class ConsoleWriter(object):
+
+ def section(self, label):
+ print "\n----- %s -----" % label
+
+ def paragraph(self, text):
+ print text
+
+ def list(self, entries):
+ for e in entries:
+ print "- %s" % e
+
+ def table(self, headers, data):
+ for row in data:
+ label = row[0]
+ if label:
+ print "\n%s:" % label
+ for h, d in zip(headers, row[1:]):
+ print " - %s: %s" % (h, d)
+
+ def graph(self, *args, **kwds):
+ pass
+
+ def close(self):
+ pass
+
+HtmlTemplate = """
+<html>
+<head>
+<title>%s</title>
+<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>
+<script type="text/javascript" src="http://dump.fm/static/js/jquery.flot.js"></script>
+<style>
+table { margin: 0; }
+
+td, th { padding: 1px; }  /* 'cellpadding' is an HTML attribute, not CSS */
+
+th { text-align: left; }
+
+tr:hover { background-color: #AFF; }
+
+</style>
+</head>
+<body>%s</body>
+</html>
+"""
+
+JqueryTemplate = """
+$(function () {
+ $.plot($("#%s"),
+ [ { data: %s, label: "%s" } ],
+ { xaxis: { mode: 'time' },
+ yaxis: { min: 0 },
+ legend: { position: 'sw' } });
+});
+
+"""
+
+class HtmlWriter(object):
+
+ def __init__(self, path, title):
+ self.path = path
+ self.title = title
+ self.content = ""
+
+ def section(self, label):
+ self.content += "<h2>%s</h2>\n" % label
+
+ def paragraph(self, text):
+ self.content += "<p>%s</p>\n" % text
+
+ def list(self, entries):
+ self.content += "<ul>\n"
+ for e in entries:
+ self.content += "<li>%s</li>\n" % e
+ self.content += "</ul>\n"
+
+ def table(self, headers, data):
+ self.content += "<table><tr><th></th>"
+ for h in headers:
+ self.content += "<th>%s</th>" % h
+ self.content += "</tr>\n"
+ for row in data:
+ self.content += "<tr>"
+
+ for i, val in enumerate(row):
+ if i == 0:
+ self.content += "<th>%s</th>" % val
+ else:
+ self.content += "<td>%s</td>" % val
+ self.content += "</tr>\n"
+ self.content += "</table>\n"
+
+ def graph(self, label, graph_id, data):
+ self.content += '<div id="%s" style="width: 1000px; height: 350px;"></div>' % graph_id
+ js_array = "var %s = %s;" % (graph_id, data)
+ graph_command = JqueryTemplate % (graph_id, graph_id, label)
+ self.content += '<script>%s\n%s</script>' % (js_array, graph_command)
+
+ def _assemble(self):
+ return HtmlTemplate % (self.title or 'Untitled', self.content)
+
+ def close(self):
+ print "writing report to %s" % self.path
+ text = self._assemble()
+ with open(self.path, 'w') as f:
+ f.write(text)
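+
+# Typical use (illustrative):
+#   w = HtmlWriter('report.html', 'Dump Report')
+#   w.section('Overview')
+#   w.list(['1,234 requests', '12 slow requests'])
+#   w.close()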