from __future__ import with_statement
import calendar
import collections
import datetime
import itertools
import locale
import math
import psycopg2
import sys
locale.setlocale(locale.LC_ALL, "")
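# Module-level knobs: Nickset is filled from the database at startup,
# requests slower than SlowReqThreshold (in ms) count as slow, and
# DefaultTimeBucket is the width of each graph bucket.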
Nickset = None
SlowReqThreshold = 1000
DefaultTimeBucket = datetime.timedelta(minutes=5)
ConnStr = "dbname=dumpfm user=postgres password=root"
def fetch_nicks():
try:
print "fetching user nicknames from db"
conn = psycopg2.connect(ConnStr)
cur = conn.cursor()
cur.execute("select nick from users")
return set(n for (n,) in cur.fetchall())
except Exception, e:
print "Error fetching user nicknames: %s" % e
print "Profile urls will be incorrectly classified as 'Other'"
return set()
def comma_format(i):
return locale.format('%d', i, True)
def format_unit(val, units, factors):
for unit, factor in zip(units, factors):
if factor in (0, 1):
return "%s%s" % (val, unit)
elif val >= factor:
return "%0.2f%s" % (float(val) / factor, unit)
    raise ValueError('Cannot format %s with any of %s' % (val, units))
def format_time(t):
return t.strftime('%Y/%m/%d %H:%M')
def format_ms(ms):
return format_unit(ms, ['m', 's', 'ms'], [60 * 1000, 1000, 0])
def format_bytes(bytes):
return format_unit(bytes,
['GB', 'MB', 'KB', 'B'],
[1024 ** i for i in range(3, -1, -1)])
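# For example: format_ms(87) == '87ms', format_ms(1500) == '1.50s',
# and format_bytes(2048) == '2.00KB'.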
def median(pool):
    copy = sorted(pool)
    size = len(copy)
    if size % 2 == 1:
        return copy[(size - 1) / 2]
    else:
        # Average the two middle values; use float division so the
        # midpoint is not silently truncated.
        return (copy[size/2 - 1] + copy[size/2]) / 2.0
def mean(pool):
    return float(sum(pool)) / len(pool)
def percentile(N, percent, key=lambda x:x):
"""
From http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
Find the percentile of a list of values.
@parameter N - is a list of values. Note N MUST BE already sorted.
@parameter percent - a float value from 0.0 to 1.0.
@parameter key - optional key function to compute value from each element of N.
@return - the percentile of the values
"""
if not N:
return None
k = (len(N)-1) * percent
f = math.floor(k)
c = math.ceil(k)
if f == c:
return key(N[int(k)])
    # Interpolate linearly: the floor element is weighted by (c-k),
    # the ceiling element by (k-f), as in the cited recipe.
    d0 = key(N[int(f)]) * (c-k)
    d1 = key(N[int(c)]) * (k-f)
return d0+d1
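# Worked example: percentile([15, 20, 35, 40, 50], .4) gives k = 1.6,
# so the result is 20 * (2 - 1.6) + 35 * (1.6 - 1) = 29.0.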
def ninety_percentile(N):
return percentile(N, .9)
def gather_by_key(rs, key):
d = collections.defaultdict(list)
for r in rs:
group = r[key]
d[group].append(r)
return d
def group_by_time_interval(xs, interval, timekey):
    if len(xs) == 0:
        raise ValueError("Cannot group empty list")
    start = xs[0][timekey]
    end = xs[-1][timekey]
    interval_seconds = int(interval.seconds)
    def elapsed_seconds(td):
        # timedelta.seconds alone wraps at one day; include .days so
        # logs spanning more than 24 hours bucket correctly.
        return td.days * 86400 + td.seconds
    span = elapsed_seconds(end - start) / interval_seconds + 1
    res = [[] for i in range(span)]
    for x in xs:
        idx = elapsed_seconds(x[timekey] - start) / interval_seconds
        res[idx].append(x)
    return res
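# With the default 5-minute bucket, a request arriving 12 minutes after
# the first entry lands in bucket index 12 / 5 = 2.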
def get_report_title(entries):
start_time = entries[0]['ts']
end_time = entries[-1]['ts']
return "Dump Report: " + format_time(start_time) + " to " + format_time(end_time)
def build_js_array(start_time, interval, values):
pairs = []
for i, val in enumerate(values):
t = start_time + interval * i
unix_time = calendar.timegm(t.timetuple()) * 1000
pairs.append('[%s, %s]' % (unix_time, val))
return '[%s]' % ','.join(pairs)
SlowReportCount = 10
IpReportCount = 10
class ConsoleWriter(object):
def section(self, label):
print "\n----- %s -----" % label
def paragraph(self, text):
print text
def list(self, entries):
for e in entries:
print "- %s" % e
def table(self, headers, data):
for row in data:
label = row[0]
if label:
print "\n%s:" % label
for h, d in zip(headers, row[1:]):
print " - %s: %s" % (h, d)
def graph(self, *args, **kwds):
pass
def close(self):
pass
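# Page skeleton for the HTML report: the first slot is the title, the
# second the body. The script paths are an assumption; any copies of
# jQuery and the flot plugin (needed by JqueryTemplate) will do.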
HtmlTemplate = """<html>
<head>
<title>%s</title>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="jquery.flot.js"></script>
</head>
<body>
%s
</body>
</html>
"""
JqueryTemplate = """
$(function () {
$.plot($("#%s"),
[ { data: %s, label: "%s" } ],
{ xaxis: { mode: 'time' },
yaxis: { min: 0 },
legend: { position: 'sw' } });
});
"""
class HtmlWriter(object):
def __init__(self, path, title):
self.path = path
self.title = title
self.content = ""
    def section(self, label):
        self.content += "<h2>%s</h2>\n" % label
    def paragraph(self, text):
        self.content += "<p>%s</p>\n" % text
    def list(self, entries):
        self.content += "<ul>\n"
        for e in entries:
            self.content += "<li>%s</li>\n" % e
        self.content += "</ul>\n"
    def table(self, headers, data):
        # Header row starts with an empty cell above the row-label column.
        self.content += "<table>\n<tr><th></th>"
        for h in headers:
            self.content += "<th>%s</th>" % h
        self.content += "</tr>\n"
        for row in data:
            self.content += "<tr>"
            for i, val in enumerate(row):
                if i == 0:
                    self.content += "<th>%s</th>" % val
                else:
                    self.content += "<td>%s</td>" % val
            self.content += "</tr>\n"
        self.content += "</table>\n"
    def graph(self, label, graph_id, data):
        # flot needs an explicitly sized placeholder div to draw into.
        self.content += '<div id="%s" style="width:600px;height:300px"></div>\n' % graph_id
        js_array = "var %s = %s;" % (graph_id, data)
        graph_command = JqueryTemplate % (graph_id, graph_id, label)
        self.content += '<script type="text/javascript">%s\n%s</script>\n' % (js_array, graph_command)
def _assemble(self):
return HtmlTemplate % (self.title or 'Untitled', self.content)
def close(self):
print "writing report to %s" % self.path
text = self._assemble()
with open(self.path, 'w') as f:
f.write(text)
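# Report renders the parsed entries ("good") through the given writer.
# Unparsable lines ("bad") are kept on the instance but not yet reported.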
class Report(object):
def __init__(self, writer, good, bad):
self.writer = writer
self.good = good
self.bad = bad
self.count = len(good)
self.start = self.good[0]['ts']
self.end = self.good[-1]['ts']
def output_group_stats(self):
self.writer.section('URL group stats')
grouped = gather_by_key(self.good, 'urlgroup')
headers = ['#', 'min', 'median', 'mean', '90%', 'max', '# slow', 'bytes sent', 'bytes/req']
data = []
for url_group in sorted(grouped.keys()):
vals = sorted(r['time'] for r in grouped[url_group])
slow_count = sum(1 for v in vals if v > SlowReqThreshold)
bytes_sent = sum(r['total_bytes'] for r in grouped[url_group])
data.append([url_group,
comma_format(len(vals)),
format_ms(min(vals)),
format_ms(median(vals)),
format_ms(mean(vals)),
format_ms(ninety_percentile(vals)),
format_ms(max(vals)),
slow_count,
format_bytes(bytes_sent),
format_bytes(bytes_sent / len(vals))])
self.writer.table(headers, data)
def output_activity_graphs(self, interval=DefaultTimeBucket):
self.writer.section('Activity Rates')
buckets = group_by_time_interval(self.good, interval, 'ts')
def second_avg(count): return float(count) / interval.seconds
def avg_bucket_rate(bs, cond):
return [second_avg(sum(1 for r in b if cond(r))) for b in bs]
reqs_per_sec = avg_bucket_rate(buckets, lambda r: True)
images_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Images', 'Avatars'))
dumps_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] in ('Post', 'Upload'))
chat_users_per_sec = avg_bucket_rate(buckets, lambda r: r['urlgroup'] == 'ChatUpdate')
self.writer.graph('Reqs / sec', 'reqs_per_sec',
build_js_array(self.start, interval, reqs_per_sec))
self.writer.graph('Images / sec', 'images_per_sec',
build_js_array(self.start, interval, images_per_sec))
self.writer.graph('Dumps / sec', 'dumps_per_sec',
build_js_array(self.start, interval, dumps_per_sec))
self.writer.graph('Chat users / sec', 'chat_users_per_sec',
build_js_array(self.start, interval, chat_users_per_sec))
self.writer.section('Response Time')
def calc_bucket_resp_time(bs, func):
return [func([r['time'] for r in b]) for b in bs]
mean_resp_time = calc_bucket_resp_time(buckets, mean)
median_resp_time = calc_bucket_resp_time(buckets, median)
ninety_pct_time = calc_bucket_resp_time(buckets, ninety_percentile)
max_resp_time = calc_bucket_resp_time(buckets, max)
self.writer.graph('Mean response time (ms)', 'mean_resp_time',
build_js_array(self.start, interval, mean_resp_time))
        self.writer.graph('Median response time (ms)', 'median_resp_time',
build_js_array(self.start, interval, median_resp_time))
self.writer.graph('90% response time (ms)', 'ninety_pct_time',
build_js_array(self.start, interval, ninety_pct_time))
self.writer.graph('Max response time (ms)', 'max_resp_time',
build_js_array(self.start, interval, max_resp_time))
def output_slow_requests(self):
self.writer.section('Slow URLs')
        slow = sorted((r for r in self.good if r['time'] > SlowReqThreshold),
                      key=lambda r: r['time'], reverse=True)
entries = []
for r in slow[:SlowReportCount]:
e = "%s: %s, %s sent" % (r['url'],
format_ms(r['time']),
format_bytes(r['total_bytes']))
entries.append(e)
self.writer.list(entries)
def output_unclassified_urls(self):
unclassified = set(g['url'] for g in self.good if g['urlgroup'] == 'Other')
unclassified_str = ', '.join(sorted(unclassified)[:100])
if len(unclassified) > 100:
unclassified_str += '...'
self.writer.section('Unclassified URLs (%s)' % len(unclassified))
self.writer.paragraph(unclassified_str)
    def output_ip_usage(self):
        grouped = gather_by_key(self.good, 'addr')
        sum_bytes = lambda reqs: sum(r['total_bytes'] for r in reqs)
        addr_bytes_gen = reversed(sorted((sum_bytes(reqs), addr) for addr, reqs in grouped.iteritems()))
        self.writer.section('Heaviest Users by IP')
        l = []
        for total_bytes, addr in itertools.islice(addr_bytes_gen, IpReportCount):
            # The sort key already holds the byte total; the request count
            # comes from the group itself.
            l.append("%s: %s reqs, %s sent" % (addr,
                                               comma_format(len(grouped[addr])),
                                               format_bytes(total_bytes)))
        self.writer.list(l)
    def output_report(self):
        duration = self.end - self.start
        duration_seconds = duration.days * 86400 + duration.seconds
        l = ["Interval: %s EST to %s EST (%s)" % (format_time(self.start),
                                                  format_time(self.end),
                                                  duration),
             "%s Total Requests (%.02f r/s)" \
                 % (comma_format(self.count), float(self.count) / duration_seconds),
             "Bytes sent %s" % format_bytes(sum(r['total_bytes'] for r in self.good))]
self.writer.section('Overview')
self.writer.list(l)
self.output_activity_graphs()
self.output_group_stats()
self.output_slow_requests()
self.output_ip_usage()
self.output_unclassified_urls()
def parse_time(s):
# Parse times such as 02/Jul/2010:22:21:25 -0400
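    # s[:17] keeps minute precision; seconds and the utc offset are dropped.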
return datetime.datetime.strptime(s[:17], '%d/%b/%Y:%H:%M')
InfoUrls = ['/about_us', '/privacy', '/help', '/search', '/terms', '/hall',
            '/register', '/reset-request']
StaticUrls = ['/favicon.ico', '/robots.txt', '/']
def _classify_profile_url(url, nickset):
    if url.startswith('/u/'):
        # Turn '/u/nick/...' into '/nick/...'.
        url = url[2:]
    comps = url.split('/')
    # A leading-slash url splits with an empty first component; drop it.
    comps.remove('')
if len(comps) == 0 or comps[0] not in nickset:
return None
else:
if len(comps) == 1:
return 'Profile'
elif comps[1] == 'popular':
return 'Popular'
elif comps[1] == 'favorites':
return 'Favorites'
elif comps[1] == 'log':
return 'Log'
else:
return 'Profile'
def classify_url(method, url):
profile_classification = _classify_profile_url(url, Nickset)
if profile_classification:
return profile_classification
elif url.startswith('/avatars/'):
return 'Avatars'
elif url.startswith('/refresh'):
return 'ChatUpdate'
elif url.startswith('/chat') or url.startswith('/browser'):
return 'ChatLoad'
elif url.startswith('/json/') and url.endswith('/favorites'):
return 'JsonFavs'
elif url.startswith('/log'):
return 'RoomLog'
elif url.startswith('/login') or url.startswith('/logout'):
return 'Login'
elif url in InfoUrls or url.startswith('/m/') or url.startswith('/directory'):
return 'InfoPages'
elif url == '/msg':
return 'Post'
elif url == '/submit-registration':
return 'Register'
elif url.startswith('/cmd/ghettosearch/'):
return 'Search'
elif url.startswith('/static/') or url in StaticUrls:
return 'Static'
elif url.startswith('/images/'):
return 'Images'
elif url == '/cmd/tag/add':
return 'TagAdd'
elif url == '/cmd/tag/rm':
return 'TagRemove'
elif url == '/upload/message':
return 'Upload'
elif url.startswith('/p/'):
return 'ViewDump'
else:
return 'Other'
def parse_ms(ms):
    # The log records request time in seconds; convert to whole milliseconds.
    return int(float(ms) * 1000)
def parse_request(s):
method, url, httpversion = s.split(' ')
url_group = classify_url(method, url)
return {'method': method,
'httpversion': httpversion,
'urlgroup': url_group,
'url': url }
def parse_status(s):
code = int(s)
# This works because statuses are parsed after urls.
if code == 302:
return {'code': 302, 'urlgroup': 'Redirect'}
elif code == 404:
return {'code': 404, 'urlgroup': '404'}
elif code >= 500:
return {'code': code, 'urlgroup': 'Error'}
else:
return code
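# Each entry in LineFormat pairs a field name with its parser; a parser
# that returns a dict (parse_request, parse_status) contributes several
# fields at once instead of a single value.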
LineFormat = [('addr', str),
('ts', parse_time),
('request', parse_request),
('pipe', str),
('status', parse_status),
('time', parse_ms),
('total_bytes', int),
('body_bytes', int),
('refer', str),
('agent', str)]
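# A hypothetical line in this '|'-delimited format (not from a real log):
# 1.2.3.4 | 02/Jul/2010:22:21:25 -0400 | GET /msg HTTP/1.1 | . | 200 | 0.123 | 4523 | 4100 | http://dump.fm/ | Mozilla/5.0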
def parse_line(l):
comps = [c.strip() for c in l.split('|')]
if len(comps) != len(LineFormat):
raise ValueError("Invalid number of log components (%s instead of %s): %s"
% (len(comps), len(LineFormat), l))
r = {}
for (label, f), v in zip(LineFormat, comps):
try:
res = f(v)
            if isinstance(res, dict):
                r.update(res)
            else:
                r[label] = res
except Exception, e:
return "Unable to convert %s field '%s': %s (%s)" % (label, v, e, l.strip())
return r
def parse_log(path):
    print 'reading %s' % path
    with open(path, 'r') as f:
        lines = [parse_line(l) for l in f]
    good_lines = [l for l in lines if isinstance(l, dict)]
    bad_lines = [l for l in lines if not isinstance(l, dict)]
    return good_lines, bad_lines
if __name__ == "__main__":
    if len(sys.argv) < 2:
print "usage: logreport.py logfile [outfile]"
sys.exit(1)
logpath = sys.argv[1]
outpath = sys.argv[2] if len(sys.argv) > 2 else None
Nickset = fetch_nicks()
good_lines, bad_lines = parse_log(logpath)
if len(good_lines) == 0:
print "Log file is empty or cannot be read!"
sys.exit(1)
if outpath:
report_title = get_report_title(good_lines)
writer = HtmlWriter(outpath, report_title)
else:
writer = ConsoleWriter()
report = Report(writer, good_lines, bad_lines)
report.output_report()
writer.close()