Gather URL length statistics.

This commit is contained in:
Bastian Kleineidam 2010-12-15 07:55:00 +01:00
parent 7c55351511
commit f2b8c742fc
2 changed files with 35 additions and 6 deletions

View file

@ -49,8 +49,9 @@ del _
class LogStatistics (object):
"""Gather log statistics:
- number of errors, warnings and valid links
- type of links (image, video, audio, text)
- type of contents (image, video, audio, text, ...)
- number of different domains
- URL lengths
"""
def __init__ (self):
@ -71,9 +72,13 @@ class LogStatistics (object):
video=0,
audio=0,
application=0,
mail=0,
other=0,
unknown=0,
)
self.max_url_length = 0
self.min_url_length = 0
self.avg_url_length = 0.0
self.avg_number = 0
def log_url (self, url_data, do_print):
self.number += 1
@ -90,9 +95,22 @@ class LogStatistics (object):
key = url_data.content_type.split('/', 1)[0].lower()
if key not in self.link_types:
key = "other"
elif url_data.url.startswith(u"mailto:"):
key = "mail"
else:
key = "unknown"
key = "other"
self.link_types[key] += 1
if url_data.url:
l = len(url_data.url)
self.max_url_length = max(l, self.max_url_length)
if self.min_url_length == 0:
self.min_url_length = l
else:
self.min_url_length = min(l, self.min_url_length)
# track average number separately since empty URLs do not count
self.avg_number += 1
# calculate running average
self.avg_url_length += (l - self.avg_url_length) / self.avg_number
class Logger (object):

View file

@ -263,12 +263,23 @@ class TextLogger (Logger):
"duration": strformat.strduration_long(duration)})
def write_stats (self):
    """Write the gathered check statistics to the output.

    Writes an empty line first, then the number of different domains
    (only when more than one was found). When at least one URL was
    counted, the per-content-type counters and the minimum, maximum and
    average URL length follow; otherwise a single notice is written
    saying that no statistics are available.
    """
    self.writeln()
    domains = len(self.stats.domains)
    if domains > 1:
        self.writeln(_("Found %d different domains.") % domains)
    if self.stats.number > 0:
        # link_types is used as the mapping for the named placeholders.
        self.writeln(_(
          "Detected %(image)d image, %(text)d text, %(video)d video, "
          "%(audio)d audio, %(application)d application, %(mail)d mail"
          " and %(other)d other contents.") % self.stats.link_types)
        self.writeln(_("Minimum URL length is %d.") %
                     self.stats.min_url_length)
        self.writeln(_("Maximum URL length is %d.") %
                     self.stats.max_url_length)
        # The average is kept as a float; round it before formatting so
        # that %d does not just cut off the fractional part.
        self.writeln(_("Average URL length is %d.") %
                     round(self.stats.avg_url_length))
    else:
        self.writeln(_("No statistics available since zero URLs were checked."))
def end_output (self):
"""