mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-23 07:34:44 +00:00
Gather URL length statistics.
This commit is contained in:
parent
7c55351511
commit
f2b8c742fc
2 changed files with 35 additions and 6 deletions
|
|
@@ -49,8 +49,9 @@ del _
|
|||
class LogStatistics (object):
|
||||
"""Gather log statistics:
|
||||
- number of errors, warnings and valid links
|
||||
- type of links (image, video, audio, text)
|
||||
- type of contents (image, video, audio, text, ...)
|
||||
- number of different domains
|
||||
- URL lengths
|
||||
"""
|
||||
|
||||
def __init__ (self):
|
||||
|
|
@@ -71,9 +72,13 @@ class LogStatistics (object):
|
|||
video=0,
|
||||
audio=0,
|
||||
application=0,
|
||||
mail=0,
|
||||
other=0,
|
||||
unknown=0,
|
||||
)
|
||||
self.max_url_length = 0
|
||||
self.min_url_length = 0
|
||||
self.avg_url_length = 0.0
|
||||
self.avg_number = 0
|
||||
|
||||
def log_url (self, url_data, do_print):
|
||||
self.number += 1
|
||||
|
|
@@ -90,9 +95,22 @@ class LogStatistics (object):
|
|||
key = url_data.content_type.split('/', 1)[0].lower()
|
||||
if key not in self.link_types:
|
||||
key = "other"
|
||||
elif url_data.url.startswith(u"mailto:"):
|
||||
key = "mail"
|
||||
else:
|
||||
key = "unknown"
|
||||
key = "other"
|
||||
self.link_types[key] += 1
|
||||
if url_data.url:
|
||||
l = len(url_data.url)
|
||||
self.max_url_length = max(l, self.max_url_length)
|
||||
if self.min_url_length == 0:
|
||||
self.min_url_length = l
|
||||
else:
|
||||
self.min_url_length = min(l, self.min_url_length)
|
||||
# track average number separately since empty URLs do not count
|
||||
self.avg_number += 1
|
||||
# calculate running average
|
||||
self.avg_url_length += (l - self.avg_url_length) / self.avg_number
|
||||
|
||||
|
||||
class Logger (object):
|
||||
|
|
|
|||
|
|
@@ -263,12 +263,23 @@ class TextLogger (Logger):
|
|||
"duration": strformat.strduration_long(duration)})
|
||||
|
||||
def write_stats (self):
|
||||
self.writeln()
|
||||
domains = len(self.stats.domains)
|
||||
if domains > 1:
|
||||
self.writeln(_("Found %d different domains.") % domains)
|
||||
self.writeln(_("Found %(image)d image, %(text)d text, %(video)d video, "
|
||||
"%(audio)d audio, %(application)d application, %(other)d other"
|
||||
" and %(unknown)d unknown URLs.") % self.stats.link_types)
|
||||
if self.stats.number > 0:
|
||||
self.writeln(_(
|
||||
"Detected %(image)d image, %(text)d text, %(video)d video, "
|
||||
"%(audio)d audio, %(application)d application, %(mail)d mail"
|
||||
" and %(other)d other contents.") % self.stats.link_types)
|
||||
self.writeln(_("Minimum URL length is %d.") %
|
||||
self.stats.min_url_length)
|
||||
self.writeln(_("Maximum URL length is %d.") %
|
||||
self.stats.max_url_length)
|
||||
self.writeln(_("Average URL length is %d.") %
|
||||
self.stats.avg_url_length)
|
||||
else:
|
||||
self.writeln(_("No statistics available since zero URLs were checked."))
|
||||
|
||||
def end_output (self):
|
||||
"""
|
||||
|
|
|
|||
Loading…
Reference in a new issue