Log all valid URLs in sitemap loggers.

This commit is contained in:
Bastian Kleineidam 2012-09-19 09:17:08 +02:00
parent 681cd90405
commit 71fba0f8b7
3 changed files with 22 additions and 25 deletions

View file

@@ -352,7 +352,6 @@ class Configuration (dict):
if self['loginurl']:
self.sanitize_loginurl()
self.sanitize_proxies()
self.sanitize_filter()
def sanitize_anchors (self):
"""Make anchor configuration consistent."""
@@ -370,15 +369,6 @@ class Configuration (dict):
self['output'] = 'text'
self['logger'] = self.logger_new(self['output'])
def sanitize_filter(self):
    """Set filter flags for graph loggers.

    If any configured logger (console or file output) is a graph
    logger, force the 'complete' option on.
    """
    from ..logger import gml, dot, gxml
    graph_classes = (gml.GMLLogger, dot.DOTLogger, gxml.GraphXMLLogger)
    all_loggers = [self['logger']] + self['fileoutput']
    # Exact class match, mirroring the original membership test.
    if any(entry.__class__ in graph_classes for entry in all_loggers):
        self['complete'] = True
def sanitize_scanvirus (self):
"""Ensure clamav is installed for virus checking."""
try:

View file

@@ -32,6 +32,15 @@ class GraphLogger (Logger):
self.nodes = {}
self.nodeid = 0
def log_filter_url(self, url_data, do_print):
    """Update accounting data and log every valid URL, ignoring the
    do_print flag.
    """
    self.stats.log_url(url_data, do_print)
    # Decide inclusion ourselves rather than honoring do_print:
    # every valid URL is logged.
    if not url_data.valid:
        return
    self.log_url(url_data)
def get_node (self, url_data):
"""Return new node data or None if node already exists."""
if not url_data.url:

View file

@@ -62,31 +62,29 @@ class SitemapXmlLogger (xmllog.XMLLogger):
self.xml_starttag(u'urlset', attrs)
self.flush()
def filter_url(self, url_data):
    """Return True if url_data must be left out of the sitemap.

    Only valid HTML/XHTML pages whose URL starts with the first
    logged URL (self.prefix) are kept.
    """
    wanted = (
        url_data.valid and
        url_data.url.startswith((u'http:', u'https:')) and
        url_data.url.startswith(self.prefix) and
        url_data.content_type in ('text/html', "application/xhtml+xml")
    )
    return not wanted
def log_filter_url(self, url_data, do_print):
    """Update accounting data and decide whether to include the URL
    in the sitemap.

    Only valid http/https HTML or XHTML pages sharing the sitemap
    prefix (the first logged URL) are written; the do_print flag is
    deliberately ignored.
    """
    self.stats.log_url(url_data, do_print)
    # Ignore the do_print flag and determine ourselves if we filter
    # the url.
    # BUGFIX: self.prefix is None until log_url() records the first
    # URL; calling url.startswith(None) would raise TypeError.  Treat
    # a None prefix as a match, since the first URL defines the prefix.
    if (url_data.valid and
        url_data.url.startswith((u'http:', u'https:')) and
        (self.prefix is None or url_data.url.startswith(self.prefix)) and
        url_data.content_type in ('text/html', "application/xhtml+xml")):
        self.log_url(url_data)
def log_url (self, url_data):
"""
Log URL data in sitemap format.
"""
if self.prefix is None:
# first URL (ie. the homepage) gets priority 1.0 per default
self.prefix = url_data.url
priority = 1.0
else:
# all other pages get priority 0.5 per default
priority = 0.5
if self.filter_url(url_data):
return
if self.priority is not None:
priority = self.priority
self.xml_starttag(u'url')