Mirror of https://github.com/Hopiu/linkchecker.git (synced 2026-04-21 22:54:45 +00:00).
Commit message: Log all valid URLs in sitemap loggers.
This commit is contained in:
parent
681cd90405
commit
71fba0f8b7
3 changed files with 22 additions and 25 deletions
|
|
@ -352,7 +352,6 @@ class Configuration (dict):
|
|||
if self['loginurl']:
|
||||
self.sanitize_loginurl()
|
||||
self.sanitize_proxies()
|
||||
self.sanitize_filter()
|
||||
|
||||
def sanitize_anchors (self):
|
||||
"""Make anchor configuration consistent."""
|
||||
|
|
@ -370,15 +369,6 @@ class Configuration (dict):
|
|||
self['output'] = 'text'
|
||||
self['logger'] = self.logger_new(self['output'])
|
||||
|
||||
def sanitize_filter(self):
    """Set filter flags for graph loggers.

    If any configured output logger is one of the graph logger
    classes, turn on the 'complete' flag so the generated graphs
    contain every URL.
    """
    from ..logger import gml, dot, gxml
    graph_classes = (gml.GMLLogger, dot.DOTLogger, gxml.GraphXMLLogger)
    # Check the primary logger plus every file output logger.
    outputs = [self['logger']] + self['fileoutput']
    # Exact-class membership test (not isinstance), as before.
    if any(out.__class__ in graph_classes for out in outputs):
        self['complete'] = True
|
||||
|
||||
def sanitize_scanvirus (self):
|
||||
"""Ensure clamav is installed for virus checking."""
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -32,6 +32,15 @@ class GraphLogger (Logger):
|
|||
self.nodes = {}
|
||||
self.nodeid = 0
|
||||
|
||||
def log_filter_url(self, url_data, do_print):
    """Record accounting statistics and write every valid URL.

    The do_print flag only feeds the statistics; whether the URL is
    actually logged depends solely on its validity.
    """
    # Statistics still honor the caller-supplied do_print flag.
    self.stats.log_url(url_data, do_print)
    if not url_data.valid:
        return
    # Valid URLs are always written, even when do_print is False.
    self.log_url(url_data)
|
||||
|
||||
def get_node (self, url_data):
|
||||
"""Return new node data or None if node already exists."""
|
||||
if not url_data.url:
|
||||
|
|
|
|||
|
|
@ -62,31 +62,29 @@ class SitemapXmlLogger (xmllog.XMLLogger):
|
|||
self.xml_starttag(u'urlset', attrs)
|
||||
self.flush()
|
||||
|
||||
def filter_url(self, url_data):
    """Return True when url_data must be excluded from the sitemap.

    Only valid HTML pages whose URL starts with the first logged URL
    (self.prefix) are kept.
    """
    # A URL is kept only when all inclusion criteria hold; short-circuit
    # evaluation preserves the original check order.
    keep = (url_data.valid
            and url_data.url.startswith((u'http:', u'https:'))
            and url_data.url.startswith(self.prefix)
            and url_data.content_type in ('text/html', "application/xhtml+xml"))
    return not keep
|
||||
def log_filter_url(self, url_data, do_print):
    """Update accounting data and log the URL if it belongs in the sitemap.

    The do_print flag is only forwarded to the statistics; inclusion in
    the sitemap is decided by filter_url() instead.
    """
    self.stats.log_url(url_data, do_print)
    # The original inline condition duplicated filter_url() verbatim;
    # delegate to it so the inclusion criteria live in one place.
    if not self.filter_url(url_data):
        self.log_url(url_data)
|
||||
|
||||
def log_url (self, url_data):
|
||||
"""
|
||||
Log URL data in sitemap format.
|
||||
"""
|
||||
if self.prefix is None:
|
||||
# first URL (ie. the homepage) gets priority 1.0 per default
|
||||
self.prefix = url_data.url
|
||||
priority = 1.0
|
||||
else:
|
||||
# all other pages get priority 0.5 per default
|
||||
priority = 0.5
|
||||
if self.filter_url(url_data):
|
||||
return
|
||||
if self.priority is not None:
|
||||
priority = self.priority
|
||||
self.xml_starttag(u'url')
|
||||
|
|
|
|||
Loading…
Reference in a new issue