Log all valid URLs in sitemap loggers.

This commit is contained in:
Bastian Kleineidam 2012-09-19 09:17:08 +02:00
parent 681cd90405
commit 71fba0f8b7
3 changed files with 22 additions and 25 deletions

View file

@@ -352,7 +352,6 @@ class Configuration (dict):
if self['loginurl']:
self.sanitize_loginurl()
self.sanitize_proxies()
self.sanitize_filter()
def sanitize_anchors (self):
"""Make anchor configuration consistent."""
@@ -370,15 +369,6 @@ class Configuration (dict):
self['output'] = 'text'
self['logger'] = self.logger_new(self['output'])
def sanitize_filter(self):
    """Set filter flags for graph loggers.

    If any configured logger (console or file output) is a graph
    logger, force the 'complete' option on.
    """
    from ..logger import gml, dot, gxml
    graph_classes = (gml.GMLLogger, dot.DOTLogger, gxml.GraphXMLLogger)
    all_loggers = [self['logger']] + self['fileoutput']
    # Exact class match, mirroring the original membership test.
    if any(entry.__class__ in graph_classes for entry in all_loggers):
        self['complete'] = True
def sanitize_scanvirus (self):
"""Ensure clamav is installed for virus checking."""
try:

View file

@@ -32,6 +32,15 @@ class GraphLogger (Logger):
self.nodes = {}
self.nodeid = 0
def log_filter_url(self, url_data, do_print):
    """Update accounting data and log every valid URL, ignoring the
    do_print flag.
    """
    self.stats.log_url(url_data, do_print)
    # Decide inclusion ourselves rather than honoring do_print:
    # every valid URL is logged.
    if not url_data.valid:
        return
    self.log_url(url_data)
def get_node (self, url_data):
"""Return new node data or None if node already exists."""
if not url_data.url:

View file

@@ -62,31 +62,29 @@ class SitemapXmlLogger (xmllog.XMLLogger):
self.xml_starttag(u'urlset', attrs)
self.flush()
def filter_url(self, url_data):
    """Return True if url_data must be left out of the sitemap.

    Only valid HTML/XHTML pages whose URL starts with the first
    logged URL (self.prefix) are kept.
    """
    wanted = (
        url_data.valid and
        url_data.url.startswith((u'http:', u'https:')) and
        url_data.url.startswith(self.prefix) and
        url_data.content_type in ('text/html', "application/xhtml+xml")
    )
    return not wanted
def log_filter_url(self, url_data, do_print):
    """Update accounting data and decide whether to include the URL
    in the sitemap.

    Only valid http/https HTML or XHTML pages sharing the sitemap
    prefix (the first logged URL) are written; the do_print flag is
    deliberately ignored.
    """
    self.stats.log_url(url_data, do_print)
    # Ignore the do_print flag and determine ourselves if we filter
    # the url.
    # BUGFIX: self.prefix is None until log_url() records the first
    # URL; calling url.startswith(None) would raise TypeError.  Treat
    # a None prefix as a match, since the first URL defines the prefix.
    if (url_data.valid and
        url_data.url.startswith((u'http:', u'https:')) and
        (self.prefix is None or url_data.url.startswith(self.prefix)) and
        url_data.content_type in ('text/html', "application/xhtml+xml")):
        self.log_url(url_data)
def log_url (self, url_data):
"""
Log URL data in sitemap format.
"""
if self.prefix is None:
# first URL (ie. the homepage) gets priority 1.0 per default
self.prefix = url_data.url
priority = 1.0
else:
# all other pages get priority 0.5 per default
priority = 0.5
if self.filter_url(url_data):
return
if self.priority is not None:
priority = self.priority
self.xml_starttag(u'url')