Fix W3C HTML validation.

2026-04-23 15:44:44 +00:00 · 2011-10-09 21:16:45 +02:00 · 2011-10-09 21:16:45 +02:00 · 27b7b1cb49
commit 27b7b1cb49
parent 09bac15bfb
2 changed files with 32 additions and 11 deletions
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@ -6,6 +6,8 @@ Fixes:
  Closes: SF bug #3388291
 - logging: Fix SQL output.
  Closes: SF bug #3415274
+- checking: Fix W3C HTML checking by using the new soap12 output.
+  Closes: SF bug #3413022

 Features:
 - checking: If a warning regex is configured, multiple matches in
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@ -71,6 +71,17 @@ def url_norm (url, encoding=None):
        raise LinkCheckerError(msg)


+def getXmlText (parent, tag):
+    """Return XML content of given tag in parent element."""
+    elem = parent.getElementsByTagName(tag)[0]
+    # Yes, the DOM standard is awful.
+    rc = []
+    for node in elem.childNodes:
+        if node.nodeType == node.TEXT_NODE:
+            rc.append(node.data)
+    return ''.join(rc)
+
+
 class UrlBase (object):
    """An URL with additional information like validity etc."""

@ -829,6 +840,21 @@ class UrlBase (object):
                _("warning: cssutils parsing caused error: %(msg)s") %
                {"msg": err})

+    def check_w3_errors (self, xml, w3type):
+        """Add warnings for W3C HTML or CSS errors in xml format.
+        w3type is either "W3C HTML" or "W3C CSS"."""
+        from xml.dom.minidom import parseString
+        dom = parseString(xml)
+        for error in dom.getElementsByTagName('m:error'):
+            warnmsg = _("%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s")
+            attrs = {
+                "w3type": w3type,
+                "line": getXmlText(error, "m:line"),
+                "column": getXmlText(error, "m:col"),
+                "msg": getXmlText(error, "m:message"),
+            }
+            self.add_warning(warnmsg % attrs)
+
    def check_html_w3 (self):
        """Check HTML syntax of this page (which is supposed to be HTML)
        with the online W3C HTML validator documented at
@ -839,17 +865,14 @@ class UrlBase (object):
            u = urllib2.urlopen('http://validator.w3.org/check',
                urllib.urlencode({
                    'fragment': self.get_content(),
-                    'output': 'xml',
+                    'output': 'soap12',
                }))
            if u.headers.get('x-w3c-validator-status', 'Invalid') == 'Valid':
                self.add_info(u"W3C Validator: %s" % _("valid HTML syntax"))
                return
-            from xml.dom.minidom import parseString
-            dom = parseString(u.read())
-            elements = dom.getElementsByTagName('messages')[0].getElementsByTagName('msg')
-            for msg in [e.firstChild.wholeText for e in elements]:
-                self.add_warning(u"W3C HTML validation: %s" % msg)
+            self.check_w3_errors(u.read(), "W3C HTML")
        except Exception:
+            raise
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library
            err = str(sys.exc_info()[1])
@ -883,11 +906,7 @@ class UrlBase (object):
            if r.getheader('X-W3C-Validator-Status', 'Invalid') == 'Valid':
                self.add_info(u"W3C Validator: %s" % _("valid CSS syntax"))
                return
-            from xml.dom.minidom import parseString
-            dom = parseString(r.read())
-            elements = dom.getElementsByTagName('m:errors')[0].getElementsByTagName('m:error')
-            for msg in [e.firstChild.wholeText for e in elements]:
-                self.add_warning(u"W3C HTML validation: %s" % msg)
+            self.check_w3_errors(r.read(), "W3C HTML")
        except Exception:
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library