From ad8eb424f38e06da00228c89e6de1b61d8f6933b Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Fri, 13 Jun 2014 20:50:37 +0200 Subject: [PATCH] Merge Mark-Hetherington-xml-parse-warn with slight modifications. --- linkcheck/checker/const.py | 2 ++ linkcheck/director/checker.py | 1 + linkcheck/parser/sitemap.py | 9 ++++++--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index 53db7351..14c21d6e 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -100,6 +100,7 @@ WARN_IGNORE_URL = "ignore-url" WARN_MAIL_NO_MX_HOST = "mail-no-mx-host" WARN_NNTP_NO_SERVER = "nntp-no-server" WARN_NNTP_NO_NEWSGROUP = "nntp-no-newsgroup" +WARN_XML_PARSE_ERROR = "xml-parse-error" # registered warnings Warnings = { @@ -123,4 +124,5 @@ Warnings = { WARN_NNTP_NO_SERVER: _("No NNTP server was found."), WARN_NNTP_NO_NEWSGROUP: _("The NNTP newsgroup could not be found."), WARN_URL_OBFUSCATED_IP: _("The IP is obfuscated."), + WARN_XML_PARSE_ERROR: _("XML could not be parsed."), } diff --git a/linkcheck/director/checker.py b/linkcheck/director/checker.py index 359ccfa9..1f694103 100644 --- a/linkcheck/director/checker.py +++ b/linkcheck/director/checker.py @@ -59,6 +59,7 @@ def check_url(url_data, logger): # redirect aliases cache.add_result(alias, result) # parse content recursively + # XXX this could add new warnings which should be cached. if do_parse: parser.parse_url(url_data) finally: diff --git a/linkcheck/parser/sitemap.py b/linkcheck/parser/sitemap.py index 237a8b7a..d649f647 100644 --- a/linkcheck/parser/sitemap.py +++ b/linkcheck/parser/sitemap.py @@ -18,7 +18,8 @@ Main functions for link parsing """ from xml.parsers.expat import ParserCreate - +from xml.parsers.expat import ExpatError +from ..checker.const import (WARN_XML_PARSE_ERROR) class XmlTagUrlParser(object): """Parse XML files and find URLs in text content of a tag name.""" @@ -40,8 +41,10 @@ class XmlTagUrlParser(object): self.url = u"" data = url_data.get_content() isfinal = True - self.parser.Parse(data, isfinal) - + try: + self.parser.Parse(data, isfinal) + except ExpatError as expaterr: + self.url_data.add_warning(expaterr.message,tag=WARN_XML_PARSE_ERROR) def start_element(self, name, attrs): """Set tag status for start element.""" self.in_tag = (name == self.tag)