From ec8b6e09f09b46f80a3284bc4a7d5bb708fc8dac Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 28 Oct 2019 19:19:25 +0000 Subject: [PATCH] Fix XmlTagUrlParser and make Python 3 compatible URLs within a sitemap file were not being captured. --- linkcheck/parser/sitemap.py | 11 +++++++---- tests/checker/data/sitemap.xml | 6 ++++++ tests/checker/data/sitemap.xml.result | 8 ++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/linkcheck/parser/sitemap.py b/linkcheck/parser/sitemap.py index df99ed07..3e7539b0 100644 --- a/linkcheck/parser/sitemap.py +++ b/linkcheck/parser/sitemap.py @@ -29,7 +29,10 @@ class XmlTagUrlParser(object): self.tag = tag self.parser = ParserCreate() self.parser.buffer_text = True - self.parser.returns_unicode = True + try: + self.parser.returns_unicode = True + except AttributeError: + pass # Python 3 self.parser.StartElementHandler = self.start_element self.parser.EndElementHandler = self.end_element self.parser.CharacterDataHandler = self.char_data @@ -37,7 +40,7 @@ class XmlTagUrlParser(object): def parse(self, url_data): """Parse XML URL data.""" self.url_data = url_data - self.loc = False + self.in_tag = False self.url = u"" data = url_data.get_raw_content() isfinal = True @@ -45,6 +48,7 @@ class XmlTagUrlParser(object): self.parser.Parse(data, isfinal) except ExpatError as expaterr: self.url_data.add_warning(expaterr.message,tag=WARN_XML_PARSE_ERROR) + def start_element(self, name, attrs): """Set tag status for start element.""" self.in_tag = (name == self.tag) @@ -65,7 +69,7 @@ class XmlTagUrlParser(object): def char_data(self, data): """If inside the wanted tag, append data to URL.""" - if self.loc: + if self.in_tag: self.url += data @@ -77,4 +81,3 @@ def parse_sitemap(url_data): def parse_sitemapindex(url_data): """Parse XML sitemap index data.""" XmlTagUrlParser(u"loc").parse(url_data) - diff --git a/tests/checker/data/sitemap.xml b/tests/checker/data/sitemap.xml index 30e23048..b0ae9a20 100644 --- a/tests/checker/data/sitemap.xml +++ b/tests/checker/data/sitemap.xml @@ -6,4 +6,10 @@ monthly 0.8 + + http://www.example.com/?ascii=nø + 2005-01-01 + monthly + 0.8 + diff --git a/tests/checker/data/sitemap.xml.result b/tests/checker/data/sitemap.xml.result index a4f2372d..b24a5379 100644 --- a/tests/checker/data/sitemap.xml.result +++ b/tests/checker/data/sitemap.xml.result @@ -2,3 +2,11 @@ url http://localhost:%(port)d/%(datadir)s/sitemap.xml cache key http://localhost:%(port)d/%(datadir)s/sitemap.xml real url http://localhost:%(port)d/%(datadir)s/sitemap.xml valid +url http://www.example.com/ +cache key http://www.example.com/ +real url http://www.example.com/ +valid +url http://www.example.com/?ascii=nø +cache key http://www.example.com/?ascii=n%%C3%%B8 +real url http://www.example.com/?ascii=n%%C3%%B8 +valid