Implement sitemap and sitemap index parsing.

This commit is contained in:
Bastian Kleineidam 2014-03-05 19:26:37 +01:00
parent b72cf252fb
commit ef13a3fce1
9 changed files with 121 additions and 9 deletions

View file

@ -219,7 +219,7 @@ test: localbuild
pyflakes:
pyflakes $(PY_FILES_DIRS) 2>&1 | \
grep -v "local variable 'dummy' is assigned to but never used" | \
grep -v -E "'(py2exe|py2app|PyQt4|biplist|setuptools|win32com|find_executable|parse_bookmark_data|parse_bookmark_file|wsgiref|pyftpdlib|linkchecker_rc)' imported but unused" | \
grep -v -E "'(py2exe|py2app|PyQt4|biplist|setuptools|win32com|find_executable|parse_sitemap|parse_sitemapindex|parse_bookmark_data|parse_bookmark_file|wsgiref|pyftpdlib|linkchecker_rc)' imported but unused" | \
grep -v "undefined name '_'" | \
grep -v "undefined name '_n'" | cat

View file

@ -21,7 +21,7 @@ Handle http links.
import requests
from cStringIO import StringIO
from .. import (log, LOG_CHECK, strformat,
from .. import (log, LOG_CHECK, strformat, fileutil,
url as urlutil, LinkCheckerError)
from . import (internpaturl, proxysupport, httpheaders as headers)
# import warnings
@ -227,6 +227,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
if not self.valid:
return False
ctype = self.get_content_type()
# some content types must be validated with the page content
if ctype in ("application/xml", "text/xml"):
data = self.get_content()
io = StringIO(data)
rtype = fileutil.guess_mimetype_read(io.read)
if rtype is not None:
# XXX side effect
ctype = self.content_type = rtype
if ctype not in self.ContentMimetypes:
log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
return False

View file

@ -87,6 +87,8 @@ class UrlBase (object):
"text/plain+chromium": "chromium",
"application/x-plist+safari": "safari",
"text/vnd.wap.wml": "wml",
"application/xml+sitemap": "sitemap",
"application/xml+sitemapindex": "sitemapindex",
}
# Read in 16kb chunks

View file

@ -218,6 +218,8 @@ PARSE_CONTENTS = {
"text/plain+opera": re.compile(r'^Opera Hotlist'),
"text/plain+chromium": re.compile(r'^{\s*"checksum":'),
"text/plain+linkchecker": re.compile(r'(?i)^# LinkChecker URL list'),
"application/xml+sitemapindex": re.compile(r'(?i)<\?xml[^<]+<sitemapindex\s+'),
"application/xml+sitemap": re.compile(r'(?i)<\?xml[^<]+<urlset\s+'),
}
def guess_mimetype (filename, read=None):

View file

@ -49,18 +49,21 @@ def parse_opera (url_data):
for url, name, lineno in parse_bookmark_data(url_data.get_content()):
url_data.add_url(url, line=lineno, name=name)
def parse_chromium (url_data):
    """Parse a Chromium or Google Chrome bookmark file.

    Every bookmark URL found is queued on url_data; bookmarks carry no
    line information, so only the name is passed along.
    """
    # Imported lazily so the bookmarks backend is only loaded when needed.
    from ..bookmarks.chromium import parse_bookmark_data
    for url, name in parse_bookmark_data(url_data.get_content()):
        url_data.add_url(url, name=name)
def parse_safari (url_data):
    """Parse a Safari bookmark file.

    Every bookmark URL found is queued on url_data; bookmarks carry no
    line information, so only the name is passed along.
    """
    # Imported lazily so the bookmarks backend is only loaded when needed.
    from ..bookmarks.safari import parse_bookmark_data
    for url, name in parse_bookmark_data(url_data.get_content()):
        url_data.add_url(url, name=name)
def parse_text (url_data):
"""Parse a text file with one url per line; comment and blank
lines are ignored."""
@ -87,6 +90,7 @@ def parse_css (url_data):
url = strformat.unquote(mo.group("url").strip())
url_data.add_url(url, line=lineno, column=column)
def parse_swf (url_data):
"""Parse a SWF file for URLs."""
linkfinder = linkparse.swf_url_re.finditer
@ -94,6 +98,7 @@ def parse_swf (url_data):
url = mo.group()
url_data.add_url(url)
def parse_word (url_data):
"""Parse a word file for hyperlinks."""
if not winutil.has_word():
@ -116,6 +121,7 @@ def parse_word (url_data):
except winutil.Error as msg:
log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
def parse_wml (url_data):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
@ -161,7 +167,7 @@ def parse_firefox (url_data):
"""Parse a Firefox3 bookmark file."""
filename = url_data.get_os_filename()
for url, name in firefox.parse_bookmark_file(filename):
# XXX use add_url
url_data = get_url_from(url, url_data.recursion_level+1,
url_data.aggregate, parent_url=url_data.url, name=name)
url_data.aggregate.urlqueue.put(url_data)
url_data.add_url(url, name=name)
from .sitemap import parse_sitemap, parse_sitemapindex

View file

@ -0,0 +1,77 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Main functions for link parsing
"""
from xml.parsers.expat import ParserCreate
class XmlTagUrlParser(object):
    """Parse XML files and find URLs in the text content of a tag name.

    An expat parser collects the character data of every element whose
    name equals the configured tag, and reports each non-empty URL via
    url_data.add_url() when the element closes.
    """

    def __init__(self, tag):
        """Initialize the parser.

        @param tag: element name whose text content holds URLs (eg. u"loc")
        """
        self.tag = tag
        self.parser = ParserCreate()
        # Merge adjacent character data into one callback per text run.
        self.parser.buffer_text = True
        try:
            # Python 2 only: make expat deliver unicode to the handlers.
            self.parser.returns_unicode = True
        except AttributeError:
            # Python 3 expat always returns unicode; the attribute is gone.
            pass
        self.parser.StartElementHandler = self.start_element
        self.parser.EndElementHandler = self.end_element
        self.parser.CharacterDataHandler = self.char_data

    def parse(self, url_data):
        """Parse XML URL data and add all found URLs to url_data."""
        self.url_data = url_data
        # Initialize tag state before parsing so char_data() is safe even
        # if it fires before the first start element.
        self.in_tag = False
        self.url = u""
        data = url_data.get_content()
        isfinal = True
        self.parser.Parse(data, isfinal)

    def start_element(self, name, attrs):
        """Set tag status for start element and reset the URL buffer."""
        self.in_tag = (name == self.tag)
        self.url = u""

    def end_element(self, name):
        """If end tag is our tag, call add_url()."""
        self.in_tag = False
        if name == self.tag:
            self.add_url()

    def add_url(self):
        """Add non-empty URLs to the queue."""
        if self.url:
            self.url_data.add_url(self.url, line=self.parser.CurrentLineNumber,
                column=self.parser.CurrentColumnNumber)
        self.url = u""

    def char_data(self, data):
        """If inside the wanted tag, append data to URL.

        Bug fix: the original tested self.loc, which parse() set to False
        and nothing ever set to True, so no URL text was ever collected
        and no URLs were reported. The flag actually maintained by the
        element handlers is self.in_tag.
        """
        if self.in_tag:
            self.url += data
def parse_sitemap(url_data):
    """Parse XML sitemap data, queueing every <loc> URL found."""
    parser = XmlTagUrlParser(u"loc")
    parser.parse(url_data)
def parse_sitemapindex(url_data):
    """Parse XML sitemap index data, queueing every <loc> URL found."""
    parser = XmlTagUrlParser(u"loc")
    parser.parse(url_data)

View file

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
</urlset>

View file

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>http://example.com/foo.xml</loc></sitemap>
</sitemapindex>

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -39,7 +39,9 @@ class TestFileutil (unittest.TestCase):
self.assertEqual(linkcheck.fileutil.get_mtime(file_non_existing), 0)
def mime_test (self, filename, mime_expected):
mime = linkcheck.fileutil.guess_mimetype(get_file(filename))
absfilename = get_file(filename)
with open(absfilename) as fd:
mime = linkcheck.fileutil.guess_mimetype(absfilename, read=fd.read)
self.assertEqual(mime, mime_expected)
def test_mime (self):
@ -47,4 +49,6 @@ class TestFileutil (unittest.TestCase):
self.mime_test(filename, "application/x-plist+safari")
filename = os.path.join("plist_xml", "Bookmarks.plist")
self.mime_test(filename, "application/x-plist+safari")
self.mime_test("test.wml", "text/vnd.wap.wml")
self.mime_test("file.wml", "text/vnd.wap.wml")
self.mime_test("sitemap.xml", "application/xml+sitemap")
self.mime_test("sitemapindex.xml", "application/xml+sitemapindex")