diff --git a/Makefile b/Makefile index fc715b4f..4e39c49c 100644 --- a/Makefile +++ b/Makefile @@ -219,7 +219,7 @@ test: localbuild pyflakes: pyflakes $(PY_FILES_DIRS) 2>&1 | \ grep -v "local variable 'dummy' is assigned to but never used" | \ - grep -v -E "'(py2exe|py2app|PyQt4|biplist|setuptools|win32com|find_executable|parse_bookmark_data|parse_bookmark_file|wsgiref|pyftpdlib|linkchecker_rc)' imported but unused" | \ + grep -v -E "'(py2exe|py2app|PyQt4|biplist|setuptools|win32com|find_executable|parse_sitemap|parse_sitemapindex|parse_bookmark_data|parse_bookmark_file|wsgiref|pyftpdlib|linkchecker_rc)' imported but unused" | \ grep -v "undefined name '_'" | \ grep -v "undefined name '_n'" | cat diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 49fcb673..a38a9640 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -21,7 +21,7 @@ Handle http links. import requests from cStringIO import StringIO -from .. import (log, LOG_CHECK, strformat, +from .. import (log, LOG_CHECK, strformat, fileutil, url as urlutil, LinkCheckerError) from . import (internpaturl, proxysupport, httpheaders as headers) # import warnings @@ -227,6 +227,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): if not self.valid: return False ctype = self.get_content_type() + # some content types must be validated with the page content + if ctype in ("application/xml", "text/xml"): + data = self.get_content() + io = StringIO(data) + rtype = fileutil.guess_mimetype_read(io.read) + if rtype is not None: + # XXX side effect + ctype = self.content_type = rtype if ctype not in self.ContentMimetypes: log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype) return False diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index e35385ec..fe2a7399 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -87,6 +87,8 @@ class UrlBase (object): "text/plain+chromium": "chromium", "application/x-plist+safari": "safari", "text/vnd.wap.wml": "wml", + "application/xml+sitemap": "sitemap", + "application/xml+sitemapindex": "sitemapindex", } # Read in 16kb chunks diff --git a/linkcheck/fileutil.py b/linkcheck/fileutil.py index 35d15508..a8acfe67 100644 --- a/linkcheck/fileutil.py +++ b/linkcheck/fileutil.py @@ -218,6 +218,8 @@ PARSE_CONTENTS = { "text/plain+opera": re.compile(r'^Opera Hotlist'), "text/plain+chromium": re.compile(r'^{\s*"checksum":'), "text/plain+linkchecker": re.compile(r'(?i)^# LinkChecker URL list'), + "application/xml+sitemapindex": re.compile(r'(?i)<\?xml[^<]+ + + + http://www.example.com/ + 2005-01-01 + monthly + 0.8 + + diff --git a/tests/checker/data/sitemapindex.xml b/tests/checker/data/sitemapindex.xml new file mode 100644 index 00000000..98fa54e9 --- /dev/null +++ b/tests/checker/data/sitemapindex.xml @@ -0,0 +1,4 @@ + + +http://example.com/foo.xml + diff --git a/tests/test_fileutil.py b/tests/test_fileutil.py index e1516246..446bc528 100644 --- a/tests/test_fileutil.py +++ b/tests/test_fileutil.py @@ -1,5 +1,5 @@ # -*- coding: iso-8859-1 -*- -# Copyright (C) 2010-2012 Bastian Kleineidam +# Copyright (C) 2010-2014 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -39,7 +39,9 @@ class TestFileutil (unittest.TestCase): self.assertEqual(linkcheck.fileutil.get_mtime(file_non_existing), 0) def mime_test (self, filename, mime_expected): - mime = linkcheck.fileutil.guess_mimetype(get_file(filename)) + absfilename = get_file(filename) + with open(absfilename) as fd: + mime = linkcheck.fileutil.guess_mimetype(absfilename, read=fd.read) self.assertEqual(mime, mime_expected) def test_mime (self): @@ -47,4 +49,6 @@ class TestFileutil (unittest.TestCase): self.mime_test(filename, "application/x-plist+safari") filename = os.path.join("plist_xml", "Bookmarks.plist") self.mime_test(filename, "application/x-plist+safari") - self.mime_test("test.wml", "text/vnd.wap.wml") + self.mime_test("file.wml", "text/vnd.wap.wml") + self.mime_test("sitemap.xml", "application/xml+sitemap") + self.mime_test("sitemapindex.xml", "application/xml+sitemapindex")