Detect sitemaps that do not start with an XML declaration

This commit is contained in:
Chris Mayo 2020-08-11 19:35:56 +01:00
parent a977e4d712
commit 8c804c35a5
4 changed files with 22 additions and 2 deletions

View file

@ -56,8 +56,9 @@ PARSE_CONTENTS = {
"text/plain+opera": re.compile(r'^Opera Hotlist'),
"text/plain+chromium": re.compile(r'^{\s*"checksum":'),
"text/plain+linkchecker": re.compile(r'^# LinkChecker URL list', re.IGNORECASE),
"application/xml+sitemapindex": re.compile(r'(?i)<\?xml[^<]+<sitemapindex\s+'),
"application/xml+sitemap": re.compile(r'<\?xml[^<]+<urlset\s+', re.IGNORECASE),
"application/xml+sitemapindex": re.compile(r'(<\?xml[^<]+)?<sitemapindex\s+',
re.IGNORECASE),
"application/xml+sitemap": re.compile(r'(<\?xml[^<]+)?<urlset\s+', re.IGNORECASE),
}

View file

@ -0,0 +1,14 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://www.example.com/?ascii=nø</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
</urlset>

View file

@ -0,0 +1,3 @@
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>sitemap.xml</loc></sitemap>
</sitemapindex>

View file

@ -40,3 +40,5 @@ class TestMiMeutil(unittest.TestCase):
self.mime_test("file.wml", "text/vnd.wap.wml")
self.mime_test("sitemap.xml", "application/xml+sitemap")
self.mime_test("sitemapindex.xml", "application/xml+sitemapindex")
self.mime_test("no_decl_sitemap.xml", "application/xml+sitemap")
self.mime_test("no_decl_sitemapindex.xml", "application/xml+sitemapindex")