Implement sitemap and sitemap index parsing.

This commit is contained in:
Bastian Kleineidam 2014-03-05 19:26:37 +01:00
parent b72cf252fb
commit ef13a3fce1
9 changed files with 121 additions and 9 deletions

View file

@ -219,7 +219,7 @@ test: localbuild
pyflakes:
pyflakes $(PY_FILES_DIRS) 2>&1 | \
grep -v "local variable 'dummy' is assigned to but never used" | \
grep -v -E "'(py2exe|py2app|PyQt4|biplist|setuptools|win32com|find_executable|parse_bookmark_data|parse_bookmark_file|wsgiref|pyftpdlib|linkchecker_rc)' imported but unused" | \
grep -v -E "'(py2exe|py2app|PyQt4|biplist|setuptools|win32com|find_executable|parse_sitemap|parse_sitemapindex|parse_bookmark_data|parse_bookmark_file|wsgiref|pyftpdlib|linkchecker_rc)' imported but unused" | \
grep -v "undefined name '_'" | \
grep -v "undefined name '_n'" | cat

View file

@ -21,7 +21,7 @@ Handle http links.
import requests
from cStringIO import StringIO
from .. import (log, LOG_CHECK, strformat,
from .. import (log, LOG_CHECK, strformat, fileutil,
url as urlutil, LinkCheckerError)
from . import (internpaturl, proxysupport, httpheaders as headers)
# import warnings
@ -227,6 +227,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
if not self.valid:
return False
ctype = self.get_content_type()
# some content types must be validated with the page content
if ctype in ("application/xml", "text/xml"):
data = self.get_content()
io = StringIO(data)
rtype = fileutil.guess_mimetype_read(io.read)
if rtype is not None:
# XXX side effect
ctype = self.content_type = rtype
if ctype not in self.ContentMimetypes:
log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
return False

View file

@ -87,6 +87,8 @@ class UrlBase (object):
"text/plain+chromium": "chromium",
"application/x-plist+safari": "safari",
"text/vnd.wap.wml": "wml",
"application/xml+sitemap": "sitemap",
"application/xml+sitemapindex": "sitemapindex",
}
# Read in 16kb chunks

View file

@ -218,6 +218,8 @@ PARSE_CONTENTS = {
"text/plain+opera": re.compile(r'^Opera Hotlist'),
"text/plain+chromium": re.compile(r'^{\s*"checksum":'),
"text/plain+linkchecker": re.compile(r'(?i)^# LinkChecker URL list'),
"application/xml+sitemapindex": re.compile(r'(?i)<\?xml[^<]+<sitemapindex\s+'),
"application/xml+sitemap": re.compile(r'(?i)<\?xml[^<]+<urlset\s+'),
}
def guess_mimetype (filename, read=None):

View file

@ -49,18 +49,21 @@ def parse_opera (url_data):
for url, name, lineno in parse_bookmark_data(url_data.get_content()):
url_data.add_url(url, line=lineno, name=name)
def parse_chromium (url_data):
    """Parse a Chromium or Google Chrome bookmark file.

    Every bookmark URL found is queued on url_data; bookmarks carry no
    line information, so only the name is passed along.
    """
    # Imported lazily so the bookmarks backend is only loaded when needed.
    from ..bookmarks.chromium import parse_bookmark_data
    for url, name in parse_bookmark_data(url_data.get_content()):
        url_data.add_url(url, name=name)
def parse_safari (url_data):
    """Parse a Safari bookmark file.

    Every bookmark URL found is queued on url_data; bookmarks carry no
    line information, so only the name is passed along.
    """
    # Imported lazily so the bookmarks backend is only loaded when needed.
    from ..bookmarks.safari import parse_bookmark_data
    for url, name in parse_bookmark_data(url_data.get_content()):
        url_data.add_url(url, name=name)
def parse_text (url_data):
"""Parse a text file with one url per line; comment and blank
lines are ignored."""
@ -87,6 +90,7 @@ def parse_css (url_data):
url = strformat.unquote(mo.group("url").strip())
url_data.add_url(url, line=lineno, column=column)
def parse_swf (url_data):
"""Parse a SWF file for URLs."""
linkfinder = linkparse.swf_url_re.finditer
@ -94,6 +98,7 @@ def parse_swf (url_data):
url = mo.group()
url_data.add_url(url)
def parse_word (url_data):
"""Parse a word file for hyperlinks."""
if not winutil.has_word():
@ -116,6 +121,7 @@ def parse_word (url_data):
except winutil.Error as msg:
log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
def parse_wml (url_data):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
@ -161,7 +167,7 @@ def parse_firefox (url_data):
"""Parse a Firefox3 bookmark file."""
filename = url_data.get_os_filename()
for url, name in firefox.parse_bookmark_file(filename):
# XXX use add_url
url_data = get_url_from(url, url_data.recursion_level+1,
url_data.aggregate, parent_url=url_data.url, name=name)
url_data.aggregate.urlqueue.put(url_data)
url_data.add_url(url, name=name)
from .sitemap import parse_sitemap, parse_sitemapindex

View file

@ -0,0 +1,77 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Main functions for link parsing
"""
from xml.parsers.expat import ParserCreate
class XmlTagUrlParser(object):
    """Parse XML files and find URLs in the text content of a tag name.

    An expat parser collects the character data of every element whose
    name equals the configured tag, and reports each non-empty URL via
    url_data.add_url() when the element closes.
    """

    def __init__(self, tag):
        """Initialize the parser.

        @param tag: element name whose text content holds URLs (eg. u"loc")
        """
        self.tag = tag
        self.parser = ParserCreate()
        # Merge adjacent character data into one callback per text run.
        self.parser.buffer_text = True
        try:
            # Python 2 only: make expat deliver unicode to the handlers.
            self.parser.returns_unicode = True
        except AttributeError:
            # Python 3 expat always returns unicode; the attribute is gone.
            pass
        self.parser.StartElementHandler = self.start_element
        self.parser.EndElementHandler = self.end_element
        self.parser.CharacterDataHandler = self.char_data

    def parse(self, url_data):
        """Parse XML URL data and add all found URLs to url_data."""
        self.url_data = url_data
        # Initialize tag state before parsing so char_data() is safe even
        # if it fires before the first start element.
        self.in_tag = False
        self.url = u""
        data = url_data.get_content()
        isfinal = True
        self.parser.Parse(data, isfinal)

    def start_element(self, name, attrs):
        """Set tag status for start element and reset the URL buffer."""
        self.in_tag = (name == self.tag)
        self.url = u""

    def end_element(self, name):
        """If end tag is our tag, call add_url()."""
        self.in_tag = False
        if name == self.tag:
            self.add_url()

    def add_url(self):
        """Add non-empty URLs to the queue."""
        if self.url:
            self.url_data.add_url(self.url, line=self.parser.CurrentLineNumber,
                column=self.parser.CurrentColumnNumber)
        self.url = u""

    def char_data(self, data):
        """If inside the wanted tag, append data to URL.

        Bug fix: the original tested self.loc, which parse() set to False
        and nothing ever set to True, so no URL text was ever collected
        and no URLs were reported. The flag actually maintained by the
        element handlers is self.in_tag.
        """
        if self.in_tag:
            self.url += data
def parse_sitemap(url_data):
    """Parse XML sitemap data, queueing every <loc> URL found."""
    parser = XmlTagUrlParser(u"loc")
    parser.parse(url_data)
def parse_sitemapindex(url_data):
    """Parse XML sitemap index data, queueing every <loc> URL found."""
    parser = XmlTagUrlParser(u"loc")
    parser.parse(url_data)

View file

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
</urlset>

View file

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>http://example.com/foo.xml</loc></sitemap>
</sitemapindex>

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -39,7 +39,9 @@ class TestFileutil (unittest.TestCase):
self.assertEqual(linkcheck.fileutil.get_mtime(file_non_existing), 0)
def mime_test (self, filename, mime_expected):
mime = linkcheck.fileutil.guess_mimetype(get_file(filename))
absfilename = get_file(filename)
with open(absfilename) as fd:
mime = linkcheck.fileutil.guess_mimetype(absfilename, read=fd.read)
self.assertEqual(mime, mime_expected)
def test_mime (self):
@ -47,4 +49,6 @@ class TestFileutil (unittest.TestCase):
self.mime_test(filename, "application/x-plist+safari")
filename = os.path.join("plist_xml", "Bookmarks.plist")
self.mime_test(filename, "application/x-plist+safari")
self.mime_test("test.wml", "text/vnd.wap.wml")
self.mime_test("file.wml", "text/vnd.wap.wml")
self.mime_test("sitemap.xml", "application/xml+sitemap")
self.mime_test("sitemapindex.xml", "application/xml+sitemapindex")