mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Implement sitemap and sitemap index parsing.
This commit is contained in:
parent
b72cf252fb
commit
ef13a3fce1
9 changed files with 121 additions and 9 deletions
2
Makefile
2
Makefile
|
|
@ -219,7 +219,7 @@ test: localbuild
|
|||
pyflakes:
|
||||
pyflakes $(PY_FILES_DIRS) 2>&1 | \
|
||||
grep -v "local variable 'dummy' is assigned to but never used" | \
|
||||
grep -v -E "'(py2exe|py2app|PyQt4|biplist|setuptools|win32com|find_executable|parse_bookmark_data|parse_bookmark_file|wsgiref|pyftpdlib|linkchecker_rc)' imported but unused" | \
|
||||
grep -v -E "'(py2exe|py2app|PyQt4|biplist|setuptools|win32com|find_executable|parse_sitemap|parse_sitemapindex|parse_bookmark_data|parse_bookmark_file|wsgiref|pyftpdlib|linkchecker_rc)' imported but unused" | \
|
||||
grep -v "undefined name '_'" | \
|
||||
grep -v "undefined name '_n'" | cat
|
||||
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ Handle http links.
|
|||
import requests
|
||||
from cStringIO import StringIO
|
||||
|
||||
from .. import (log, LOG_CHECK, strformat,
|
||||
from .. import (log, LOG_CHECK, strformat, fileutil,
|
||||
url as urlutil, LinkCheckerError)
|
||||
from . import (internpaturl, proxysupport, httpheaders as headers)
|
||||
# import warnings
|
||||
|
|
@ -227,6 +227,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
if not self.valid:
|
||||
return False
|
||||
ctype = self.get_content_type()
|
||||
# some content types must be validated with the page content
|
||||
if ctype in ("application/xml", "text/xml"):
|
||||
data = self.get_content()
|
||||
io = StringIO(data)
|
||||
rtype = fileutil.guess_mimetype_read(io.read)
|
||||
if rtype is not None:
|
||||
# XXX side effect
|
||||
ctype = self.content_type = rtype
|
||||
if ctype not in self.ContentMimetypes:
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -87,6 +87,8 @@ class UrlBase (object):
|
|||
"text/plain+chromium": "chromium",
|
||||
"application/x-plist+safari": "safari",
|
||||
"text/vnd.wap.wml": "wml",
|
||||
"application/xml+sitemap": "sitemap",
|
||||
"application/xml+sitemapindex": "sitemapindex",
|
||||
}
|
||||
|
||||
# Read in 16kb chunks
|
||||
|
|
|
|||
|
|
@ -218,6 +218,8 @@ PARSE_CONTENTS = {
|
|||
"text/plain+opera": re.compile(r'^Opera Hotlist'),
|
||||
"text/plain+chromium": re.compile(r'^{\s*"checksum":'),
|
||||
"text/plain+linkchecker": re.compile(r'(?i)^# LinkChecker URL list'),
|
||||
"application/xml+sitemapindex": re.compile(r'(?i)<\?xml[^<]+<sitemapindex\s+'),
|
||||
"application/xml+sitemap": re.compile(r'(?i)<\?xml[^<]+<urlset\s+'),
|
||||
}
|
||||
|
||||
def guess_mimetype (filename, read=None):
|
||||
|
|
|
|||
|
|
@ -49,18 +49,21 @@ def parse_opera (url_data):
|
|||
for url, name, lineno in parse_bookmark_data(url_data.get_content()):
|
||||
url_data.add_url(url, line=lineno, name=name)
|
||||
|
||||
|
||||
def parse_chromium (url_data):
|
||||
"""Parse a Chromium or Google Chrome bookmark file."""
|
||||
from ..bookmarks.chromium import parse_bookmark_data
|
||||
for url, name in parse_bookmark_data(url_data.get_content()):
|
||||
url_data.add_url(url, name=name)
|
||||
|
||||
|
||||
def parse_safari (url_data):
|
||||
"""Parse a Safari bookmark file."""
|
||||
from ..bookmarks.safari import parse_bookmark_data
|
||||
for url, name in parse_bookmark_data(url_data.get_content()):
|
||||
url_data.add_url(url, name=name)
|
||||
|
||||
|
||||
def parse_text (url_data):
|
||||
"""Parse a text file with one url per line; comment and blank
|
||||
lines are ignored."""
|
||||
|
|
@ -87,6 +90,7 @@ def parse_css (url_data):
|
|||
url = strformat.unquote(mo.group("url").strip())
|
||||
url_data.add_url(url, line=lineno, column=column)
|
||||
|
||||
|
||||
def parse_swf (url_data):
|
||||
"""Parse a SWF file for URLs."""
|
||||
linkfinder = linkparse.swf_url_re.finditer
|
||||
|
|
@ -94,6 +98,7 @@ def parse_swf (url_data):
|
|||
url = mo.group()
|
||||
url_data.add_url(url)
|
||||
|
||||
|
||||
def parse_word (url_data):
|
||||
"""Parse a word file for hyperlinks."""
|
||||
if not winutil.has_word():
|
||||
|
|
@ -116,6 +121,7 @@ def parse_word (url_data):
|
|||
except winutil.Error as msg:
|
||||
log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
|
||||
|
||||
|
||||
def parse_wml (url_data):
|
||||
"""Parse into WML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
|
|
@ -161,7 +167,7 @@ def parse_firefox (url_data):
|
|||
"""Parse a Firefox3 bookmark file."""
|
||||
filename = url_data.get_os_filename()
|
||||
for url, name in firefox.parse_bookmark_file(filename):
|
||||
# XXX use add_url
|
||||
url_data = get_url_from(url, url_data.recursion_level+1,
|
||||
url_data.aggregate, parent_url=url_data.url, name=name)
|
||||
url_data.aggregate.urlqueue.put(url_data)
|
||||
url_data.add_url(url, name=name)
|
||||
|
||||
|
||||
from .sitemap import parse_sitemap, parse_sitemapindex
|
||||
|
|
|
|||
77
linkcheck/parser/sitemap.py
Normal file
77
linkcheck/parser/sitemap.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Main functions for link parsing
|
||||
"""
|
||||
from xml.parsers.expat import ParserCreate
|
||||
|
||||
|
||||
class XmlTagUrlParser(object):
    """Parse XML data and add URLs found in the text content of a
    given tag name (e.g. <loc> for sitemaps and sitemap indexes)."""

    def __init__(self, tag):
        """Initialize the expat parser and register element handlers.

        @param tag: XML tag name whose text content holds URLs
        @type tag: unicode
        """
        self.tag = tag
        self.parser = ParserCreate()
        # deliver text content of an element in a single char_data call
        self.parser.buffer_text = True
        # Python 2 only: make expat return unicode strings. The attribute
        # was removed in Python 3, where strings are always unicode.
        if hasattr(self.parser, "returns_unicode"):
            self.parser.returns_unicode = True
        self.parser.StartElementHandler = self.start_element
        self.parser.EndElementHandler = self.end_element
        self.parser.CharacterDataHandler = self.char_data

    def parse(self, url_data):
        """Parse XML content of url_data, calling url_data.add_url()
        for each URL found inside the configured tag.

        @param url_data: object providing get_content() and add_url()
        """
        self.url_data = url_data
        self.in_tag = False
        self.url = u""
        data = url_data.get_content()
        isfinal = True
        self.parser.Parse(data, isfinal)

    def start_element(self, name, attrs):
        """Set tag status for start element and reset the URL buffer."""
        self.in_tag = (name == self.tag)
        self.url = u""

    def end_element(self, name):
        """If end tag is our tag, call add_url()."""
        self.in_tag = False
        if name == self.tag:
            self.add_url()

    def add_url(self):
        """Add a non-empty collected URL to the queue."""
        if self.url:
            self.url_data.add_url(self.url, line=self.parser.CurrentLineNumber,
                                  column=self.parser.CurrentColumnNumber)
        self.url = u""

    def char_data(self, data):
        """If inside the wanted tag, append data to the URL.

        Bug fix: the original tested self.loc, which was initialized to
        False in parse() and never set True (start_element sets in_tag
        instead), so no URL was ever collected."""
        if self.in_tag:
            self.url += data
|
||||
|
||||
|
||||
def parse_sitemap(url_data):
    """Parse XML sitemap data, adding every URL found in a <loc> tag."""
    parser = XmlTagUrlParser(u"loc")
    parser.parse(url_data)
|
||||
|
||||
|
||||
def parse_sitemapindex(url_data):
    """Parse XML sitemap index data, adding every URL found in a <loc> tag."""
    parser = XmlTagUrlParser(u"loc")
    parser.parse(url_data)
|
||||
|
||||
9
tests/checker/data/sitemap.xml
Normal file
9
tests/checker/data/sitemap.xml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<url>
|
||||
<loc>http://www.example.com/</loc>
|
||||
<lastmod>2005-01-01</lastmod>
|
||||
<changefreq>monthly</changefreq>
|
||||
<priority>0.8</priority>
|
||||
</url>
|
||||
</urlset>
|
||||
4
tests/checker/data/sitemapindex.xml
Normal file
4
tests/checker/data/sitemapindex.xml
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<sitemap><loc>http://example.com/foo.xml</loc></sitemap>
|
||||
</sitemapindex>
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2010-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -39,7 +39,9 @@ class TestFileutil (unittest.TestCase):
|
|||
self.assertEqual(linkcheck.fileutil.get_mtime(file_non_existing), 0)
|
||||
|
||||
def mime_test (self, filename, mime_expected):
|
||||
mime = linkcheck.fileutil.guess_mimetype(get_file(filename))
|
||||
absfilename = get_file(filename)
|
||||
with open(absfilename) as fd:
|
||||
mime = linkcheck.fileutil.guess_mimetype(absfilename, read=fd.read)
|
||||
self.assertEqual(mime, mime_expected)
|
||||
|
||||
def test_mime (self):
|
||||
|
|
@ -47,4 +49,6 @@ class TestFileutil (unittest.TestCase):
|
|||
self.mime_test(filename, "application/x-plist+safari")
|
||||
filename = os.path.join("plist_xml", "Bookmarks.plist")
|
||||
self.mime_test(filename, "application/x-plist+safari")
|
||||
self.mime_test("test.wml", "text/vnd.wap.wml")
|
||||
self.mime_test("file.wml", "text/vnd.wap.wml")
|
||||
self.mime_test("sitemap.xml", "application/xml+sitemap")
|
||||
self.mime_test("sitemapindex.xml", "application/xml+sitemapindex")
|
||||
|
|
|
|||
Loading…
Reference in a new issue