mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-27 01:24:42 +00:00
Added XML sitemap logger.
This commit is contained in:
parent
a8bd9c3c89
commit
932a07a9cf
10 changed files with 773 additions and 500 deletions
2
Makefile
2
Makefile
|
|
@ -207,7 +207,7 @@ doccheck:
|
|||
*.py
|
||||
|
||||
filescheck: localbuild
|
||||
for out in text html gml sql csv xml gxml dot; do \
|
||||
for out in text html gml sql csv xml gxml dot sitemap; do \
|
||||
./linkchecker -o$$out -F$$out --complete -r1 -C $(FILESCHECK_URL) || exit 1; \
|
||||
done
|
||||
|
||||
|
|
|
|||
|
|
@ -121,10 +121,10 @@ aufgelistet.
|
|||
Der \fIDATEINAME\fP und \fIENKODIERUNG\fP Teil wird beim Ausgabetyp \fBnone\fP
|
||||
ignoriert, ansonsten wird die Datei überschrieben falls sie existiert. Sie
|
||||
können diese Option mehr als einmal verwenden. Gültige Ausgabetypen sind
|
||||
\fBtext\fP, \fBhtml\fP, \fBsql\fP, \fBcsv\fP, \fBgml\fP, \fBdot\fP, \fBxml\fP, \fBnone\fP oder
|
||||
\fBblacklist\fP. Standard ist keine Dateiausgabe. \fIENKODIERUNG\fP gibt die
|
||||
Ausgabekodierung an,der Standard ist die Enkodierung der ausgewählten
|
||||
Spracheinstellung. Gültige Enkodierungen sind unter
|
||||
\fBtext\fP, \fBhtml\fP, \fBsql\fP, \fBcsv\fP, \fBgml\fP, \fBdot\fP, \fBxml\fP, \fBsitemap\fP,
|
||||
\fBnone\fP oder \fBblacklist\fP. Standard ist keine Dateiausgabe. \fIENKODIERUNG\fP
|
||||
gibt die Ausgabekodierung an, der Standard ist die Enkodierung der
|
||||
ausgewählten Spracheinstellung. Gültige Enkodierungen sind unter
|
||||
\fBhttp://docs.python.org/library/codecs.html#standard\-encodings\fP
|
||||
aufgelistet. Beachten Sie, dass Sie mit der Option \fB\-o none\fP jegliche
|
||||
Ausgaben auf der Konsole verhindern können.
|
||||
|
|
@ -137,8 +137,8 @@ Gebe keine Warnungen aus. Standard ist die Ausgabe von Warnungen.
|
|||
.TP
|
||||
\fB\-o\fP\fITYP\fP[\fB/\fP\fIENKODIERUNG\fP], \fB\-\-output=\fP\fITYP\fP[\fB/\fP\fIENKODIERUNG\fP]
|
||||
Gib Ausgabetyp als \fBtext\fP, \fBhtml\fP, \fBsql\fP, \fBcsv\fP, \fBgml\fP, \fBdot\fP, \fBxml\fP,
|
||||
\fBnone\fP oder \fBblacklist\fP an. Stadard Typ ist \fBtext\fP. Die verschiedenen
|
||||
Ausgabetypen sind unten dokumentiert.
|
||||
\fBsitemap\fP, \fBnone\fP oder \fBblacklist\fP an. Standard Typ ist \fBtext\fP. Die
|
||||
verschiedenen Ausgabetypen sind unten dokumentiert.
|
||||
.br
|
||||
Das \fIENCODING\fP gibt die Ausgabekodierung an. Der Standard ist das der
|
||||
lokalen Spracheinstellung. Gültige Enkodierungen sind unter
|
||||
|
|
@ -277,6 +277,10 @@ Gebe Prüfresultat als GraphXML\-Datei aus.
|
|||
\fBxml\fP
|
||||
Gebe Prüfresultat als maschinenlesbare XML\-Datei aus.
|
||||
.TP
|
||||
\fBsitemap\fP
|
||||
Protokolliere Prüfergebnisse als XML Sitemap, deren Format unter
|
||||
\fBhttp://www.sitemaps.org/protocol.html\fP dokumentiert ist.
|
||||
.TP
|
||||
\fBsql\fP
|
||||
Gebe Prüfresultat als SQL Skript mit INSERT Befehlen aus. Ein
|
||||
Beispielskript, um die initiale SQL Tabelle zu erstellen ist unter
|
||||
|
|
|
|||
|
|
@ -432,6 +432,23 @@ Siehe [text] Sektion weiter oben.
|
|||
.TP
|
||||
\fBencoding=\fP\fISTRING\fP
|
||||
Siehe [text] Sektion weiter oben.
|
||||
.SS [sitemap]
|
||||
.TP
|
||||
\fBfilename=\fP\fISTRING\fP
|
||||
Siehe [text] Sektion weiter oben.
|
||||
.TP
|
||||
\fBparts=\fP\fISTRING\fP
|
||||
Siehe [text] Sektion weiter oben.
|
||||
.TP
|
||||
\fBencoding=\fP\fISTRING\fP
|
||||
Siehe [text] Sektion weiter oben.
|
||||
.TP
|
||||
\fBpriority=\fP\fINUMMER\fP
|
||||
Eine Nummer zwischen 0.0 und 1.0, welche die Priorität festlegt. Die
|
||||
Standardpriorität für die erste URL ist 1.0, für alle Kind\-URLs ist sie 0.5.
|
||||
.TP
|
||||
\fBfrequency=\fP[\fBalways\fP|\fBhourly\fP|\fBdaily\fP|\fBweekly\fP|\fBmonthly\fP|\fByearly\fP|\fBnever\fP]
|
||||
Die Häufigkeit, mit der sich Seiten ändern.
|
||||
.
|
||||
.SH "AUSGABE PARTS"
|
||||
\fBall\fP (für alle Teile)
|
||||
|
|
|
|||
|
|
@ -114,7 +114,8 @@ The \fIFILENAME\fP and \fIENCODING\fP parts of the \fBnone\fP output type
|
|||
will be ignored, else if the file already exists, it will be overwritten.
|
||||
You can specify this option more than once. Valid file output types
|
||||
are \fBtext\fP, \fBhtml\fP, \fBsql\fP,
|
||||
\fBcsv\fP, \fBgml\fP, \fBdot\fP, \fBxml\fP, \fBnone\fP or \fBblacklist\fP.
|
||||
\fBcsv\fP, \fBgml\fP, \fBdot\fP, \fBxml\fP, \fBsitemap\fP, \fBnone\fP or
|
||||
\fBblacklist\fP.
|
||||
Default is no file output. The various output types are documented
|
||||
below. Note that you can suppress all console output
|
||||
with the option \fB\-o none\fP.
|
||||
|
|
@ -127,7 +128,8 @@ Don't log warnings. Default is to log warnings.
|
|||
.TP
|
||||
\fB\-o\fP\fITYPE\fP[\fB/\fP\fIENCODING\fP], \fB\-\-output=\fP\fITYPE\fP[\fB/\fP\fIENCODING\fP]
|
||||
Specify output type as \fBtext\fP, \fBhtml\fP, \fBsql\fP,
|
||||
\fBcsv\fP, \fBgml\fP, \fBdot\fP, \fBxml\fP, \fBnone\fP or \fBblacklist\fP.
|
||||
\fBcsv\fP, \fBgml\fP, \fBdot\fP, \fBxml\fP, \fBsitemap\fP, \fBnone\fP or
|
||||
\fBblacklist\fP.
|
||||
Default type is \fBtext\fP. The various output types are documented
|
||||
below.
|
||||
.br
|
||||
|
|
@ -263,6 +265,10 @@ Log check result as a GraphXML sitemap graph.
|
|||
\fBxml\fP
|
||||
Log check result as machine-readable XML.
|
||||
.TP
|
||||
\fBsitemap\fP
|
||||
Log check result as an XML sitemap whose protocol is documented at
|
||||
\fBhttp://www.sitemaps.org/protocol.html\fP.
|
||||
.TP
|
||||
\fBsql\fP
|
||||
Log check result as SQL script with INSERT commands. An example
|
||||
script to create the initial SQL table is included as create.sql.
|
||||
|
|
|
|||
|
|
@ -426,6 +426,23 @@ See [text] section above.
|
|||
.TP
|
||||
\fBencoding=\fP\fISTRING\fP
|
||||
See [text] section above.
|
||||
.SS \fB[sitemap]\fP
|
||||
.TP
|
||||
\fBfilename=\fP\fISTRING\fP
|
||||
See [text] section above.
|
||||
.TP
|
||||
\fBparts=\fP\fISTRING\fP
|
||||
See [text] section above.
|
||||
.TP
|
||||
\fBencoding=\fP\fISTRING\fP
|
||||
See [text] section above.
|
||||
.TP
|
||||
\fBpriority=\fP\fIFLOAT\fP
|
||||
A number between 0.0 and 1.0 determining the priority. The default
|
||||
priority for the first URL is 1.0, for all child URLs 0.5.
|
||||
.TP
|
||||
\fBfrequency=\fP[\fBalways\fP|\fBhourly\fP|\fBdaily\fP|\fBweekly\fP|\fBmonthly\fP|\fByearly\fP|\fBnever\fP]
|
||||
The frequency with which pages change.
|
||||
.
|
||||
.SH "LOGGER PARTS"
|
||||
\fBall\fP (for all parts)
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -204,6 +204,10 @@ class Configuration (dict):
|
|||
"filename": "linkchecker-out.dot",
|
||||
"encoding": "ascii",
|
||||
}
|
||||
self['sitemap'] = {
|
||||
"filename": "linkchecker-out.sitemap.xml",
|
||||
"encoding": "utf-8",
|
||||
}
|
||||
self['none'] = {}
|
||||
self['output'] = 'text'
|
||||
self['logger'] = None
|
||||
|
|
|
|||
|
|
@ -448,6 +448,7 @@ from .blacklist import BlacklistLogger
|
|||
from .gxml import GraphXMLLogger
|
||||
from .customxml import CustomXMLLogger
|
||||
from .none import NoneLogger
|
||||
from .sitemapxml import SitemapXmlLogger
|
||||
|
||||
|
||||
# default URL logger classes
|
||||
|
|
@ -461,6 +462,7 @@ Loggers = {
|
|||
"blacklist": BlacklistLogger,
|
||||
"gxml": GraphXMLLogger,
|
||||
"xml": CustomXMLLogger,
|
||||
"sitemap": SitemapXmlLogger,
|
||||
"none": NoneLogger,
|
||||
}
|
||||
# for easy printing: a comma separated logger list
|
||||
|
|
|
|||
108
linkcheck/logger/sitemapxml.py
Normal file
108
linkcheck/logger/sitemapxml.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
A sitemap XML logger.
|
||||
"""
|
||||
from . import xmllog
|
||||
|
||||
# Values accepted for the <changefreq> element of a sitemap entry.
ChangeFreqs = (
    'always',
    'hourly',
    'daily',
    'weekly',
    'monthly',
    'yearly',
    'never',
)


class SitemapXmlLogger (xmllog.XMLLogger):
    """
    Sitemap XML output according to http://www.sitemaps.org/protocol.html
    """

    def __init__ (self, **args):
        """
        Initialize the URL prefix, change frequency and priority settings.

        All keyword arguments are handed to the parent logger. Two of
        them are additionally interpreted here:

        frequency -- change frequency written for every URL; must be one
            of ChangeFreqs, defaults to 'daily'.
        priority -- value convertible to a float between 0.0 and 1.0
            that overrides the per-URL default priority.

        Raises ValueError if frequency is not in ChangeFreqs or if
        priority is not a number in the range 0.0 to 1.0.
        """
        super(SitemapXmlLogger, self).__init__(**args)
        # All URLs must have the given prefix, which is determined
        # by the first logged URL.
        self.prefix = None
        frequency = args.get('frequency', 'daily')
        if frequency not in ChangeFreqs:
            raise ValueError("Invalid change frequency %r" % frequency)
        self.frequency = frequency
        # None means "use the built-in defaults" chosen in log_url().
        self.priority = None
        if 'priority' in args:
            priority = float(args['priority'])
            # Reject out-of-range values (and NaN, which fails every
            # comparison) instead of writing an invalid sitemap entry.
            if not 0.0 <= priority <= 1.0:
                raise ValueError("Invalid priority %r" % args['priority'])
            self.priority = priority

    def start_output (self):
        """
        Write the XML header and the opening urlset tag.
        """
        super(SitemapXmlLogger, self).start_output()
        self.xml_start_output()
        attrs = {u"xmlns": u"http://www.sitemaps.org/schemas/sitemap/0.9"}
        self.xml_starttag(u'urlset', attrs)
        self.flush()

    def filter_url(self, url_data):
        """Determine if URL should not be logged in sitemap.

        Only valid HTML pages which start with the first logged URL are
        considered. Returns True if the URL must be skipped, else False.
        """
        if not url_data.valid:
            return True
        if not url_data.url.startswith((u'http:', u'https:')):
            return True
        if not url_data.url.startswith(self.prefix):
            return True
        if url_data.content_type not in ('text/html', "application/xhtml+xml"):
            return True
        return False

    def log_url (self, url_data):
        """
        Log URL data in sitemap format.

        The first URL seen defines the prefix all later URLs must share
        and gets a default priority of 1.0; every later URL defaults
        to 0.5. A configured priority overrides both defaults.
        """
        if self.prefix is None:
            # Remember the first URL as prefix before filtering, so that
            # filter_url() always has a prefix to compare against.
            self.prefix = url_data.url
            priority = 1.0
        else:
            priority = 0.5
        if self.filter_url(url_data):
            return
        if self.priority is not None:
            priority = self.priority
        self.xml_starttag(u'url')
        self.xml_tag(u'loc', url_data.url)
        # TODO: emit a <lastmod> tag once the last modification time is
        # available as part of the official URL data.
        self.xml_tag(u'changefreq', self.frequency)
        self.xml_tag(u'priority', "%.1f" % priority)
        self.xml_endtag(u'url')
        self.flush()

    def end_output (self):
        """
        Write the closing urlset tag, finish the XML output and close
        the file output.
        """
        self.xml_endtag(u"urlset")
        self.xml_end_output()
        self.close_fileoutput()
|
||||
Loading…
Reference in a new issue