Introduce check plugins, use Python requests for http/s connections, and some code cleanups and improvements.

This commit is contained in:
Bastian Kleineidam 2014-03-01 00:12:34 +01:00
parent adc17fbe77
commit 7b34be590b
194 changed files with 4817 additions and 8903 deletions

1
.gitignore vendored
View file

@ -36,3 +36,4 @@ Changelog.linkchecker*
/todo
/alexa*.log
/testresults.txt
/linkchecker.prof

View file

@ -18,11 +18,11 @@ DEBORIGFILE:=$(DEBUILDDIR)/$(LAPPNAME)_$(VERSION).orig.tar.xz
DEBPACKAGEDIR:=$(DEBUILDDIR)/$(APPNAME)-$(VERSION)
FILESCHECK_URL:=http://localhost/~calvin/
SRCDIR:=${HOME}/src
PY_FILES_DIRS:=linkcheck tests *.py linkchecker linkchecker-nagios linkchecker-gui cgi-bin config doc
PY_FILES_DIRS:=linkcheck tests *.py linkchecker linkchecker-nagios linkchecker-gui cgi-bin config doc/examples
MYPY_FILES_DIRS:=linkcheck/HtmlParser linkcheck/checker \
linkcheck/cache linkcheck/configuration linkcheck/director \
linkcheck/htmlutil linkcheck/logger linkcheck/network \
linkcheck/bookmarks \
linkcheck/bookmarks linkcheck/plugins linkcheck/parser \
linkcheck/gui/__init__.py \
linkcheck/gui/checker.py \
linkcheck/gui/contextmenu.py \
@ -192,7 +192,7 @@ filescheck: localbuild
done
update-copyright:
update-copyright --holder="Bastian Kleineidam"
update-copyright --holder="Bastian Kleineidam" $(PY_FILES_DIRS)
releasecheck: check update-certificates
@if egrep -i "xx\.|xxxx|\.xx" doc/changelog.txt > /dev/null; then \

View file

@ -17,7 +17,7 @@ create table linksdb (
name varchar(256),
checktime int,
dltime int,
dlsize int,
size int,
cached int,
level int not null,
modified varchar(256)

View file

@ -131,32 +131,18 @@
#threads=100
# connection timeout in seconds
#timeout=60
# check anchors?
#anchors=0
# Time to wait for checks to finish after the user aborts the first time
# (with Ctrl-C or the abort button).
#aborttimeout=300
# The recursion level determines how many times links inside pages are followed.
#recursionlevel=1
# supply a regular expression for which warnings are printed if found
# in any HTML files.
#warningregex=(Oracle DB Error|Page Not Found|badsite\.example\.com)
# Basic NNTP server. Overrides NNTP_SERVER environment variable.
# warn if size info exceeds given maximum of bytes
#warnsizebytes=2000
#nntpserver=
# check HTML or CSS syntax with the W3C online validator
#checkhtml=1
#checkcss=1
# scan URL content for viruses with ClamAV
#scanvirus=1
# ClamAV config file
#clamavconf=/etc/clamav/clamd.conf
# Send and store cookies
#cookies=1
# parse a cookiefile for initial cookie data
#cookiefile=/path/to/cookies.txt
# User-Agent header string to send to HTTP web servers
# Note that robots.txt are always checked with the original User-Agent.
#useragent=Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
# Pause the given number of seconds between two subsequent connection
# requests to the same host.
#pause=0
# When checking finishes, write a memory dump to a temporary file.
# The memory dump is written both when checking finishes normally
# and when checking gets canceled.
@ -175,22 +161,16 @@
# Check SSL certificates. Set to an absolute pathname for a custom
# CA cert bundle to use. Set to zero to disable SSL certificate verification.
#sslverify=1
# Check that SSL certificates are at least the given number of days valid.
# The number must not be negative.
# If the number of days is zero a warning is printed only for certificates
# that are already expired.
# The default number of days is 14.
#sslcertwarndays=14
# Stop checking new URLs after the given number of seconds. Same as if the
# user hits Ctrl-C after X seconds.
#maxrunseconds=600
# Maximum number of URLs to check. New URLs will not be queued after the
# given number of URLs is checked.
#maxnumurls=153
# Maximum number of connections to one single host for different connection types.
#maxconnectionshttp=10
#maxconnectionshttps=10
#maxconnectionsftp=2
# Maximum number of requests per second to one host.
#maxrequestspersecond=10
# Allowed URL schemes as a comma-separated list.
#allowedschemes=http,https
##################### filtering options ##########################
[filtering]
@ -211,11 +191,12 @@
# recognized warnings). Add a comma-separated list of warnings here
# that prevent a valid URL from being logged. Note that the warning
# will be logged in invalid URLs.
#ignorewarnings=url-unicode-domain,anchor-not-found
#ignorewarnings=url-unicode-domain
# Regular expression to add more URLs recognized as internal links.
# Default is that URLs given on the command line are internal.
#internlinks=^http://www\.example\.net/
# Check external links
#checkextern=1
##################### password authentication ##########################
@ -247,3 +228,30 @@
#loginextrafields=
# name1:value1
# name 2:value 2
############################ Plugins ###################################
#
# uncomment sections to enable plugins
# Check HTML anchors
#[AnchorCheck]
# Add country info to URLs
#[LocationInfo]
# Run W3C syntax checks
#[CssSyntaxCheck]
#[HtmlSyntaxCheck]
# Search for regular expression in page contents
#[RegexCheck]
#warningregex=Oracle Error
# Search for viruses in page contents
#[VirusCheck]
#clamavconf=/etc/clamav/clamd.conf
# Check that SSL certificates are at least the given number of days valid.
#[SslCertificateCheck]
#sslcertwarndays=14

View file

@ -1,3 +1,34 @@
8.7 "" (released xx.xx.2014)
Features:
- checking: Support connection and content check plugins.
- checking: Move lots of custom checks like Antivirus and syntax
checks into plugins (see upgrading.txt for more info).
- checking: Add options to limit the number of requests per second,
allowed URL schemes and maximum file or download size.
Changes:
- checking: Use the Python requests module for HTTP and HTTPS requests.
- logging: Removed download, domains and robots.txt statistics.
- logging: HTML output is now in HTML5.
- checking: Removed 301 warning since 301 redirects are used
a lot without updating the old URL links.
- checking: Disallowed access by robots.txt is an info now, not
a warning. Otherwise it produces a lot of warnings which
is counter-productive.
- checking: Do not check SMTP connections for mailto: URLs anymore.
It resulted in lots of false warnings since spam prevention
usually disallows direct SMTP connections from unrecognized
client IPs.
- checking: Only internal URLs are checked as default. To check
external urls use --check-extern.
Fixes:
- logging: Status was printed every second regardless of the
configured wait time.
- checking: Several speed and memory usage improvements.
8.6 "About Time" (released 8.1.2014)
Changes:

1994
doc/de.po

File diff suppressed because it is too large Load diff

View file

@ -41,16 +41,15 @@ Antivirusprüfung
.IP \(bu
ein Kommandozeilenprogramm, GUI und web interface
.SH BEISPIELE
Der häufigste Gebrauchsfall prüft die angegebene Domäne rekursiv,
inklusive aller einzelnen nach außen zeigenden Verknüpfungen:
\fBlinkchecker http://www.example.net/\fP
The most common use checks the given domain recursively:
\fBlinkchecker http://www.example.com/\fP
.br
Beachten Sie dass dies die komplette Domäne überprüft, welche aus mehreren
tausend URLs bestehen kann. Benutzen Sie die Option \fB\-r\fP, um die
Rekursionstiefe zu beschränken.
.br
Prüfe keine \fBmailto:\fP URLs. Alle anderen Verknüpfungen werden wie üblich geprüft:
\fBlinkchecker \-\-ignore\-url=^mailto: mysite.example.org\fP
Don't check URLs with \fB/secret\fP in its name. All other links are checked as usual:
\fBlinkchecker \-\-ignore\-url=/secret mysite.example.com\fP
.br
Überprüfung einer lokalen HTML Datei unter Unix:
\fBlinkchecker ../bla.html\fP
@ -61,8 +60,8 @@ Prüfe keine \fBmailto:\fP URLs. Alle anderen Verknüpfungen werden wie üblich
Sie können den \fBhttp://\fP URL Anteil weglassen wenn die Domäne mit \fBwww.\fP beginnt:
\fBlinkchecker www.example.com\fP
.br
Sie können den \fBftp://\fP URL Anteil weglassen wenn die Domäne mit \fBftp.\fP beginnt:
\fBlinkchecker \-r0 ftp.example.org\fP
You can skip the \fBftp://\fP url part if the domain starts with \fBftp.\fP:
\fBlinkchecker \-r0 ftp.example.com\fP
.br
Erzeuge einen Sitemap Graphen und konvertiere ihn mit dem graphviz dot Programm:
\fBlinkchecker \-odot \-v www.example.com | dot \-Tps > sitemap.ps\fP
@ -88,19 +87,12 @@ positive Nummer an.
.TP
\fB\-V\fP, \fB\-\-version\fP
Gebe die Version aus und beende das Programm.
.TP
\fB\-\-list\-plugins\fP
Print available check plugins and exit.
.
.SS Ausgabeoptionen
.TP
\fB\-\-check\-css\fP
Prüfe Syntax von CSS URLs mit dem W3C Online Validator.
.TP
\fB\-\-check\-html\fP
Prüfe Syntax von HTML URLs mit dem W3C Online Validator.
.TP
\fB\-\-complete\fP
Gebe alle geprüften URLs aus. Standard ist es, doppelte URLs nur einmal
auszugeben.
.TP
\fB\-D\fP\fINAME\fP, \fB\-\-debug=\fP\fINAME\fP
Gebe Testmeldungen aus für den angegebenen Logger. Verfügbare Logger sind
\fBcmdline\fP, \fBchecking\fP,\fBcache\fP, \fBgui\fP, \fBdns\fP und \fBall\fP. Die Angabe
@ -144,12 +136,6 @@ lokalen Spracheinstellung. Gültige Enkodierungen sind aufgelistet unter
Keine Ausgabe, ein Alias für \fB\-o none\fP. Dies ist nur in Verbindung mit
\fB\-F\fP nützlich.
.TP
\fB\-\-scan\-virus\fP
Prüfe Inhalt von URLs auf Viren mit ClamAV.
.TP
\fB\-\-trace\fP
Trace\-Information ausgeben.
.TP
\fB\-v\fP, \fB\-\-verbose\fP
Gebe alle geprüften URLs aus. Standard ist es, nur fehlerhafte URLs und
Warnungen auszugeben.
@ -168,27 +154,15 @@ werden können, zum Beispiel "(Diese Seite ist umgezogen|Oracle
Applikationsfehler)".
.br
Siehe Abschnitt \fBREGULAR EXPRESSIONS\fP für weitere Infos.
.TP
\fB\-\-warning\-size\-bytes=\fP\fINUMMER\fP
Gebe eine Warnung aus, wenn die Inhaltsgröße bekannt ist und die angegebene
Anzahl von Bytes übersteigt.
.
.SS "Optionen zum Prüfen"
.TP
\fB\-a\fP, \fB\-\-anchors\fP
Prüfe HTTP Ankerverweise. Standard ist, Ankerverweise nicht zu prüfen. Diese
Option aktiviert die Ausgabe der Warnung \fBurl\-anchor\-not\-found\fP.
.TP
\fB\-C\fP, \fB\-\-cookies\fP
Akzeptiere und sende HTTP Cookies nach der RFC 2109. Lediglich Cookies, die
zum ursprünglichen Server zurückgesendet werden, werden akzeptiert.
Gesendete und akzeptierte Cookies werden als zusätzliche Loginformation
aufgeführt.
.TP
\fB\-\-cookiefile=\fP\fIDATEINAME\fP
Lese eine Datei mit Cookie\-Daten. Das Cookie Datenformat wird weiter unten
erklärt.
.TP
\fB\-\-check\-extern\fP
Check external URLs.
.TP
\fB\-\-ignore\-url=\fP\fIREGEX\fP
URLs welche dem angegebenen regulären Ausdruck entsprechen werden ignoriert
und nicht geprüft.
@ -215,11 +189,6 @@ Liest ein Passwort von der Kommandozeile und verwende es für HTTP und FTP
Autorisierung. Für FTP ist das Standardpasswort \fBanonymous@\fP. Für HTTP gibt
es kein Standardpasswort. Siehe auch \fB\-u\fP.
.TP
\fB\-P\fP\fINUMMER\fP, \fB\-\-pause=\fP\fINUMMER\fP
Pausiere die angegebene Anzahl von Sekunden zwischen zwei aufeinander
folgenden Verbindungen zum demselben Rechner. Standard ist keine Pause
zwischen Verbindungen.
.TP
\fB\-r\fP\fINUMMER\fP, \fB\-\-recursion\-level=\fP\fINUMMER\fP
Prüfe rekursiv alle URLs bis zu der angegebenen Tiefe. Eine negative Tiefe
bewirkt unendliche Rekursion. Standard Tiefe ist unendlich.
@ -301,17 +270,13 @@ Eine Cookie\-Datei enthält Standard HTTP\-Header (RFC 2616) mit den folgenden
möglichen Namen:
.
.TP
\fBScheme\fP (optional)
Setzt das Schema für das die Cookies gültig sind; Standardschema ist
\fBhttp\fP.
.TP
\fBHost\fP (erforderlich)
Setzt die Domäne für die die Cookies gültig sind.
.TP
\fBPath\fP (optional)
Gibt den Pfad für den die Cookies gültig sind; Standardpfad ist \fB/\fP.
.TP
\fBSet\-cookie\fP (optional)
\fBSet\-cookie\fP (required)
Setzt den Cookie Name/Wert. Kann mehrmals angegeben werden.
.PP
Mehrere Einträge sind durch eine Leerzeile zu trennen.
@ -325,7 +290,6 @@ Das untige Beispiel sendet zwei Cookies zu allen URLs die mit
Set\-cookie: ID="smee"
Set\-cookie: spam="egg"
Scheme: https
Host: example.org
Set\-cookie: baggage="elitist"; comment="hologram"
@ -362,12 +326,10 @@ beschrieben.
.
.TP
HTTP Verknüpfungen (\fBhttp:\fP, \fBhttps:\fP)
Nach Verbinden zu dem gegebenen HTTP\-Server wird der eingegebene Pfad oder
Query angefordert. Alle Umleitungen werden verfolgt, und falls ein
Benutzer/Passwort angegeben wurde werden diese falls notwendig als
Authorisierung benutzt. Permanent umgezogene Webseiten werden als Warnung
ausgegeben. Alle finalen HTTP Statuscodes, die nicht dem Muster 2xx
entsprechen, werden als Fehler ausgegeben.
After connecting to the given HTTP server the given path or query is
requested. All redirections are followed, and if user/password is given it
will be used as authorization when necessary. All final HTTP status codes
other than 2xx are errors.
.
Der Inhalt von HTML\-Seiten wird rekursiv geprüft.
.TP
@ -418,6 +380,19 @@ Nicht unterstützte Links (\*(lqjavascript:\*(lq, etc.)
Die komplette Liste von erkannten, aber nicht unterstützten Links ist in der
Quelldatei \fBlinkcheck/checker/unknownurl.py\fP. Die bekanntesten davon dürften JavaScript\-Links sein.
.SH PLUGINS
There are two plugin types: connection and content plugins.
.
Connection plugins are run after a successful connection to the URL host.
.
Content plugins are run if the URL type has content (mailto: URLs have no
content for example) and if the check is not forbidden (ie. by HTTP
robots.txt).
.
See \fBlinkchecker \-\-list\-plugins\fP for a list of plugins and their
documentation. All plugins are enabled via the \fBlinkcheckerrc\fP(5)
configuration file.
.SH Rekursion
Bevor eine URL rekursiv geprüft wird, hat diese mehrere Bedingungen zu
erfüllen. Diese werden in folgender Reihenfolge geprüft:

View file

@ -14,52 +14,14 @@ in einem INI\-Format geschrieben.
Die Standarddatei ist \fB~/.linkchecker/linkcheckerrc\fP unter Unix\-,
\fB%HOMEPATH%\e.linkchecker\elinkcheckerrc\fP unter Windows\-Systemen.
.SH EIGENSCHAFTEN
.SS [checking]
.TP
\fBanchors=\fP[\fB0\fP|\fB1\fP]
Prüfe HTTP Ankerverweise. Standard ist, Ankerverweise nicht zu prüfen. Diese
Option aktiviert die Ausgabe der Warnung \fBurl\-anchor\-not\-found\fP.
.br
Kommandozeilenoption: \fB\-\-anchors\fP
.TP
\fBcheckcss=\fP[\fB0\fP|\fB1\fP]
Prüfe Syntax von CSS URLs mit dem W3C Online Validator.
.br
Kommandozeilenoption: \fB\-\-check\-css\fP
.TP
\fBcheckhtml=\fP[\fB0\fP|\fB1\fP]
Prüfe Syntax von HTML URLs mit dem W3C Online Validator.
.br
Kommandozeilenoption: \fB\-\-check\-html\fP
.TP
\fBclamavconf=\fP\fIDateiname\fP
Dateiname von \fBclamd.conf\fP Konfigurationsdatei.
.br
Kommandozeilenoption: keine
.TP
\fBcookiefile=\fP\fIDateiname\fP
Lese eine Datei mit Cookie\-Daten. Das Cookie Datenformat wird in
linkchecker(1) erklärt.
.br
Kommandozeilenoption: \fB\-\-cookiefile\fP
.TP
\fBcookies=\fP[\fB0\fP|\fB1\fP]
Akzeptiere und sende HTTP cookies.
.br
Kommandozeilenoption: \fB\-\-cookies\fP
.TP
\fBdebugmemory=\fP[\fB0\fP|\fB1\fP]
Schreibe einen Speicherabzug in eine temporäre Datei wenn die Prüfung
endet. Der Speicherabzug wird sowohl beim normalen Beenden der Prüfung als
auch wenn die Prüfung abgebrochen wird geschrieben.
.br
Der Speicherabzug funktioniert nur falls das Paket python\-meliae installiert
ist. Andernfalls wird eine Warnung angezeigt mit dem Hinweis dieses Paket zu
installieren.
.br
Kommandozeilenoption: keine
.TP
\fBlocalwebroot=\fP\fISTRING\fP
Beim Prüfen von absoluten URLs in lokalen Dateien wird das angegebene
Wurzelverzeichnis als Basis\-URL benutzt.
@ -78,23 +40,12 @@ korrekte Syntax des Links geprüft.
.br
Kommandozeilenoption: \fB\-\-nntp\-server\fP
.TP
\fBpause=\fP\fINUMBER\fP
Pausiere die angegebene Anzahl von Sekunden zwischen zwei aufeinander
folgenden Verbindungen zum demselben Rechner.
.br
Kommandozeilenoption: \fB\-\-pause\fP
.TP
\fBrecursionlevel=\fP\fINUMBER\fP
Prüfe rekursiv alle URLs bis zu der angegebenen Tiefe. Eine negative Tiefe
bewirkt unendliche Rekursion. Standard Tiefe ist unendlich.
.br
Kommandozeilenoption: \fB\-\-recursion\-level\fP
.TP
\fBscanvirus=\fP[\fB0\fP|\fB1\fP]
Prüfe Inhalt von URLs auf Viren mit ClamAV.
.br
Kommandozeilenoption: \fB\-\-scan\-virus\fP
.TP
\fBthreads=\fP\fINUMBER\fP
Generiere nicht mehr als die angegebene Anzahl von Threads. Standard Anzahl
von Threads ist 100. Um Threads zu deaktivieren, geben Sie eine nicht
@ -108,6 +59,12 @@ Setze den Timeout für TCP\-Verbindungen in Sekunden. Der Standard Timeout ist
.br
Kommandozeilenoption: \fB\-\-timeout\fP
.TP
\fBaborttimeout=\fP\fINUMBER\fP
Time to wait for checks to finish after the user aborts the first time (with
Ctrl\-C or the abort button). The default abort timeout is 300 seconds.
.br
Kommandozeilenoption: keine
.TP
\fBuseragent=\fP\fISTRING\fP
Gibt den User\-Agent an, der zu HTTP\-Servern geschickt wird,
z.B. "Mozilla/4.0". Der Standard ist "LinkChecker/X.Y", wobei X.Y die
@ -115,23 +72,6 @@ aktuelle Version von LinkChecker ist.
.br
Kommandozeilenoption: \fB\-\-user\-agent\fP
.TP
\fBwarningregex=\fP=\fIREGEX\fP
Definieren Sie einen regulären Ausdruck der eine Warnung ausgibt falls er
auf den Inhalt einer geprüften URL zutrifft. Dies gilt nur für gültige
Seiten deren Inhalt wir bekommen können.
.br
Benutzen Sie dies, um nach Seiten zu suchen, welche bestimmte Fehler
enthalten, zum Beispiel "Diese Seite wurde entfernt" oder "Oracle
Applikationsfehler".
.br
Kommandozeilenoption: \fB\-\-warning\-regex\fP
.TP
\fBwarnsizebytes=\fP\fINUMBER\fP
Gebe eine Warnung aus, wenn die Inhaltsgröße bekannt ist und die angegebene
Anzahl von Bytes übersteigt.
.br
Kommandozeilenoption: \fB\-\-warning\-size\-bytes\fP
.TP
\fBsslverify=\fP[\fB0\fP|\fB1\fP|\fIdateiname\fP]
Falls der Wert Null ist werden SSL Zertifikate nicht überprüft. Falls er auf
Eins gesetzt wird (der Standard) werden SSL Zertifikate mit der gelieferten
@ -140,15 +80,6 @@ zur Prüfung verwendet.
.br
Kommandozeilenoption: keine
.TP
\fBwarnsslcertdaysvalid=\fP\fINUMBER\fP
Prüfe ob SSL\-Zertifikate mindestens die angegebene Anzahl an Tagen gültig
sind. Die Anzahl darf nicht negativ sein. Falls die Anzahl Null ist wird
eine Warnung nur für Zertifikate ausgegeben, die schon abgelaufen sind.
.br
Die Standardanzahl an Tagen ist 14.
.br
Kommandozeilenoption: keine
.TP
\fBmaxrunseconds=\fP\fINUMBER\fP
Hört nach der angegebenen Anzahl von Sekunden auf, neue URLs zu prüfen. Dies
ist dasselbe als wenn der Benutzer nach der gegebenen Anzahl von Sekunden
@ -167,26 +98,11 @@ Standard ist alle URLs anzunehmen und zu prüfen.
.br
Kommandozeilenoption: keine
.TP
\fBmaxconnectionshttp=\fP\fINUMBER\fP
Maximale Anzahl an HTTP\-Verbindungen.
.br
Der Standard ist 10.
.br
Kommandozeilenoption: keine
\fBmaxrequestspersecond=\fP\fINUMBER\fP
Limit the maximum number of requests per second to one host.
.TP
\fBmaxconnectionshttps=\fP\fINUMBER\fP
Maximale Anzahl an HTTPS\-Verbindungen.
.br
Der Standard ist 10.
.br
Kommandozeilenoption: keine
.TP
\fBmaxconnectionsftp=\fP\fINUMBER\fP
Maximale Anzahl an FTP\-Verbindungen.
.br
Der Standard ist 2.
.br
Kommandozeilenoption: keine
\fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP...]
Allowed URL schemes as comma\-separated list.
.SS [filtering]
.TP
\fBignore=\fP\fIREGEX\fP (MULTILINE)
@ -212,6 +128,11 @@ Prüfe URLs die auf den regulären Ausdruck zutreffen, aber führe keine
Rekursion durch.
.br
Kommandozeilenoption: \fB\-\-no\-follow\-url\fP
.TP
\fBcheckextern=\fP[\fB0\fP|\fB1\fP]
Check external links. Default is to check internal links only.
.br
Command line option: \fB\-\-check\-extern\fP
.SS [authentication]
.TP
\fBentry=\fP\fIREGEX\fP \fIBENUTZER\fP [\fIPASSWORT\fP] (MULTILINE)
@ -232,9 +153,8 @@ wird Authentifizierung für http[s] und ftp Verknüpfungen benutzt.
Kommandozeilenoption: \fB\-u\fP, \fB\-p\fP
.TP
\fBloginurl=\fP\fIURL\fP
Eine Anmelde\-URL, die vor der Prüfung besucht wird. Benötigt einen Eintrag
zur Authentifizierung und impliziert die Benutzung von Cookies, weil die
meisten Anmeldungen heutzutage Cookies benutzen.
A login URL to be visited before checking. Also needs authentication data
set for it.
.TP
\fBloginuserfield=\fP\fINAME\fP
Der Name für das Benutzer CGI\-Feld. Der Standardname ist \fBlogin\fP.
@ -247,12 +167,6 @@ Optional zusätzliche CGI Namen/Werte\-Paare. Die Default\-Werte werden
automatisch übermittelt.
.SS [output]
.TP
\fBcomplete=\fP[\fB0\fP|\fB1\fP]
Falls gesetzt, gebe alle geprüften URLs aus, sogar Duplikate. Standard ist
es, URLs nur einmal auszugeben.
.br
Kommandozeilenoption: \fB\-\-complete\fP
.TP
\fBdebug=\fP\fISTRING\fP[\fB,\fP\fISTRING\fP...]
Gebe Testmeldungen aus für den angegebenen Logger. Verfügbare Logger sind
\fBcmdline\fP, \fBchecking\fP,\fBcache\fP, \fBgui\fP, \fBdns\fP, \fBthread\fP und \fBall\fP. Die
@ -528,6 +442,52 @@ ignoriert, müssen aber eingerückt sein.
[filtering]
ignorewarnings=http\-moved\-permanent
.SH PLUGINS
All plugins have a separate section. If the section appears in the
configuration file the plugin is enabled. Some plugins read extra options
in their section.
.SS [AnchorCheck]
Checks validity of HTML anchors.
.SS [LocationInfo]
Adds the country and if possible city name of the URL host as info. Needs
GeoIP or pygeoip and a local country or city lookup DB installed.
.SS [RegexCheck]
Define a regular expression which prints a warning if it matches any content
of the checked link. This applies only to valid pages, so we can get their
content.
Use this to check for pages that contain some form of error message, for
example 'This page has moved' or 'Oracle Application error'.
Man beachte, dass mehrere Werte in dem regulären Ausdruck kombiniert
werden können, zum Beispiel "(Diese Seite ist umgezogen|Oracle
Applikationsfehler)".
.SS [SslCertificateCheck]
Check SSL certificate expiration date. Only internal https: links will be
checked. A domain will only be checked once to avoid duplicate warnings.
.TP
\fBsslcertwarndays=\fP\fINUMBER\fP
Configures the expiration warning time in days.
.SS [HtmlSyntaxCheck]
Check the syntax of HTML pages with the online W3C HTML validator. See
http://validator.w3.org/docs/api.html.
.SS [CssSyntaxCheck]
Check the syntax of CSS pages with the online W3C CSS validator. See
http://jigsaw.w3.org/css\-validator/manual.html#expert.
.SS [VirusCheck]
Checks the page content for virus infections with clamav. A local clamav
daemon must be installed.
.TP
\fBclamavconf=\fP\fIDateiname\fP
Dateiname von \fBclamd.conf\fP Konfigurationsdatei.
.
.SH WARNUNGEN
Die folgenden Warnungen werden vom Konfigurationseintrag 'ignorewarnings'
@ -543,57 +503,21 @@ Der file: Pfad ist nicht derselbe wie der Systempfad.
\fBftp\-missing\-slash\fP
Der ftp: URL fehlt ein abschließender Schrägstrich.
.TP
\fBhttp\-auth\-unknonwn\fP
Nicht unterstützte HTTP Authentifizierungsmethode.
.TP
\fBhttp\-cookie\-store\-error\fP
Ein Fehler trat auf während des Speicherns eines Cookies.
.TP
\fBhttp\-decompress\-error\fP
Ein Fehler trat beim Dekomprimieren des URL Inhalts auf.
.TP
\fBhttp\-empty\-content\fP
Die URL besitzt keinen Inhalt.
.TP
\fBhttp\-moved\-permanent\fP
Die URL wurde dauerhaft verschoben.
.TP
\fBhttp\-robots\-denied\fP
Die http: URL\-Überprüfung wurde verweigert.
.TP
\fBhttp\-unsupported\-encoding\fP
Der URL\-Inhalt ist in einer unbekannten Kodierung verfasst.
.TP
\fBhttp\-wrong\-redirect\fP
Die URL wurde zu einem anderen URL\-Typ umgeleitet.
.TP
\fBhttps\-certificate\-error\fP
Das SSL\-Zertifikat ist ungültig oder abgelaufen.
.TP
\fBignore\-url\fP
Die URL wurde ignoriert.
.TP
\fBmail\-no\-connection\fP
Es konnte keine Verbindung zu einem MX\-Rechner hergestellt werden.
.TP
\fBmail\-no\-mx\-host\fP
Der MX Mail\-Rechner konnte nicht gefunden werden.
.TP
\fBmail\-unverified\-address\fP
Die mailto: Addresse konnte nicht überprüft werden.
.TP
\fBnntp\-no\-newsgroup\fP
Die NNTP Nachrichtengruppe konnte nicht gefunden werden.
.TP
\fBnntp\-no\-server\fP
Es wurde kein NNTP Server gefunden.
.TP
\fBurl\-anchor\-not\-found\fP
URL Anker wurde nicht gefunden.
.TP
\fBurl\-content\-size\-unequal\fP
Der URL Inhaltsgrößenangabe und die Download\-Größe sind unterschiedlich.
.TP
\fBurl\-content\-size\-zero\fP
Der URL Inhaltsgrößenangabe ist Null.
.TP
@ -609,9 +533,6 @@ Konnte den Inhalt der URL nicht bekommen.
\fBurl\-obfuscated\-ip\fP
Die IP\-Adresse ist verschleiert.
.TP
\fBurl\-warnregex\-found\fP
Der reguläre Ausdruck für Warnungen wurde in den URL Inhalten gefunden.
.TP
\fBurl\-whitespace\fP
Die URL %(url)s enthält Leerzeichen am Anfang oder Ende.

View file

@ -33,15 +33,14 @@ Antivirus check
.IP \(bu
a command line, GUI and web interface
.SH EXAMPLES
The most common use checks the given domain recursively, plus any
URL pointing outside of the domain:
\fBlinkchecker http://www.example.net/\fP
The most common use checks the given domain recursively:
\fBlinkchecker http://www.example.com/\fP
.br
Beware that this checks the whole site which can have thousands of URLs.
Use the \fB\-r\fP option to restrict the recursion depth.
.br
Don't check \fBmailto:\fP URLs. All other links are checked as usual:
\fBlinkchecker \-\-ignore\-url=^mailto: mysite.example.org\fP
Don't check URLs with \fB/secret\fP in its name. All other links are checked as usual:
\fBlinkchecker \-\-ignore\-url=/secret mysite.example.com\fP
.br
Checking a local HTML file on Unix:
\fBlinkchecker ../bla.html\fP
@ -53,7 +52,7 @@ You can skip the \fBhttp://\fP url part if the domain starts with \fBwww.\fP:
\fBlinkchecker www.example.com\fP
.br
You can skip the \fBftp://\fP url part if the domain starts with \fBftp.\fP:
\fBlinkchecker \-r0 ftp.example.org\fP
\fBlinkchecker \-r0 ftp.example.com\fP
.br
Generate a sitemap graph and convert it with the graphviz dot utility:
\fBlinkchecker \-odot \-v www.example.com | dot \-Tps > sitemap.ps\fP
@ -77,18 +76,12 @@ of threads is 100. To disable threading specify a non-positive number.
.TP
\fB\-V\fP, \fB\-\-version\fP
Print version and exit.
.TP
\fB\-\-list\-plugins\fP
Print available check plugins and exit.
.
.SS Output options
.TP
\fB\-\-check\-css\fP
Check syntax of CSS URLs with the W3C online validator.
.TP
\fB\-\-check\-html\fP
Check syntax of HTML URLs with the W3C online validator.
.TP
\fB\-\-complete\fP
Log all URLs, including duplicates. Default is to log duplicate URLs only once.
.TP
\fB\-D\fP\fISTRING\fP, \fB\-\-debug=\fP\fISTRING\fP
Print debugging output for the given logger.
Available loggers are \fBcmdline\fP, \fBchecking\fP,
@ -139,12 +132,6 @@ that of your locale. Valid encodings are listed at
Quiet operation, an alias for \fB\-o none\fP.
This is only useful with \fB\-F\fP.
.TP
\fB\-\-scan\-virus\fP
Scan content of URLs for viruses with ClamAV.
.TP
\fB\-\-trace\fP
Print tracing information.
.TP
\fB\-v\fP, \fB\-\-verbose\fP
Log all checked URLs. Default is to log only errors and warnings.
.TP
@ -160,27 +147,15 @@ Note that multiple values can be combined in the regular expression,
for example "(This page has moved|Oracle Application error)".
.br
See section \fBREGULAR EXPRESSIONS\fP for more info.
.TP
\fB\-\-warning\-size\-bytes=\fP\fINUMBER\fP
Print a warning if content size info is available and exceeds the given
number of \fIbytes\fP.
.
.SS Checking options
.TP
\fB\-a\fP, \fB\-\-anchors\fP
Check HTTP anchor references. Default is not to check anchors.
This option enables logging of the warning \fBurl\-anchor\-not\-found\fP.
.TP
\fB\-C\fP, \fB\-\-cookies\fP
Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information.
.TP
\fB\-\-cookiefile=\fP\fIFILENAME\fP
Read a file with initial cookie data. The cookie data
format is explained below.
.TP
\fB\-\-check\-extern\fP
Check external URLs.
.TP
\fB\-\-ignore\-url=\fP\fIREGEX\fP
URLs matching the given regular expression will be ignored and not checked.
.br
@ -206,10 +181,6 @@ Read a password from console and use it for HTTP and FTP authorization.
For FTP the default password is \fBanonymous@\fP. For HTTP there is
no default password. See also \fB\-u\fP.
.TP
\fB\-P\fP\fINUMBER\fP, \fB\-\-pause=\fP\fINUMBER\fP
Pause the given number of seconds between two subsequent connection
requests to the same host. Default is no pause between requests.
.TP
\fB\-r\fP\fINUMBER\fP, \fB\-\-recursion\-level=\fP\fINUMBER\fP
Check recursively all links up to given depth.
A negative depth will enable infinite recursion.
@ -291,16 +262,13 @@ A cookie file contains standard HTTP header (RFC 2616) data with the
following possible names:
.
.TP
\fBScheme\fP (optional)
Sets the scheme the cookies are valid for; default scheme is \fBhttp\fP.
.TP
\fBHost\fP (required)
Sets the domain the cookies are valid for.
.TP
\fBPath\fP (optional)
Gives the path the cookies are valid for; default path is \fB/\fP.
.TP
\fBSet-cookie\fP (optional)
\fBSet-cookie\fP (required)
Set cookie name/value. Can be given more than once.
.PP
Multiple entries are separated by a blank line.
@ -314,7 +282,6 @@ with \fBhttps://example.org/\fP:
Set-cookie: ID="smee"
Set-cookie: spam="egg"
Scheme: https
Host: example.org
Set-cookie: baggage="elitist"; comment="hologram"
@ -353,7 +320,6 @@ After connecting to the given HTTP server the given path
or query is requested. All redirections are followed, and
if user/password is given it will be used as authorization
when necessary.
Permanently moved pages issue a warning.
All final HTTP status codes other than 2xx are errors.
.
HTML page contents are checked for recursion.
@ -412,6 +378,20 @@ Unsupported links (``javascript:``, etc.)
in the \fBlinkcheck/checker/unknownurl.py\fP source file.
The most prominent of them should be JavaScript links.
.SH PLUGINS
There are two plugin types: connection and content plugins.
.
Connection plugins are run after a successful connection to the
URL host.
.
Content plugins are run if the URL type has content
(mailto: URLs have no content for example) and if the check is not
forbidden (ie. by HTTP robots.txt).
.
See \fBlinkchecker \-\-list\-plugins\fP for a list of plugins and
their documentation. All plugins are enabled via the \fBlinkcheckerrc\fP(5)
configuration file.
.SH RECURSION
Before descending recursively into a URL, it has to fulfill several
conditions. They are checked in this order:

View file

@ -9,51 +9,14 @@ The file is written in an INI-style format.
The default file location is \fB~/.linkchecker/linkcheckerrc\fP on Unix,
\fB%HOMEPATH%\\.linkchecker\\linkcheckerrc\fP on Windows systems.
.SH SETTINGS
.SS \fB[checking]\fP
.TP
\fBanchors=\fP[\fB0\fP|\fB1\fP]
Check HTTP anchor references. Default is not to check anchors.
This option enables logging of the warning \fBurl\-anchor\-not\-found\fP.
.br
Command line option: \fB\-\-anchors\fP
.TP
\fBcheckcss=\fP[\fB0\fP|\fB1\fP]
Check syntax of CSS URLs with the W3C online validator.
.br
Command line option: \fB\-\-check\-css\fP
.TP
\fBcheckhtml=\fP[\fB0\fP|\fB1\fP]
Check syntax of HTML URLs with the W3C online validator.
.br
Command line option: \fB\-\-check\-html\fP
.TP
\fBclamavconf=\fP\fIfilename\fP
Filename of \fBclamd.conf\fP config file.
.br
Command line option: none
.TP
\fBcookiefile=\fP\fIfilename\fP
Read a file with initial cookie data. The cookie data
format is explained in linkchecker(1).
.br
Command line option: \fB\-\-cookiefile\fP
.TP
\fBcookies=\fP[\fB0\fP|\fB1\fP]
Accept and send HTTP cookies.
.br
Command line option: \fB\-\-cookies\fP
.TP
\fBdebugmemory=\fP[\fB0\fP|\fB1\fP]
When checking finishes, write a memory dump to a temporary file.
The memory dump is written both when checking finishes normally
and when checking gets canceled.
.br
The memory dump only works if the python-meliae package is installed.
Otherwise a warning is printed to install it.
.br
Command line option: none
.TP
\fBlocalwebroot=\fP\fISTRING\fP
When checking absolute URLs inside local files, the given root directory
is used as base URL.
@ -71,12 +34,6 @@ only the syntax of the link is checked.
.br
Command line option: \fB\-\-nntp\-server\fP
.TP
\fBpause=\fP\fINUMBER\fP
Pause the given number of seconds between two subsequent connection
requests to the same host.
.br
Command line option: \fB\-\-pause\fP
.TP
\fBrecursionlevel=\fP\fINUMBER\fP
Check recursively all links up to given depth.
A negative depth will enable infinite recursion.
@ -84,11 +41,6 @@ Default depth is infinite.
.br
Command line option: \fB\-\-recursion\-level\fP
.TP
\fBscanvirus=\fP[\fB0\fP|\fB1\fP]
Scan content of URLs for viruses with ClamAV.
.br
Command line option: \fB\-\-scan\-virus\fP
.TP
\fBthreads=\fP\fINUMBER\fP
Generate no more than the given number of threads. Default number
of threads is 100. To disable threading specify a non-positive number.
@ -101,6 +53,13 @@ is 60 seconds.
.br
Command line option: \fB\-\-timeout\fP
.TP
\fBaborttimeout=\fP\fINUMBER\fP
Time to wait for checks to finish after the user aborts the first time
(with Ctrl-C or the abort button).
The default abort timeout is 300 seconds.
.br
Command line option: \fB\-\-timeout\fP
.TP
\fBuseragent=\fP\fISTRING\fP
Specify the User-Agent string to send to the HTTP server, for example
"Mozilla/4.0". The default is "LinkChecker/X.Y" where X.Y is the current
@ -108,22 +67,6 @@ version of LinkChecker.
.br
Command line option: \fB\-\-user\-agent\fP
.TP
\fBwarningregex=\fP=\fIREGEX\fP
Define a regular expression which prints a warning if it matches any
content of the checked link.
This applies only to valid pages, so we can get their content.
.br
Use this to check for pages that contain some form of error, for example
"This page has moved" or "Oracle Application Server error".
.br
Command line option: \fB\-\-warning\-regex\fP
.TP
\fBwarnsizebytes=\fP\fINUMBER\fP
Print a warning if content size info is available and exceeds the given
number of \fIbytes\fP.
.br
Command line option: \fB\-\-warning\-size\-bytes\fP
.TP
\fBsslverify=\fP[\fB0\fP|\fB1\fP|\fIfilename\fP]
If set to zero disables SSL certificate checking.
If set to one (the default) enables SSL certificate checking with
@ -132,16 +75,6 @@ will be used as the certificate file.
.br
Command line option: none
.TP
\fBwarnsslcertdaysvalid=\fP\fINUMBER\fP
Check that SSL certificates are at least the given number of days valid.
The number must not be negative.
If the number of days is zero a warning is printed only for certificates
that are already expired.
.br
The default number of days is 14.
.br
Command line option: none
.TP
\fBmaxrunseconds=\fP\fINUMBER\fP
Stop checking new URLs after the given number of seconds. Same as if the
user stops (by hitting Ctrl-C or clicking the abort button in the GUI)
@ -159,26 +92,11 @@ The default is to queue and check all URLs.
.br
Command line option: none
.TP
\fBmaxconnectionshttp=\fP\fINUMBER\fP
Maximum number of connections to HTTP servers.
.br
The default is 10.
.br
Command line option: none
\fBmaxrequestspersecond=\fP\fINUMBER\fP
Limit the maximum number of requests per second to one host.
.TP
\fBmaxconnectionshttps=\fP\fINUMBER\fP
Maximum number of connections to HTTPS servers.
.br
The default is 10.
.br
Command line option: none
.TP
\fBmaxconnectionsftp=\fP\fINUMBER\fP
Maximum number of connections to FTP servers.
.br
The default is 2.
.br
Command line option: none
\fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP...]
Allowed URL schemes as comma-separated list.
.SS \fB[filtering]\fP
.TP
\fBignore=\fP\fIREGEX\fP (MULTILINE)
@ -203,6 +121,11 @@ Check but do not recurse into URLs matching the given regular
expressions.
.br
Command line option: \fB\-\-no\-follow\-url\fP
.TP
\fBcheckextern=\fP[\fB0\fP|\fB1\fP]
Check external links. Default is to check internal links only.
.br
Command line option: \fB\-\-checkextern\fP
.SS \fB[authentication]\fP
.TP
\fBentry=\fP\fIREGEX\fP \fIUSER\fP [\fIPASS\fP] (MULTILINE)
@ -224,8 +147,7 @@ Command line option: \fB\-u\fP, \fB\-p\fP
.TP
\fBloginurl=\fP\fIURL\fP
A login URL to be visited before checking. Also needs authentication
data set for it, and implies using cookies because most logins use
cookies nowadays.
data set for it.
.TP
\fBloginuserfield=\fP\fISTRING\fP
The name of the user CGI field. Default name is \fBlogin\fP.
@ -238,12 +160,6 @@ Optionally any additional CGI name/value pairs. Note that the default
values are submitted automatically.
.SS \fB[output]\fP
.TP
\fBcomplete=\fP[\fB0\fP|\fB1\fP]
If set log all checked URLs, even duplicates. Default is to log
duplicate URLs only once.
.br
Command line option: \fB\-\-complete\fP
.TP
\fBdebug=\fP\fISTRING\fP[\fB,\fP\fISTRING\fP...]
Print debugging output for the given loggers.
Available loggers are \fBcmdline\fP, \fBchecking\fP,
@ -524,6 +440,53 @@ though they must still be indented.
[filtering]
ignorewarnings=http-moved-permanent
.SH PLUGINS
All plugins have a separate section. If the section
appears in the configuration file the plugin is enabled.
Some plugins read extra options in their section.
.SS \fB[AnchorCheck]\fP
Checks validity of HTML anchors.
.SS \fB[LocationInfo]\fP
Adds the country and if possible city name of the URL host as info.
Needs GeoIP or pygeoip and a local country or city lookup DB installed.
.SS \fB[RegexCheck]\fP
Define a regular expression which prints a warning if it matches
any content of the checked link. This applies only to valid pages,
so we can get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application error'.
Note that multiple values can be combined in the regular expression,
for example "(This page has moved|Oracle Application error)".
.SS \fB[SslCertificateCheck]\fP
Check SSL certificate expiration date. Only internal https: links
will be checked. A domain will only be checked once to avoid duplicate
warnings.
.TP
\fBsslcertwarndays=\fP\fINUMBER\fP
Configures the expiration warning time in days.
.SS \fB[HtmlSyntaxCheck]\fP
Check the syntax of HTML pages with the online W3C HTML validator.
See http://validator.w3.org/docs/api.html.
.SS \fB[CssSyntaxCheck]\fP
Check the syntax of CSS stylesheets with the online W3C CSS validator.
See http://jigsaw.w3.org/css-validator/manual.html#expert.
.SS \fB[VirusCheck]\fP
Checks the page content for virus infections with clamav.
A local clamav daemon must be installed.
.TP
\fBclamavconf=\fP\fIfilename\fP
Filename of \fBclamd.conf\fP config file.
.
.SH WARNINGS
The following warnings are recognized in the 'ignorewarnings' config
@ -539,57 +502,21 @@ The file: path is not the same as the system specific path.
\fBftp-missing-slash\fP
The ftp: URL is missing a trailing slash.
.TP
\fBhttp-auth-unknonwn\fP
Unsupported HTTP authentication method.
.TP
\fBhttp-cookie-store-error\fP
An error occurred while storing a cookie.
.TP
\fBhttp-decompress-error\fP
An error occurred while decompressing the URL content.
.TP
\fBhttp-empty-content\fP
The URL had no content.
.TP
\fBhttp-moved-permanent\fP
The URL has moved permanently.
.TP
\fBhttp-robots-denied\fP
The http: URL checking has been denied.
.TP
\fBhttp-unsupported-encoding\fP
The URL content is encoded with an unknown encoding.
.TP
\fBhttp-wrong-redirect\fP
The URL has been redirected to an URL of a different type.
.TP
\fBhttps-certificate-error\fP
The SSL certificate is invalid or expired.
.TP
\fBignore-url\fP
The URL has been ignored.
.TP
\fBmail-no-connection\fP
No connection to a MX host could be established.
.TP
\fBmail-no-mx-host\fP
The mail MX host could not be found.
.TP
\fBmail-unverified-address\fP
The mailto: address could not be verified.
.TP
\fBnntp-no-newsgroup\fP
The NNTP newsgroup could not be found.
.TP
\fBnntp-no-server\fP
No NNTP server was found.
.TP
\fBurl-anchor-not-found\fP
URL anchor was not found.
.TP
\fBurl-content-size-unequal\fP
The URL content size and download size are unequal.
.TP
\fBurl-content-size-zero\fP
The URL content size is zero.
.TP
@ -605,9 +532,6 @@ Could not get the content of the URL.
\fBurl-obfuscated-ip\fP
The IP is obfuscated.
.TP
\fBurl-warnregex-found\fP
The warning regular expression was found in the URL contents.
.TP
\fBurl-whitespace\fP
The URL contains leading or trailing whitespace.

View file

@ -50,7 +50,9 @@ First, install the required software.
On Debian or Ubuntu systems, install the package qt4-dev-tools.
On Redhat systems, install the package qt-devel.
4. *Optional, for bash-completion:*
4. Python requests module from https://pypi.python.org/pypi/requests
5. *Optional, for bash-completion:*
argcomplete Python module from https://pypi.python.org/pypi/argcomplete
6. *Optional, for displaying country codes:*

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,43 @@
Upgrading
=========
Migrating from 8.x to 9.0
-------------------------
The Python requests module is now required.
Several checks have been moved to plugins (see below).
Plugins have to be enabled in the configuration file.
The following commandline and configuration options have been deprecated
and do not have any effect:
--anchors, anchors: moved to plugin AnchorCheck
--check-css, checkcss: moved to plugin CssSyntaxCheck
--check-html, checkhtml: moved to plugin HtmlSyntaxCheck
--complete: feature removed
--cookies, sendcookies, storecookies: cookies are sent/stored per default
--pause, wait: replaced with numrequestspersecond
--scan-virus, scanvirus: moved to plugin VirusCheck
--warning-regex: moved to plugin RegexCheck
--warning-size-bytes, warnsizebytes: feature removed
warnsslcertdaysvalid: moved to plugin SslCertificateCheck
The "html" logger generates HTML5 documents now.
The following warnings have been removed:
- http-auth-unauthorized: removed
- http-auth-unknonwn: removed
- http-decompress-error: removed
- http-robots-denied: downgraded to info
- http-moved-permanent: downgraded to info
- http-unsupported-encoding: removed
- https-certificate-error: is an error now
- mail-unverified-address: removed
- mail-no-connection: removed
- syntax-css: moved to plugin
- syntax-html: moved to plugin
- url-anchor-not-found: moved to plugin
- url-content-size-unequal: removed
- url-warnregex-found: moved to plugin
Migrating from 8.4 to 8.5
--------------------------
Custom output loggers have been changed.

View file

@ -21,8 +21,9 @@ Features
- honors robots.txt exclusion protocol
- Cookie support
- HTML5 support
- HTML and CSS syntax check
- Antivirus check
- [Plugin support](plugins.html)
allowing custom page checks. Currently available are
HTML and CSS syntax checks, Antivirus checks, and more.
- Different interfaces: command line, GUI and web interface
- ... and a lot more check options documented in the
[manual page](man1/linkchecker.1.html).

View file

@ -0,0 +1,11 @@
title: Plugin support
---
Plugin documentation
=====================
Standard plugins
=================
Custom plugins
===============

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -2612,7 +2612,7 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
#define YY_MORE_ADJ 0
#define YY_RESTORE_YY_MORE_OFFSET
#line 1 "htmllex.l"
/* Copyright (C) 2000-2012 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -2951,6 +2951,10 @@ int yyget_lineno (yyscan_t yyscanner );
void yyset_lineno (int line_number ,yyscan_t yyscanner );
int yyget_column (yyscan_t yyscanner );
void yyset_column (int column_no ,yyscan_t yyscanner );
/* %if-bison-bridge */
YYSTYPE * yyget_lval (yyscan_t yyscanner );
@ -3132,7 +3136,7 @@ YY_DECL
/*********************** EOF ************************/
#line 3135 "htmllex.c"
#line 3139 "htmllex.c"
yylval = yylval_param;
@ -4683,7 +4687,7 @@ YY_RULE_SETUP
#line 1091 "htmllex.l"
ECHO;
YY_BREAK
#line 4686 "htmllex.c"
#line 4690 "htmllex.c"
case YY_END_OF_BUFFER:
{

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2000-2012 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2009 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -68,7 +68,7 @@
/* Line 268 of yacc.c */
#line 1 "htmlparse.y"
/* Copyright (C) 2000-2011 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
%{
/* Copyright (C) 2000-2011 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2000-2010 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -68,12 +68,14 @@ LOG_CHECK = "linkcheck.check"
LOG_CACHE = "linkcheck.cache"
LOG_GUI = "linkcheck.gui"
LOG_THREAD = "linkcheck.thread"
LOG_PLUGIN = "linkcheck.plugin"
lognames = {
"cmdline": LOG_CMDLINE,
"checking": LOG_CHECK,
"cache": LOG_CACHE,
"gui": LOG_GUI,
"thread": LOG_THREAD,
"plugin": LOG_PLUGIN,
"all": LOG_ROOT,
}
lognamelist = ", ".join(repr(name) for name in lognames)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011-2012 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011-2012 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011-2012 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011-2012 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2009 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,223 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Store and retrieve open connections.
"""
import time
from .. import log, LOG_CACHE
from ..decorators import synchronized
from ..lock import get_lock, get_semaphore
from ..containers import enum
_lock = get_lock("connection")
_wait_lock = get_lock("connwait")
ConnectionTypes = ("ftp", "http", "https")
ConnectionState = enum("available", "busy")
def get_connection_id (connection):
    """Compute a key that uniquely identifies the given connection object.

    The object identity is unique for the lifetime of the connection,
    which is sufficient since the pool only tracks open connections.
    """
    return id(connection)
def is_expired (curtime, conn_data):
    """Tell whether a pooled connection is expired or about to expire.

    conn_data is the [connection, state, expiration time] triple stored
    in the pool; a connection counts as expired when its expiration time
    lies within the next five seconds of curtime.
    """
    expiration = conn_data[2]
    return expiration <= curtime + 5.0
class ConnectionPool (object):
    """Thread-safe cache, storing a set of connections for URL retrieval."""

    def __init__ (self, limits, wait=0):
        """
        Initialize an empty connection dictionary which will have the form:
        {(type, host, port) -> (lock, {id -> [connection, state, expiration time]})}

        Connection can be any open connection object (HTTP, FTP, ...).
        State is of type ConnectionState (either 'available' or 'busy').
        Expiration time is the point of time in seconds when this
        connection will be timed out.
        The type is the connection type, one of 'ftp', 'http' or 'https'.
        The host is the hostname as string, port the port number as an integer.
        For each type, the maximum number of connections to one single host
        is defined in limits.

        @param limits: {connection type -> max connections to one host}
        @ptype limits: dict
        @param wait: default number of seconds to pause between two
            subsequent requests to the same host; must not be negative
        @ptype wait: int or float
        """
        # open connections
        self.connections = {}
        # {host -> due time}
        self.times = {}
        # {host -> wait}
        self.host_waits = {}
        if wait < 0:
            raise ValueError("negative wait value %d" % wait)
        self.wait = wait
        # {connection type -> max number of connections to one host}
        self.limits = limits

    @synchronized(_wait_lock)
    def host_wait (self, host, wait):
        """Set a host specific time to wait between requests."""
        if wait < 0:
            raise ValueError("negative wait value %d" % wait)
        self.host_waits[host] = wait

    @synchronized(_wait_lock)
    def wait_for_host (self, host):
        """Honor wait time for given host, sleeping until the host's
        due time has passed."""
        t = time.time()
        if host in self.times:
            due_time = self.times[host]
            if due_time > t:
                wait = due_time - t
                log.debug(LOG_CACHE,
                  "waiting for %.01f seconds on connection to %s", wait, host)
                time.sleep(wait)
                t = time.time()
        # schedule the earliest time for the next request to this host
        self.times[host] = t + self.host_waits.get(host, self.wait)

    def _add (self, type, host, port, create_connection):
        """Add connection to the pool with given parameters.
        @param type: the connection scheme (eg. http)
        @ptype type: string
        @param host: the hostname
        @ptype host: string
        @param port: the port number
        @ptype port: int
        @param create_connection: function to create a new connection object
        @ptype create_connection: callable
        @return: newly created connection
        @rtype: HTTP(S)Connection or FTPConnection
        """
        self.wait_for_host(host)
        connection = create_connection(type, host, port)
        cid = get_connection_id(connection)
        # a new connection has no expiration time until it is released
        expiration = None
        # use ConnectionState.busy instead of the previous 'busy' string
        # literal, for consistency with get() and release()
        conn_data = [connection, ConnectionState.busy, expiration]
        key = (type, host, port)
        if key in self.connections:
            # the caller already holds a slot of the semaphore for this key
            lock, entries = self.connections[key]
            entries[cid] = conn_data
        else:
            lock = get_semaphore("%s:%d" % (host, port), self.limits[type])
            lock.acquire()
            log.debug(LOG_CACHE, "Acquired lock for %s://%s:%d" % key)
            entries = {cid: conn_data}
            self.connections[key] = (lock, entries)
        return connection

    @synchronized(_lock)
    def get (self, type, host, port, create_connection):
        """Get open connection if available or create a new one.
        @param type: connection type
        @ptype type: ConnectionType
        @param host: hostname
        @ptype host: string
        @param port: port number
        @ptype port: int
        @param create_connection: function to create a new connection object
        @ptype create_connection: callable
        @return: an open connection object, or the pool semaphore to wait
            on if the per-host connection limit has been reached
        @rtype: FTPConnection or HTTP(S)Connection or semaphore
        """
        assert type in ConnectionTypes, 'invalid type %r' % type
        # 65536 == 2**16
        assert 0 < port < 65536, 'invalid port number %r' % port
        key = (type, host, port)
        if key not in self.connections:
            return self._add(type, host, port, create_connection)
        lock, entries = self.connections[key]
        if not lock.acquire(False):
            # connection limit reached; the caller is expected to wait
            # on the returned semaphore
            log.debug(LOG_CACHE, "wait for %s connection to %s:%d",
                      type, host, port)
            return lock
        log.debug(LOG_CACHE, "Acquired lock for %s://%s:%d" % key)
        # either a connection is available or a new one can be created
        t = time.time()
        delete_entries = []
        try:
            for id, conn_data in entries.items():
                if conn_data[1] == ConnectionState.available:
                    if is_expired(t, conn_data):
                        # remember expired entries; they are removed in
                        # the finally clause after iteration
                        delete_entries.append(id)
                    else:
                        conn_data[1] = ConnectionState.busy
                        log.debug(LOG_CACHE,
                          "reusing connection %s timing out in %.01f seconds",
                          key, (conn_data[2] - t))
                        return conn_data[0]
        finally:
            for id in delete_entries:
                del entries[id]
        # make a new connection
        return self._add(type, host, port, create_connection)

    @synchronized(_lock)
    def release (self, type, host, port, connection, expiration=None):
        """Release a used connection back to the pool.
        If expiration is None the connection is not reusable and is
        removed; otherwise it is marked available until the given
        expiration time."""
        key = (type, host, port)
        if key in self.connections:
            lock, entries = self.connections[key]
            id = get_connection_id(connection)
            if id in entries:
                log.debug(LOG_CACHE, "Release lock for %s://%s:%d and expiration %s", type, host, port, expiration)
                # if the connection is reusable, set it to available, else delete it
                if expiration is None:
                    del entries[id]
                else:
                    entries[id][1] = ConnectionState.available
                    entries[id][2] = expiration
                lock.release()
            else:
                log.warn(LOG_CACHE, "Release unknown connection %s://%s:%d from entries %s", type, host, port, entries.keys())
        else:
            log.warn(LOG_CACHE, "Release unknown connection %s://%s:%d", type, host, port)

    @synchronized(_lock)
    def remove_expired (self):
        """Remove expired or soon to be expired connections from this pool."""
        t = time.time()
        for lock, entries in self.connections.values():
            delete_entries = []
            for id, conn_data in entries.items():
                # compare against ConnectionState.available (not the
                # 'available' string literal) and reuse is_expired() for
                # consistency with get()
                if conn_data[1] == ConnectionState.available and is_expired(t, conn_data):
                    try_close(conn_data[0])
                    # BUGFIX: delete_entries is a list; the previous code
                    # called .add() which raised AttributeError on the
                    # first expired connection (get() uses .append too)
                    delete_entries.append(id)
            for id in delete_entries:
                del entries[id]
                # free the semaphore slot held by the removed connection
                lock.release()
                log.debug(LOG_CACHE, "released lock for id %s", id)

    @synchronized(_lock)
    def clear (self):
        """Remove all connections from this cache, even if busy."""
        for lock, entries in self.connections.values():
            for conn_data in entries.values():
                try_close(conn_data[0])
        self.connections.clear()
def try_close (connection):
    """Close and remove a connection (not thread-safe, internal use only).

    Any exception raised while closing is swallowed, since the
    connection is being discarded anyway.
    """
    try:
        connection.close()
    except Exception:
        # ignore close errors
        pass

View file

@ -1,83 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Store and retrieve cookies.
"""
from .. import log, LOG_CACHE, cookies
from ..decorators import synchronized
from ..lock import get_lock
_lock = get_lock("cookie")
class CookieJar (object):
    """Cookie storage, implementing the cookie handling policy."""

    def __init__ (self):
        """Initialize empty cookie cache."""
        # Store all cookies in a set.
        self.cache = set()

    @synchronized(_lock)
    def add (self, headers, scheme, host, path):
        """Parse cookie values from the given response headers and add
        them to the cache, replacing equal cookies and dropping ones
        that are already expired.
        @return: list of error messages for cookies that failed to parse
        @rtype: list of strings
        """
        errors = []
        # The two supported cookie headers differ only in the cookie
        # class and the error message; handle both with one loop
        # instead of the previous duplicated code.
        # (header name, cookie class, error message template)
        cookie_types = (
            # RFC 2109 (Netscape) cookie type
            ("Set-Cookie", cookies.NetscapeCookie,
             "Invalid cookie %r for %s:%s%s: %s"),
            # RFC 2965 cookie type
            ("Set-Cookie2", cookies.Rfc2965Cookie,
             "Invalid cookie2 %r for %s:%s%s: %s"),
        )
        for headername, cookieclass, errmsg_tmpl in cookie_types:
            for h in headers.getallmatchingheaders(headername):
                # the cookie data is everything after the "Name:" prefix
                value = h.split(':', 1)[1]
                try:
                    cookie = cookieclass(value, scheme, host, path)
                    # replace a previously stored equal cookie
                    self.cache.discard(cookie)
                    if not cookie.is_expired():
                        self.cache.add(cookie)
                except cookies.CookieError as msg:
                    errors.append(errmsg_tmpl % (h, scheme, host, path, msg))
        return errors

    @synchronized(_lock)
    def get (self, scheme, host, port, path):
        """Cookie cache getter function. Return ordered list of cookies
        which match the given host, port and path.
        Cookies with more specific paths are listed first."""
        # named "matching" instead of "cookies" to avoid shadowing the
        # imported cookies module
        matching = [x for x in self.cache if x.check_expired() and
                    x.is_valid_for(scheme, host, port, path)]
        # order cookies with more specific (ie. longer) paths first
        matching.sort(key=lambda c: len(c.attributes['path']), reverse=True)
        log.debug(LOG_CACHE, "Found %d cookies for host %r path %r",
                  len(matching), host, path)
        return matching

    @synchronized(_lock)
    def __str__ (self):
        """Return stored cookies as string."""
        return "<CookieJar with %s>" % self.cache

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2012 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -23,9 +23,6 @@ from time import time as _time
from .. import log, LOG_CACHE
LARGE_QUEUE_THRESHOLD = 1000
FRONT_CHUNK_SIZE = 100
class Timeout (StandardError):
"""Raised by join()"""
pass
@ -55,8 +52,8 @@ class UrlQueue (object):
self.all_tasks_done = threading.Condition(self.mutex)
self.unfinished_tasks = 0
self.finished_tasks = 0
self.in_progress = {}
self.seen = {}
self.in_progress = 0
self.seen = set()
self.shutdown = False
# Each put() decreases the number of allowed puts.
# This way we can restrict the number of URLs that are checked.
@ -103,24 +100,29 @@ class UrlQueue (object):
if remaining <= 0.0:
raise Empty()
self.not_empty.wait(remaining)
url_data = self.queue.popleft()
if url_data.has_result:
# Already checked and copied from cache.
pass
else:
key = url_data.cache_url_key
assert key is not None
self.in_progress[key] = url_data
return url_data
self.in_progress += 1
return self.queue.popleft()
def put (self, item):
"""Put an item into the queue.
Block if necessary until a free slot is available.
"""
if self.put_denied(item):
return
with self.mutex:
self._put(item)
self.not_empty.notify()
def put_denied(self, url_data):
"""Determine if put() will not append the item on the queue.
@return True (reliable) or False (unreliable)
"""
if self.shutdown or self.allowed_puts == 0:
return True
if url_data.cache_url_key is not None and url_data.cache_url_key in self.seen:
return True
return False
def _put (self, url_data):
"""Put URL in queue, increase number of unfished tasks."""
if self.shutdown:
@ -133,17 +135,16 @@ class UrlQueue (object):
self.allowed_puts -= 1
log.debug(LOG_CACHE, "queueing %s", url_data)
key = url_data.cache_url_key
# cache key is None for URLs with invalid syntax
assert key is not None or url_data.has_result, "invalid cache key in %s" % url_data
if key in self.seen:
self.seen[key] += 1
if key is not None:
# do not check duplicate URLs
if key is not None:
if key in self.seen:
# don't check duplicate URLs
return
else:
self.seen[key] = 0
self.queue.append(url_data)
self.seen.add(key)
self.unfinished_tasks += 1
if url_data.has_result:
self.queue.appendleft(url_data)
else:
self.queue.append(url_data)
def task_done (self, url_data):
"""
@ -163,17 +164,11 @@ class UrlQueue (object):
with self.all_tasks_done:
log.debug(LOG_CACHE, "task_done %s", url_data)
# check for aliases (eg. through HTTP redirections)
if hasattr(url_data, "aliases"):
for key in url_data.aliases:
if key in self.seen:
self.seen[key] += 1
else:
self.seen[key] = 0
key = url_data.cache_url_key
if key in self.in_progress:
del self.in_progress[key]
if hasattr(url_data, "aliases") and url_data.aliases:
self.seen.update(url_data.aliases)
self.finished_tasks += 1
self.unfinished_tasks -= 1
self.in_progress -= 1
if self.unfinished_tasks <= 0:
if self.unfinished_tasks < 0:
raise ValueError('task_done() called too many times')
@ -216,7 +211,5 @@ class UrlQueue (object):
def status (self):
"""Get tuple (finished tasks, in progress, queue size)."""
with self.mutex:
return (self.finished_tasks,
len(self.in_progress), len(self.queue))
# no need to acquire self.mutex since the numbers are unreliable anyways.
return (self.finished_tasks, self.in_progress, len(self.queue))

View file

@ -101,43 +101,46 @@ def get_url_from (base_url, recursion_level, aggregate,
base_ref = strformat.unicode_safe(base_ref)
name = strformat.unicode_safe(name)
url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
scheme = None
if not (url or name):
# use filename as base url, with slash as path seperator
name = base_url.replace("\\", "/")
if parent_content_type == 'application/x-httpd-php' and \
'<?' in base_url and '?>' in base_url and url.startswith('file:'):
# ignore but warn about URLs from local PHP files with execution directives
elif ":" in url:
scheme = url.split(":", 1)[0].lower()
allowed_schemes = aggregate.config["allowedschemes"]
# ignore local PHP files with execution directives
local_php = (parent_content_type == 'application/x-httpd-php' and
'<?' in base_url and '?>' in base_url and scheme == 'file')
if local_php or (allowed_schemes and scheme not in allowed_schemes):
klass = ignoreurl.IgnoreUrl
else:
assume_local_file = recursion_level == 0
klass = get_urlclass_from(url, assume_local_file=assume_local_file)
assume_local_file = (recursion_level == 0)
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
return klass(base_url, recursion_level, aggregate,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name, extern=extern)
def get_urlclass_from (url, assume_local_file=False):
"""Return checker class for given URL. If URL does not start
with a URL scheme and assume_local_file is True, assume that
the given URL is a local file."""
if url.startswith("http:"):
def get_urlclass_from (scheme, assume_local_file=False):
"""Return checker class for given URL scheme. If the scheme
cannot be matched and assume_local_file is True, assume a local file.
"""
if scheme in ("http", "https"):
klass = httpurl.HttpUrl
elif url.startswith("ftp:"):
elif scheme == "ftp":
klass = ftpurl.FtpUrl
elif url.startswith("file:"):
elif scheme == "file":
klass = fileurl.FileUrl
elif url.startswith("telnet:"):
elif scheme == "telnet":
klass = telneturl.TelnetUrl
elif url.startswith("mailto:"):
elif scheme == "mailto":
klass = mailtourl.MailtoUrl
elif url.startswith("https:"):
klass = httpsurl.HttpsUrl
elif url.startswith(("nntp:", "news:", "snews:")):
elif scheme in ("nntp", "news", "snews"):
klass = nntpurl.NntpUrl
elif url.startswith('dns:'):
elif scheme == "dns":
klass = dnsurl.DnsUrl
elif unknownurl.is_unknown_url(url):
elif scheme and unknownurl.is_unknown_scheme(scheme):
klass = unknownurl.UnknownUrl
elif assume_local_file:
klass = fileurl.FileUrl
@ -168,4 +171,4 @@ def get_index_html (urls):
# all the URL classes
from . import (fileurl, unknownurl, ftpurl, httpurl, dnsurl,
httpsurl, mailtourl, telneturl, nntpurl, ignoreurl)
mailtourl, telneturl, nntpurl, ignoreurl)

View file

@ -21,8 +21,8 @@ import socket
import select
import nntplib
import ftplib
import httplib as orighttplib
from .. import LinkCheckerError, httplib2 as httplib
import requests
from .. import LinkCheckerError
from dns.exception import DNSException
# Catch these exception on syntax checks.
@ -45,9 +45,8 @@ ExcCacheList = [
nntplib.error_perm,
nntplib.error_proto,
EOFError,
# http error
httplib.error,
orighttplib.error,
# http errors
requests.exceptions.RequestException,
# ftp errors
ftplib.error_reply,
ftplib.error_temp,
@ -75,39 +74,25 @@ ExcList = ExcCacheList + ExcNoCacheList
# some constants
URL_MAX_LENGTH = 2000
URL_WARN_LENGTH = 255
URL_WARN_LENGTH = 1024
# the warnings
WARN_URL_EFFECTIVE_URL = "url-effective-url"
WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content"
WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found"
WARN_URL_WARNREGEX_FOUND = "url-warnregex-found"
WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip"
WARN_URL_TOO_LONG = "url-too-long"
WARN_URL_WHITESPACE = "url-whitespace"
WARN_FILE_MISSING_SLASH = "file-missing-slash"
WARN_FILE_SYSTEM_PATH = "file-system-path"
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
WARN_HTTP_ROBOTS_DENIED = "http-robots-denied"
WARN_HTTP_MOVED_PERMANENT = "http-moved-permanent"
WARN_HTTP_EMPTY_CONTENT = "http-empty-content"
WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
WARN_HTTP_DECOMPRESS_ERROR = "http-decompress-error"
WARN_HTTP_UNSUPPORTED_ENCODING = "http-unsupported-encoding"
WARN_HTTP_AUTH_UNKNOWN = "http-auth-unknonwn"
WARN_HTTP_AUTH_UNAUTHORIZED = "http-auth-unauthorized"
WARN_HTTPS_CERTIFICATE = "https-certificate-error"
WARN_IGNORE_URL = "ignore-url"
WARN_MAIL_NO_MX_HOST = "mail-no-mx-host"
WARN_MAIL_UNVERIFIED_ADDRESS = "mail-unverified-address"
WARN_MAIL_NO_CONNECTION = "mail-no-connection"
WARN_NNTP_NO_SERVER = "nntp-no-server"
WARN_NNTP_NO_NEWSGROUP = "nntp-no-newsgroup"
WARN_SYNTAX_HTML = "syntax-html"
WARN_SYNTAX_CSS = "syntax-css"
# registered warnings
Warnings = {
@ -115,41 +100,20 @@ Warnings = {
_("The effective URL is different from the original."),
WARN_URL_ERROR_GETTING_CONTENT:
_("Could not get the content of the URL."),
WARN_URL_ANCHOR_NOT_FOUND: _("URL anchor was not found."),
WARN_URL_WARNREGEX_FOUND:
_("The warning regular expression was found in the URL contents."),
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),
WARN_URL_TOO_LONG: _("The URL is longer than the recommended size."),
WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."),
WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."),
WARN_FILE_SYSTEM_PATH:
_("The file: path is not the same as the system specific path."),
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
WARN_HTTP_ROBOTS_DENIED: _("The http: URL checking has been denied."),
WARN_HTTP_MOVED_PERMANENT: _("The URL has moved permanently."),
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
WARN_HTTP_COOKIE_STORE_ERROR:
_("An error occurred while storing a cookie."),
WARN_HTTP_DECOMPRESS_ERROR:
_("An error occurred while decompressing the URL content."),
WARN_HTTP_UNSUPPORTED_ENCODING:
_("The URL content is encoded with an unknown encoding."),
WARN_HTTP_AUTH_UNKNOWN:
_("Unsupported HTTP authentication method."),
WARN_HTTP_AUTH_UNAUTHORIZED:
_("Unauthorized access without HTTP authentication."),
WARN_HTTPS_CERTIFICATE: _("The SSL certificate is invalid or expired."),
WARN_IGNORE_URL: _("The URL has been ignored."),
WARN_MAIL_NO_MX_HOST: _("The mail MX host could not be found."),
WARN_MAIL_UNVERIFIED_ADDRESS:
_("The mailto: address could not be verified."),
WARN_MAIL_NO_CONNECTION:
_("No connection to a MX host could be established."),
WARN_NNTP_NO_SERVER: _("No NNTP server was found."),
WARN_NNTP_NO_NEWSGROUP: _("The NNTP newsgroup could not be found."),
WARN_URL_OBFUSCATED_IP: _("The IP is obfuscated."),
WARN_SYNTAX_HTML: _("HTML syntax error."),
WARN_SYNTAX_CSS: _("CSS syntax error."),
}

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -25,7 +25,7 @@ import urllib
import urllib2
from datetime import datetime
from . import urlbase, get_index_html, get_url_from
from . import urlbase, get_index_html
from .. import log, LOG_CHECK, fileutil, LinkCheckerError, url as urlutil
from ..bookmarks import firefox
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
@ -163,8 +163,6 @@ class FileUrl (urlbase.UrlBase):
return
filename = self.get_os_filename()
self.size = fileutil.get_size(filename)
if self.dlsize == -1:
self.dlsize = self.size
self.modified = datetime.utcfromtimestamp(fileutil.get_mtime(filename))
def check_connection (self):
@ -203,16 +201,13 @@ class FileUrl (urlbase.UrlBase):
def read_content (self):
"""Return file content, or in case of directories a dummy HTML file
with links to the files."""
if self.size > self.MaxFilesizeBytes:
raise LinkCheckerError(_("File size too large"))
if self.is_directory():
data = get_index_html(get_files(self.get_os_filename()))
if isinstance(data, unicode):
data = data.encode("iso8859-1", "ignore")
size = len(data)
else:
data, size = super(FileUrl, self).read_content()
return data, size
data = super(FileUrl, self).read_content()
return data
def is_html (self):
"""Check if file is a HTML file."""
@ -272,27 +267,6 @@ class FileUrl (urlbase.UrlBase):
log.debug(LOG_CHECK, "File with content type %r is not parseable.", ctype)
return False
def parse_url (self):
"""Parse file contents for new links to check."""
if self.is_directory():
self.parse_html()
elif firefox.has_sqlite and firefox.extension.search(self.url):
self.parse_firefox()
else:
mime = self.get_content_type()
key = self.ContentMimetypes[mime]
getattr(self, "parse_"+key)()
self.add_num_url_info()
def parse_firefox (self):
"""Parse a Firefox3 bookmark file."""
log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self)
filename = self.get_os_filename()
for url, name in firefox.parse_bookmark_file(filename):
url_data = get_url_from(url, self.recursion_level+1,
self.aggregate, parent_url=self.url, name=name)
self.aggregate.urlqueue.put(url_data)
def get_content_type (self):
"""Return URL content type, or an empty string if content
type could not be found."""
@ -326,6 +300,5 @@ class FileUrl (urlbase.UrlBase):
webroot = self.aggregate.config["localwebroot"]
if webroot and url and url.startswith(u"/"):
url = webroot + url[1:]
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.",
webroot, url)
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
super(FileUrl, self).add_url(url, line=line, column=column, name=name, base=base)

View file

@ -22,11 +22,11 @@ import ftplib
from cStringIO import StringIO
from .. import log, LOG_CHECK, LinkCheckerError, fileutil
from . import proxysupport, httpurl, internpaturl, get_index_html, pooledconnection
from . import proxysupport, httpurl, internpaturl, get_index_html
from .const import WARN_FTP_MISSING_SLASH
class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledconnection.PooledConnection):
class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with ftp scheme.
"""
@ -70,14 +70,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
def login (self):
"""Log into ftp server and check the welcome message."""
def create_connection(scheme, host, port):
"""Create a new ftp connection."""
connection = ftplib.FTP(timeout=self.aggregate.config["timeout"])
if log.is_debug(LOG_CHECK):
connection.set_debuglevel(1)
return connection
scheme, host, port = self.get_netloc()
self.get_pooled_connection(scheme, host, port, create_connection)
self.url_connection = ftplib.FTP(timeout=self.aggregate.config["timeout"])
if log.is_debug(LOG_CHECK):
self.url_connection.set_debuglevel(1)
try:
self.url_connection.connect(self.host, self.port)
_user, _password = self.get_user_password()
@ -92,6 +87,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
# note that the info may change every time a user logs in,
# so don't add it to the url_data info.
log.debug(LOG_CHECK, "FTP info %s", info)
pass
else:
raise LinkCheckerError(_("Got no answer from FTP server"))
except EOFError as msg:
@ -105,6 +101,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
features = self.url_connection.sendcmd("FEAT")
except ftplib.error_perm as msg:
log.debug(LOG_CHECK, "Ignoring error when getting FTP features: %s" % msg)
pass
else:
log.debug(LOG_CHECK, "FTP features %s", features)
if " UTF-8" in features.splitlines():
@ -176,7 +173,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
"""See if URL target is parseable for recursion."""
if self.is_directory():
return True
ctype = self.get_content_type(self.get_content)
ctype = self.get_content_type()
if ctype in self.ContentMimetypes:
return True
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", ctype)
@ -188,20 +185,11 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
path = self.urlparts[2]
return (not path) or path.endswith('/')
def parse_url (self):
"""Parse URL target for links."""
if self.is_directory():
self.parse_html()
return
key = self.ContentMimetypes[self.get_content_type(self.get_content)]
getattr(self, "parse_"+key)()
self.add_num_url_info()
def get_content_type (self, read=None):
def get_content_type (self):
"""Return URL content type, or an empty string if content
type could not be found."""
if self.content_type is None:
self.content_type = fileutil.guess_mimetype(self.url, read=read)
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
return self.content_type
def read_content (self):
@ -210,6 +198,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
if self.is_directory():
self.url_connection.cwd(self.filename)
self.files = self.get_files()
# XXX limit number of files?
data = get_index_html(self.files)
else:
# download file in BINARY mode
@ -217,20 +206,20 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
buf = StringIO()
def stor_data (s):
"""Helper method storing given data"""
self.aggregate.add_download_data(self.cache_content_key, s)
# limit the download size
if (buf.tell() + len(s)) > self.MaxFilesizeBytes:
if (buf.tell() + len(s)) > self.max_size:
raise LinkCheckerError(_("FTP file size too large"))
buf.write(s)
self.url_connection.retrbinary(ftpcmd, stor_data)
data = buf.getvalue()
buf.close()
return data, len(data)
return data
def close_connection (self):
"""Release the open connection from the connection pool."""
if self.url_connection is None:
return
scheme, host, port = self.get_netloc()
self.aggregate.connections.release(scheme, host, port, self.url_connection)
self.url_connection = None
if self.url_connection is not None:
try:
self.url_connection.quit()
except Exception:
pass
self.url_connection = None

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2012 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,179 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Handle https links.
"""
import time
from . import httpurl
from .const import WARN_HTTPS_CERTIFICATE
from .. import log, LOG_CHECK, strformat
class HttpsUrl (httpurl.HttpUrl):
"""
Url link with https scheme.
"""
def local_check (self):
"""
Check connection if SSL is supported, else ignore.
"""
if httpurl.supportHttps:
super(HttpsUrl, self).local_check()
else:
self.add_info(_("%s URL ignored.") % self.scheme.capitalize())
def get_http_object (self, scheme, host, port):
"""Open a HTTP connection and check the SSL certificate."""
super(HttpsUrl, self).get_http_object(scheme, host, port)
self.check_ssl_certificate(self.url_connection.sock, host)
def check_ssl_certificate(self, ssl_sock, host):
"""Run all SSL certificate checks that have not yet been done.
OpenSSL already checked the SSL notBefore and notAfter dates.
"""
if not hasattr(ssl_sock, "getpeercert"):
# the URL was a HTTPS -> HTTP redirect
return
cert = ssl_sock.getpeercert()
log.debug(LOG_CHECK, "Got SSL certificate %s", cert)
if not cert:
return
if 'subject' in cert:
self.check_ssl_hostname(ssl_sock, cert, host)
else:
msg = _('certificate did not include "subject" information')
self.add_ssl_warning(ssl_sock, msg)
if 'notAfter' in cert:
self.check_ssl_valid_date(ssl_sock, cert)
else:
msg = _('certificate did not include "notAfter" information')
self.add_ssl_warning(ssl_sock, msg)
def check_ssl_hostname(self, ssl_sock, cert, host):
"""Check the hostname against the certificate according to
RFC2818.
"""
try:
match_hostname(cert, host)
except CertificateError as msg:
self.add_ssl_warning(ssl_sock, msg)
def check_ssl_valid_date(self, ssl_sock, cert):
"""Check if the certificate is still valid, or if configured check
if it's at least a number of days valid.
"""
import ssl
checkDaysValid = self.aggregate.config["warnsslcertdaysvalid"]
try:
notAfter = ssl.cert_time_to_seconds(cert['notAfter'])
except ValueError as msg:
msg = _('invalid certficate "notAfter" value %r') % cert['notAfter']
self.add_ssl_warning(ssl_sock, msg)
return
curTime = time.time()
# Calculate seconds until certifcate expires. Can be negative if
# the certificate is already expired.
secondsValid = notAfter - curTime
if secondsValid < 0:
msg = _('certficate is expired on %s') % cert['notAfter']
self.add_ssl_warning(ssl_sock, msg)
elif checkDaysValid > 0 and \
secondsValid < (checkDaysValid * strformat.SECONDS_PER_DAY):
strSecondsValid = strformat.strduration_long(secondsValid)
msg = _('certificate is only %s valid') % strSecondsValid
self.add_ssl_warning(ssl_sock, msg)
def add_ssl_warning(self, ssl_sock, msg):
"""Add a warning message about an SSL certificate error."""
cipher_name, ssl_protocol, secret_bits = ssl_sock.cipher()
err = _(u"SSL warning: %(msg)s. Cipher %(cipher)s, %(protocol)s.")
attrs = dict(msg=msg, cipher=cipher_name, protocol=ssl_protocol)
self.add_warning(err % attrs, tag=WARN_HTTPS_CERTIFICATE)
# Copied from ssl.py in Python 3:
# Wrapper module for _ssl, providing some additional facilities
# implemented in Python. Written by Bill Janssen.
import re
class CertificateError(ValueError):
"""Raised on certificate errors."""
pass
def _dnsname_to_pat(dn, max_wildcards=1):
"""Convert a DNS certificate name to a hostname matcher."""
pats = []
for frag in dn.split(r'.'):
if frag.count('*') > max_wildcards:
# Issue #17980: avoid denials of service by refusing more
# than one wildcard per fragment. A survery of established
# policy among SSL implementations showed it to be a
# reasonable choice.
raise CertificateError(
"too many wildcards in certificate DNS name: " + repr(dn))
if frag == '*':
# When '*' is a fragment by itself, it matches a non-empty dotless
# fragment.
pats.append('[^.]+')
else:
# Otherwise, '*' matches any dotless fragment.
frag = re.escape(frag)
pats.append(frag.replace(r'\*', '[^.]*'))
return re.compile(r'\A' + r'\.'.join(pats) + r'\Z', re.IGNORECASE)
def match_hostname(cert, hostname):
"""Verify that *cert* (in decoded format as returned by
SSLSocket.getpeercert()) matches the *hostname*. RFC 2818 rules
are mostly followed, but IP addresses are not accepted for *hostname*.
CertificateError is raised on failure. On success, the function
returns nothing.
"""
if not cert:
raise ValueError("empty or no certificate")
dnsnames = []
san = cert.get('subjectAltName', ())
for key, value in san:
if key == 'DNS':
if _dnsname_to_pat(value).match(hostname):
return
dnsnames.append(value)
if not dnsnames:
# The subject is only checked when there is no dNSName entry
# in subjectAltName
for sub in cert.get('subject', ()):
for key, value in sub:
# XXX according to RFC 2818, the most specific Common Name
# must be used.
if key == 'commonName':
if _dnsname_to_pat(value).match(hostname):
return
dnsnames.append(value)
if len(dnsnames) > 1:
raise CertificateError("hostname %r "
"doesn't match either of %s"
% (hostname, ', '.join(map(repr, dnsnames))))
elif len(dnsnames) == 1:
raise CertificateError("hostname %r "
"doesn't match %r"
% (hostname, dnsnames[0]))
else:
raise CertificateError("no appropriate commonName or "
"subjectAltName fields were found")

View file

@ -18,26 +18,14 @@
Handle http links.
"""
import urlparse
import os
import errno
import zlib
import socket
import rfc822
import time
import requests
from cStringIO import StringIO
from datetime import datetime
from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
httplib2 as httplib, LinkCheckerError, httputil, configuration)
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
get_url_from, pooledconnection)
from .. import (log, LOG_CHECK, strformat,
url as urlutil, LinkCheckerError)
from . import (internpaturl, proxysupport, httpheaders as headers)
# import warnings
from .const import WARN_HTTP_ROBOTS_DENIED, \
WARN_HTTP_MOVED_PERMANENT, \
WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
WARN_HTTP_AUTH_UNKNOWN, WARN_HTTP_AUTH_UNAUTHORIZED
from .const import WARN_HTTP_EMPTY_CONTENT
# assumed HTTP header encoding
HEADER_ENCODING = "iso-8859-1"
@ -46,18 +34,7 @@ HTTP_SCHEMAS = ('http://', 'https://')
# helper alias
unicode_safe = strformat.unicode_safe
supportHttps = hasattr(httplib, "HTTPSConnection")
SUPPORTED_ENCODINGS = ('x-gzip', 'gzip', 'deflate')
# Accept-Encoding header value
ACCEPT_ENCODING = ",".join(SUPPORTED_ENCODINGS)
# Accept-Charset header value
ACCEPT_CHARSET = "utf-8,ISO-8859-1;q=0.7,*;q=0.3"
# Accept mime type header value
ACCEPT = "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledconnection.PooledConnection):
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with http scheme.
"""
@ -67,28 +44,16 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
Initialize HTTP specific variables.
"""
super(HttpUrl, self).reset()
self.max_redirects = 5
self.has301status = False
# flag if connection is persistent
self.persistent = False
# URLs seen through 301/302 redirections
# URLs seen through redirections
self.aliases = []
# initialize check data
self.headers = None
self.headers = {}
self.auth = None
self.cookies = []
# temporary data filled when reading redirections
self._data = None
# flag telling if GET method is allowed; determined by robots.txt
self.method_get_allowed = True
# HttpResponse object
self.response = None
def allows_robots (self, url):
"""
Fetch and parse the robots.txt of given url. Checks if LinkChecker
can get the requested resource content. HEAD requests however are
still allowed.
can get the requested resource content.
@param url: the url to be requested
@type url: string
@ -98,9 +63,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
roboturl = self.get_robots_txt_url()
user, password = self.get_user_password()
rb = self.aggregate.robots_txt
callback = self.aggregate.connections.host_wait
return rb.allows_url(roboturl, url, self.proxy, user, password,
callback=callback)
#callback = self.aggregate.connections.host_wait
return rb.allows_url(roboturl, self.url, self.proxy, user, password)
def add_size_info (self):
"""Get size of URL content from HTTP header."""
@ -110,8 +74,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
# the content data is always decoded.
try:
self.size = int(self.getheader("Content-Length"))
if self.dlsize == -1:
self.dlsize = self.size
except (ValueError, OverflowError):
pass
else:
@ -134,164 +96,56 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
- 5xx: Server Error - The server failed to fulfill an apparently
valid request
"""
self.session = self.aggregate.get_request_session()
# set the proxy, so a 407 status after this is an error
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
self.construct_auth()
# check robots.txt
if not self.allows_robots(self.url):
# remove all previously stored results
self.add_warning(
_("Access denied by robots.txt, skipping content checks."),
tag=WARN_HTTP_ROBOTS_DENIED)
self.method_get_allowed = False
# first try with HEAD
self.method = "HEAD"
self.add_info(_("Access denied by robots.txt, checked only syntax."))
self.set_result(_("syntax OK"))
self.do_check_content = False
return
# check the http connection
self.check_http_connection()
# redirections might have changed the URL
self.url = urlutil.urlunsplit(self.urlparts)
# check response
if self.response is not None:
self.check_response()
self.close_response()
request = self.build_request()
self.send_request(request)
self.follow_redirections(request)
self.check_response()
def check_http_connection (self):
"""
Check HTTP connection and return get response and a flag
if the check algorithm had to fall back to the GET method.
def build_request(self):
"""Build a prepared request object."""
clientheaders = {
"User-Agent": self.aggregate.config["useragent"],
"DNT": "1",
}
if (self.parent_url and
self.parent_url.lower().startswith(HTTP_SCHEMAS)):
clientheaders["Referer"] = self.parent_url
kwargs = dict(
method='GET',
url=self.url,
headers=clientheaders,
)
if self.auth:
kwargs['auth'] = self.auth
log.debug(LOG_CHECK, "Prepare request with %s", kwargs)
request = requests.Request(**kwargs)
return self.session.prepare_request(request)
@return: response or None if url is already handled
@rtype: HttpResponse or None
"""
while True:
# XXX refactor this
self.close_response()
try:
self._try_http_response()
except httplib.BadStatusLine as msg:
# some servers send empty HEAD replies
if self.method == "HEAD" and self.method_get_allowed:
log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
self.fallback_to_get()
continue
raise
except socket.error as msg:
# some servers reset the connection on HEAD requests
if self.method == "HEAD" and self.method_get_allowed and \
msg[0] == errno.ECONNRESET:
self.fallback_to_get()
continue
raise
uheaders = unicode_safe(self.headers, encoding=HEADER_ENCODING)
log.debug(LOG_CHECK, "Headers: %s", uheaders)
# proxy enforcement (overrides standard proxy)
if self.response.status == 305 and self.headers:
oldproxy = (self.proxy, self.proxyauth)
newproxy = self.getheader("Location")
if newproxy:
self.add_info(_("Enforced proxy `%(name)s'.") %
{"name": newproxy})
self.set_proxy(newproxy)
self.close_response()
if self.proxy is None:
self.set_result(
_("Missing 'Location' header with enforced proxy status 305, aborting."),
valid=False)
return
elif not self.proxy:
self.set_result(
_("Empty 'Location' header value with enforced proxy status 305, aborting."),
valid=False)
return
self._try_http_response()
# restore old proxy settings
self.proxy, self.proxyauth = oldproxy
try:
tries = self.follow_redirections()
except httplib.BadStatusLine as msg:
# some servers send empty HEAD replies
if self.method == "HEAD" and self.method_get_allowed:
log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
self.fallback_to_get()
continue
raise
if tries == -1:
log.debug(LOG_CHECK, "already handled")
self.close_response()
self.do_check_content = False
return
if tries >= self.max_redirects:
if self.method == "HEAD" and self.method_get_allowed:
# Microsoft servers tend to recurse HEAD requests
self.fallback_to_get()
continue
self.set_result(_("more than %d redirections, aborting") %
self.max_redirects, valid=False)
self.close_response()
self.do_check_content = False
return
if self.do_fallback(self.response.status):
self.fallback_to_get()
continue
# user authentication
if self.response.status == 401:
authenticate = self.getheader('WWW-Authenticate')
if authenticate is None:
# Either the server intentionally blocked this request,
# or there is a form on this page which requires
# manual user/password input.
# Either way, this is a warning.
self.add_warning(_("Unauthorized access without HTTP authentication."),
tag=WARN_HTTP_AUTH_UNAUTHORIZED)
return
if not authenticate.startswith("Basic"):
# LinkChecker only supports Basic authorization
args = {"auth": authenticate}
self.add_warning(
_("Unsupported HTTP authentication `%(auth)s', " \
"only `Basic' authentication is supported.") % args,
tag=WARN_HTTP_AUTH_UNKNOWN)
return
if not self.auth:
self.construct_auth()
if self.auth:
continue
break
def do_fallback(self, status):
"""Check for fallback according to response status.
@param status: The HTTP response status
@ptype status: int
@return: True if checker should use GET, else False
@rtype: bool
"""
if self.method == "HEAD":
# Some sites do not support HEAD requests, for example
# youtube sends a 404 with HEAD, 200 with GET. Doh.
# A 405 "Method not allowed" status should also use GET.
if status >= 400:
log.debug(LOG_CHECK, "Method HEAD error %d, falling back to GET", status)
return True
# Other sites send 200 with HEAD, but 404 with GET. Bummer.
poweredby = self.getheader('X-Powered-By', u'')
server = self.getheader('Server', u'')
# Some servers (Zope, Apache Coyote/Tomcat, IIS have wrong
# content type with HEAD. This seems to be a common problem.
if (poweredby.startswith('Zope') or server.startswith('Zope')
or server.startswith('Apache-Coyote')
or ('ASP.NET' in poweredby and 'Microsoft-IIS' in server)):
return True
return False
def fallback_to_get(self):
"""Set method to GET and clear aliases."""
self.close_response()
self.close_connection()
self.method = "GET"
self.aliases = []
self.urlparts = strformat.url_unicode_split(self.url)
self.build_url_parts()
def send_request(self, request):
"""Send request and store response in self.url_connection."""
# throttle the number of requests to each host
self.aggregate.wait_for_host(self.urlparts[1])
kwargs = dict(
stream=True,
timeout=self.aggregate.config["timeout"],
allow_redirects=False,
)
if self.scheme == "https" and self.aggregate.config["sslverify"]:
kwargs["verify"] = self.aggregate.config["sslverify"]
log.debug(LOG_CHECK, "Send request with %s", kwargs)
self.url_connection = self.session.send(request, **kwargs)
self.headers = self.url_connection.headers
def construct_auth (self):
"""Construct HTTP Basic authentication credentials if there
@ -301,162 +155,34 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
return
_user, _password = self.get_user_password()
if _user is not None and _password is not None:
credentials = httputil.encode_base64("%s:%s" % (_user, _password))
self.auth = "Basic " + credentials
log.debug(LOG_CHECK, "Using basic authentication")
self.auth = (_user, _password)
def get_content_type (self):
"""Return content MIME type or empty string."""
if self.content_type is None:
if self.headers:
self.content_type = headers.get_content_type(self.headers)
else:
self.content_type = u""
if not self.content_type:
self.content_type = headers.get_content_type(self.headers)
return self.content_type
def follow_redirections (self, set_result=True):
def follow_redirections(self, request):
"""Follow all redirections of http response."""
log.debug(LOG_CHECK, "follow all redirections")
redirected = self.url
tries = 0
while self.response.status in [301, 302] and self.headers and \
tries < self.max_redirects:
num = self.follow_redirection(set_result, redirected)
if num == -1:
return num
redirected = urlutil.urlunsplit(self.urlparts)
tries += num
return tries
def follow_redirection (self, set_result, redirected):
"""Follow one redirection of http response."""
newurl = self.getheader("Location",
self.getheader("Uri", u""))
# make new url absolute and unicode
newurl = urlparse.urljoin(redirected, unicode_safe(newurl))
log.debug(LOG_CHECK, "Redirected to %r", newurl)
self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
# norm base url - can raise UnicodeError from url.idna_encode()
redirected, is_idn = urlbase.url_norm(newurl)
log.debug(LOG_CHECK, "Norm redirected to %r", redirected)
urlparts = strformat.url_unicode_split(redirected)
if not self.check_redirection_scheme(redirected, urlparts, set_result):
return -1
if not self.check_redirection_newscheme(redirected, urlparts, set_result):
return -1
if not self.check_redirection_domain(redirected, urlparts,
set_result):
return -1
if not self.check_redirection_robots(redirected, set_result):
return -1
num = self.check_redirection_recursion(redirected, set_result)
if num != 0:
return num
if set_result:
self.check301status()
self.close_response()
self.close_connection()
# remember redirected url as alias
self.aliases.append(redirected)
if self.anchor:
urlparts[4] = self.anchor
# note: urlparts has to be a list
self.urlparts = urlparts
self.build_url_parts()
# store cookies from redirect response
self.store_cookies()
# new response data
self._try_http_response()
return 1
def check_redirection_scheme (self, redirected, urlparts, set_result):
"""Return True if redirection scheme is ok, else False."""
if urlparts[0] in ('ftp', 'http', 'https'):
return True
# For security reasons do not allow redirects to protocols
# other than HTTP, HTTPS or FTP.
if set_result:
self.add_warning(
_("Redirection to url `%(newurl)s' is not allowed.") %
{'newurl': redirected})
self.set_result(_("syntax OK"))
return False
def check_redirection_domain (self, redirected, urlparts, set_result):
"""Return True if redirection domain is ok, else False."""
# XXX does not support user:pass@netloc format
if urlparts[1] != self.urlparts[1]:
# URL domain changed
if self.recursion_level == 0 and urlparts[0] in ('http', 'https'):
# Add intern patterns for redirection of URLs given by the
# user for HTTP schemes.
self.add_intern_pattern(url=redirected)
return True
# check extern filter again
self.extern = None
self.set_extern(redirected)
if self.extern[0] and self.extern[1]:
if set_result:
self.check301status()
self.add_info(_("The redirected URL is outside of the domain "
"filter, checked only syntax."))
self.set_result(_("filtered"))
return False
return True
def check_redirection_robots (self, redirected, set_result):
"""Check robots.txt allowance for redirections. Return True if
allowed, else False."""
if self.allows_robots(redirected):
return True
if set_result:
self.add_warning(
_("Access to redirected URL denied by robots.txt, "
"checked only syntax."), tag=WARN_HTTP_ROBOTS_DENIED)
self.set_result(_("syntax OK"))
return False
def check_redirection_recursion (self, redirected, set_result):
    """Detect redirect loops.

    Returns 0 when the target URL has not been seen before,
    max_redirects to request a GET retry for recursing HEAD requests,
    and -1 for a genuine redirect loop."""
    seen = [self.cache_url_key]
    seen.extend(self.aliases)
    if redirected in seen:
        if self.method == "HEAD" and self.method_get_allowed:
            # Microsoft servers tend to recurse HEAD requests;
            # fall back to the original url and use GET
            return self.max_redirects
        if set_result:
            urls = "\n => ".join(seen + [redirected])
            self.set_result(_("recursive redirection encountered:\n %(urls)s") %
                {"urls": urls}, valid=False)
        return -1
    return 0
def check_redirection_newscheme (self, redirected, urlparts, set_result):
    """Check for HTTP(S)/FTP redirection. Return True for
    redirection with same scheme, else False.

    On a scheme change a new URL object is built and queued for a
    separate check instead of following the redirect in place.
    NOTE(review): control flow reconstructed from a diff rendering
    that stripped indentation; the raise appears to be the
    set_result=False branch -- verify against the original file.
    """
    if urlparts[0] != self.urlparts[0]:
        # changed scheme
        newobj = get_url_from(
            redirected, self.recursion_level, self.aggregate,
            parent_url=self.parent_url, base_ref=self.base_ref,
            line=self.line, column=self.column, name=self.name)
        if set_result:
            self.set_result(_("syntax OK"))
            # append new object to queue
            self.aggregate.urlqueue.put(newobj)
            return False
        raise LinkCheckerError(_('Cannot redirect to different scheme without result'))
    return True
def check301status (self):
    """If response page has been permanently moved add a warning.

    The warning is emitted at most once per URL (guarded by
    self.has301status)."""
    if self.response.status == 301 and not self.has301status:
        self.add_warning(_("HTTP 301 (moved permanent) encountered: you"
                           " should update this link."),
                         tag=WARN_HTTP_MOVED_PERMANENT)
        # remember so repeated redirects do not warn again
        self.has301status = True
kwargs = dict(
stream=True,
)
response = None
for response in self.session.resolve_redirects(self.url_connection, request, **kwargs):
newurl = response.url
log.debug(LOG_CHECK, "Redirected to %r", newurl)
self.aliases.append(newurl)
self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
urlparts = strformat.url_unicode_split(newurl)
if response is not None:
self.urlparts = urlparts
self.build_url_parts()
self.url_connection = response
self.headers = response.headers
self.url = urlutil.urlunsplit(urlparts)
self.scheme = urlparts[0].lower()
def getheader (self, name, default=None):
"""Get decoded header value.
@ -471,271 +197,29 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
def check_response (self):
"""Check final result and log it."""
if self.response.status >= 400:
self.set_result(u"%r %s" % (self.response.status, self.response.reason),
if self.url_connection.status_code >= 400:
self.set_result(u"%d %s" % (self.url_connection.status_code, self.url_connection.reason),
valid=False)
else:
if self.response.status == 204:
if self.url_connection.status_code == 204:
# no content
self.add_warning(self.response.reason,
self.add_warning(self.url_connection.reason,
tag=WARN_HTTP_EMPTY_CONTENT)
# store cookies for valid links
self.store_cookies()
if self.response.status >= 200:
self.set_result(u"%r %s" % (self.response.status, self.response.reason))
if self.url_connection.status_code >= 200:
self.set_result(u"%r %s" % (self.url_connection.status_code, self.url_connection.reason))
else:
self.set_result(_("OK"))
modified = rfc822.parsedate(self.getheader('Last-Modified', u''))
if modified:
self.modified = datetime.utcfromtimestamp(time.mktime(modified))
def _try_http_response (self):
"""Try to get a HTTP response object. For persistent
connections that the server closed unexpected, a new connection
will be opened.
"""
try:
self._get_http_response()
except socket.error as msg:
if msg.args[0] == 32 and self.persistent:
# server closed persistent connection - retry
log.debug(LOG_CHECK, "Server closed connection: retry")
self.persistent = False
self._get_http_response()
else:
raise
except httplib.BadStatusLine as msg:
if self.persistent:
# server closed connection - retry
log.debug(LOG_CHECK, "Empty status line: retry")
self.persistent = False
self._get_http_response()
else:
raise
def _get_http_response (self):
    """Send HTTP request and get response object.

    Opens (or reuses) a connection, sends request line and headers,
    stores the response, its headers and the persistence flag on self,
    and normalizes the response reason phrase."""
    scheme, host, port = self.get_netloc()
    log.debug(LOG_CHECK, "Connecting to %r", host)
    self.get_http_object(scheme, host, port)
    self.add_connection_request()
    self.add_connection_headers()
    self.response = self.url_connection.getresponse(buffering=True)
    self.headers = self.response.msg
    # content type is determined lazily later
    self.content_type = None
    self.persistent = not self.response.will_close
    if self.persistent and self.method == "HEAD":
        # Some servers send page content after a HEAD request,
        # but only after making the *next* request. This breaks
        # protocol synchronisation. Workaround here is to close
        # the connection after HEAD.
        # Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk)
        self.persistent = False
        # Note that for POST method the connection should also be closed,
        # but this method is never used.
    # If possible, use official W3C HTTP response name
    if self.response.status in httplib.responses:
        self.response.reason = httplib.responses[self.response.status]
    if self.response.reason:
        self.response.reason = unicode_safe(self.response.reason)
    log.debug(LOG_CHECK, "Response: %s %s", self.response.status, self.response.reason)
def add_connection_request(self):
    """Send the HTTP request line.

    The anchor fragment is never part of the request path, see
    http://tools.ietf.org/html/rfc2616#section-3.2.2. Proxied
    requests carry the absolute URL in the request line."""
    anchor = ''
    if self.proxy:
        parts = (self.urlparts[0], self.urlparts[1],
                 self.urlparts[2], self.urlparts[3], anchor)
    else:
        parts = ('', '', self.urlparts[2], self.urlparts[3], anchor)
    path = urlutil.urlunsplit(parts)
    self.url_connection.putrequest(self.method, path, skip_host=True,
                                   skip_accept_encoding=True)
def add_connection_headers(self):
    """Send all request headers and terminate the header section.

    Sends Host (original host even when proxied), optional
    (proxy-)authorization, Referer, User-Agent, content-negotiation
    headers, DNT, and -- if enabled -- cookies."""
    # be sure to use the original host as header even for proxies
    self.url_connection.putheader("Host", self.urlparts[1])
    if self.auth:
        # HTTP authorization
        self.url_connection.putheader("Authorization", self.auth)
    if self.proxyauth:
        self.url_connection.putheader("Proxy-Authorization",
            self.proxyauth)
    if (self.parent_url and
        self.parent_url.lower().startswith(HTTP_SCHEMAS)):
        self.url_connection.putheader("Referer", self.parent_url)
    self.url_connection.putheader("User-Agent",
        self.aggregate.config["useragent"])
    # prefer compressed content
    self.url_connection.putheader("Accept-Encoding", ACCEPT_ENCODING)
    # prefer UTF-8 encoding
    self.url_connection.putheader("Accept-Charset", ACCEPT_CHARSET)
    # prefer parseable mime types
    self.url_connection.putheader("Accept", ACCEPT)
    # send do-not-track header
    self.url_connection.putheader("DNT", "1")
    if self.aggregate.config['sendcookies']:
        self.send_cookies()
    # headers complete
    self.url_connection.endheaders()
def store_cookies (self):
    """Save cookies from response headers into the aggregate cookie
    store; a no-op unless cookie storing is enabled in the config."""
    if self.aggregate.config['storecookies']:
        for c in self.cookies:
            # record which cookies were sent with the request
            self.add_info(_("Sent Cookie: %(cookie)s.") %
                          {"cookie": c.client_header_value()})
        errors = self.aggregate.cookies.add(self.headers,
            self.urlparts[0], self.urlparts[1], self.urlparts[2])
        if errors:
            self.add_warning(
                _("Could not store cookies from headers: %(error)s.") %
                {'error': "\n".join(errors)},
                tag=WARN_HTTP_COOKIE_STORE_ERROR)
def send_cookies (self):
    """Add cookie headers to the pending request.

    Cookies matching scheme/host/port/path are packed into a single
    Cookie header bounded by the maximum header length; explicitly
    versioned cookies each get their own header line. Over-long
    cookies are dropped with a debug message."""
    scheme = self.urlparts[0]
    host = self.urlparts[1]
    port = urlutil.default_ports.get(scheme, 80)
    host, port = urlutil.splitport(host, port)
    path = self.urlparts[2] or u"/"
    self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
    if not self.cookies:
        return
    # add one cookie header with all cookie data
    # this is limited by maximum header length
    headername = "Cookie"
    headervalue = ""
    # byte budget: total header size minus name and ": " separator
    max_value_len = headers.MAX_HEADER_BYTES - len(headername) - 2
    for c in self.cookies:
        cookievalue = c.client_header_value()
        if "version" in c.attributes:
            # add separate header for explicit versioned cookie
            if headervalue:
                self.url_connection.putheader(headername, headervalue)
            self.url_connection.putheader(headername, cookievalue)
            headervalue = ""
            continue
        if headervalue:
            cookievalue = "; " + cookievalue
        if (len(headervalue) + len(cookievalue)) < max_value_len:
            headervalue += cookievalue
        else:
            log.debug(LOG_CHECK, "Discard too-long cookie %r", cookievalue)
    if headervalue:
        # flush the accumulated cookie header
        log.debug(LOG_CHECK, "Sending cookie header %s:%s", headername, headervalue)
        self.url_connection.putheader(headername, headervalue)
def get_http_object (self, scheme, host, port):
    """
    Open a HTTP connection, reusing a pooled one when available.

    @param host: the host to connect to
    @ptype host: string of the form <host>[:<port>]
    @param scheme: 'http' or 'https'
    @ptype scheme: string
    @return: None
    @raise LinkCheckerError: for unsupported schemes
    """
    # release any previously held connection first
    self.close_connection()

    def create_connection(scheme, host, port):
        """Create a new http or https connection."""
        kwargs = dict(port=port, strict=True, timeout=self.aggregate.config["timeout"])
        if scheme == "http":
            h = httplib.HTTPConnection(host, **kwargs)
        elif scheme == "https" and supportHttps:
            # use the bundled CA file unless an explicit certificate
            # file was configured via sslverify
            devel_dir = os.path.join(configuration.configdata.install_data, "config")
            sslverify = self.aggregate.config["sslverify"]
            if sslverify:
                if sslverify is not True:
                    kwargs["ca_certs"] = sslverify
                else:
                    kwargs["ca_certs"] = configuration.get_share_file(devel_dir, 'ca-certificates.crt')
            h = httplib.HTTPSConnection(host, **kwargs)
        else:
            msg = _("Unsupported HTTP url scheme `%(scheme)s'") % {"scheme": scheme}
            raise LinkCheckerError(msg)
        if log.is_debug(LOG_CHECK):
            h.set_debuglevel(1)
        return h

    self.get_pooled_connection(scheme, host, port, create_connection)
    self.url_connection.connect()
def read_content (self):
    """Get content of the URL target. The content data is cached after
    the first call to this method.

    Re-requests with GET when the previous request was not a GET
    or no response is available.

    @return: URL content, decompressed and decoded
    @rtype: string
    @raise LinkCheckerError: on redirection errors or oversized content
    """
    assert self.method_get_allowed, 'unallowed content read'
    if self.method != "GET" or self.response is None:
        self.method = "GET"
        self._try_http_response()
        num = self.follow_redirections(set_result=False)
        if not (0 <= num <= self.max_redirects):
            raise LinkCheckerError(_("Redirection error"))
        # Re-read size info, since the GET request result could be different
        # than a former HEAD request.
        self.add_size_info()
    # NOTE(review): nesting reconstructed from a diff rendering that
    # stripped indentation -- verify against the original file.
    if self.size > self.MaxFilesizeBytes:
        raise LinkCheckerError(_("File size too large"))
    self.charset = headers.get_charset(self.headers)
    return self._read_content()
def _read_content (self):
    """Read the response body and decompress it when needed.

    @return: tuple (data, dlsize) where dlsize is the downloaded
        (wire) byte count measured before decompression
    @rtype: tuple (string, int)
    @raise LinkCheckerError: when the body exceeds MaxFilesizeBytes
    """
    # read one extra byte so an over-limit body is detectable
    data = self.response.read(self.MaxFilesizeBytes+1)
    if len(data) > self.MaxFilesizeBytes:
        raise LinkCheckerError(_("File size too large"))
    dlsize = len(data)
    self.aggregate.add_download_data(self.cache_content_key, data)
    encoding = headers.get_content_encoding(self.headers)
    if encoding in SUPPORTED_ENCODINGS:
        try:
            if encoding == 'deflate':
                f = StringIO(zlib.decompress(data))
            else:
                f = gzip.GzipFile('', 'rb', 9, StringIO(data))
        except zlib.error as msg:
            # fall back to the raw data when decompression fails
            log.debug(LOG_CHECK, "Error %s data of len %d", encoding, len(data))
            self.add_warning(_("Decompress error %(err)s") %
                             {"err": str(msg)},
                             tag=WARN_HTTP_DECOMPRESS_ERROR)
            f = StringIO(data)
        try:
            data = f.read()
        finally:
            f.close()
    return data, dlsize
def encoding_supported (self):
    """Return False (and add a warning) when the response declares a
    content encoding this checker cannot decode; True otherwise."""
    encoding = headers.get_content_encoding(self.headers)
    unsupported = (encoding and encoding != 'identity'
                   and encoding not in SUPPORTED_ENCODINGS)
    if unsupported:
        self.add_warning(_("Unsupported content encoding `%(encoding)s'.") %
                         {"encoding": encoding},
                         tag=WARN_HTTP_UNSUPPORTED_ENCODING)
        return False
    return True
def can_get_content(self):
    """Tell whether content may be downloaded for this URL
    (i.e. a GET request is permitted)."""
    allowed = self.method_get_allowed
    return allowed
def content_allows_robots (self):
    """Check robots meta information in the page content, but only
    when downloading content (GET) is allowed; otherwise deny."""
    if not self.method_get_allowed:
        return False
    return super(HttpUrl, self).content_allows_robots()
def check_warningregex (self):
    """Scan page content for the configured warning regex, but only
    when downloading content (GET) is allowed."""
    # NOTE(review): former docstring was copy-pasted from
    # content_allows_robots and did not describe this method.
    if self.method_get_allowed:
        super(HttpUrl, self).check_warningregex()
def read_content(self):
    """Download the response body in chunks and return it as a string.

    Aborts with LinkCheckerError once the configured maximum download
    size would be exceeded. Can be overridden in subclasses.
    NOTE(review): unlike the former docstring claimed, only the data
    is returned, not a (data, size) pair -- confirm against callers.
    """
    maxbytes = self.aggregate.config["maxfilesizedownload"]
    buf = StringIO()
    for data in self.url_connection.iter_content(chunk_size=self.ReadChunkBytes):
        # check the limit before buffering the next chunk
        if buf.tell() + len(data) > maxbytes:
            raise LinkCheckerError(_("File size too large"))
        buf.write(data)
    return buf.getvalue()
def is_html (self):
"""
@ -748,22 +232,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
if not self.valid:
return False
mime = self.get_content_type()
if self.ContentMimetypes.get(mime) != "html":
return False
if self.headers:
return self.encoding_supported()
return True
return self.ContentMimetypes.get(mime) == "html"
def is_css (self):
"""Return True iff content of this url is CSS stylesheet."""
if not self.valid:
return False
mime = self.get_content_type()
if self.ContentMimetypes.get(mime) != "css":
return False
if self.headers:
return self.encoding_supported()
return True
return self.ContentMimetypes.get(mime) == "css"
def is_http (self):
"""
@ -781,30 +257,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
@return: True if content is parseable
@rtype: bool
"""
if not (self.valid and self.headers):
if not self.valid:
return False
ctype = self.get_content_type()
if ctype not in self.ContentMimetypes:
log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
return False
return self.encoding_supported()
def parse_url (self):
    """Dispatch to the parser matching this document's content type
    and collect new links to check."""
    ctype = self.get_content_type()
    if self.is_html():
        self.parse_html()
    elif self.is_css():
        self.parse_css()
    else:
        # content types without a dedicated is_*() predicate
        dispatch = {
            "application/x-shockwave-flash": self.parse_swf,
            "application/msword": self.parse_word,
            "text/vnd.wap.wml": self.parse_wml,
        }
        handler = dispatch.get(ctype)
        if handler is not None:
            handler()
    self.add_num_url_info()
    return True
def get_robots_txt_url (self):
"""
@ -814,28 +273,3 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
@rtype: string
"""
return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])
def close_response(self):
    """Close and forget the HTTP response object, if one is held."""
    response, self.response = self.response, None
    if response is not None:
        response.close()
def close_connection (self):
    """Release the connection from the connection pool. Persistent
    connections will not be closed.

    Idle persistent connections are handed back with a keep-alive
    expiration time; all others are closed first (expiration None).
    """
    log.debug(LOG_CHECK, "Closing %s", self.url_connection)
    if self.url_connection is None:
        # no connection is open
        return
    # add to cached connections
    scheme, host, port = self.get_netloc()
    if self.persistent and self.url_connection.is_idle():
        # reusable: keep alive until the server-advertised timeout
        expiration = time.time() + headers.http_keepalive(self.headers)
    else:
        self.close_response()
        expiration = None
    self.aggregate.connections.release(scheme, host, port, self.url_connection, expiration=expiration)
    self.url_connection = None

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2012 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -21,16 +21,13 @@ Handle for mailto: links.
import re
import urllib
import urlparse
import smtplib
import socket
from email._parseaddr import AddressList
from . import urlbase
from .. import log, LOG_CHECK, strformat, url as urlutil
from dns import resolver
from ..network import iputil
from .const import WARN_MAIL_NO_MX_HOST, \
WARN_MAIL_UNVERIFIED_ADDRESS, WARN_MAIL_NO_CONNECTION
from .const import WARN_MAIL_NO_MX_HOST
def getaddresses (addr):
@ -287,78 +284,9 @@ class MailtoUrl (urlbase.UrlBase):
# debug output
log.debug(LOG_CHECK, "found %d MX mailhosts:", len(answers))
for preference, host in mxdata:
log.debug(LOG_CHECK,
"MX host %r, preference %d", host, preference)
# connect
self.check_smtp_connect(mxdata, username, domain)
def check_smtp_connect (self, mxdata, username, domain):
    """
    Connect to SMTP servers and check emails.

    Tries each MX host in turn until one accepts a connection, then
    uses SMTP VRFY to verify the address and records the outcome.

    @param mxdata: list of (preference, host) tuples to check for
    @type mxdata: list
    @param username: the username to verify
    @type username: string
    @param domain: the mail domain of the address
    @type domain: string
    """
    smtpconnect = 0
    for preference, host in mxdata:
        try:
            log.debug(LOG_CHECK,
                "SMTP check for %r (preference %d)", host, preference)
            self.url_connection = smtplib.SMTP(timeout=self.aggregate.config["timeout"])
            if log.is_debug(LOG_CHECK):
                self.url_connection.set_debuglevel(1)
            self.url_connection.connect(host)
            log.debug(LOG_CHECK, "SMTP connected!")
            smtpconnect = 1
            self.url_connection.helo()
            mailaddress = "%s@%s" % (username, domain)
            status, info = self.url_connection.verify(mailaddress)
            log.debug(LOG_CHECK, "SMTP info %d %r", status, info)
            d = {
                'info': "%d %s" % (status, str(info)),
                'mail': mailaddress,
            }
            if status == 250:
                self.add_info(_("Verified address %(mail)s: %(info)s.") % d)
            # check for 25x status code which means that the address
            # could not be verified, but is sent anyway
            elif 250 < status < 260:
                self.add_info(_("Unverified but presumably valid"
                                " address %(mail)s: %(info)s.") % d)
            else:
                self.add_warning(_("Unverified address: %(info)s.") % d,
                                 tag=WARN_MAIL_UNVERIFIED_ADDRESS)
        except smtplib.SMTPException as msg:
            self.add_warning(
                _("MX mail host %(host)s did not accept connections: "
                  "%(error)s.") % {'host': host, 'error': str(msg)},
                tag=WARN_MAIL_NO_CONNECTION)
        if smtpconnect:
            # stop after the first host that accepted a connection
            break
    if not smtpconnect:
        self.set_result(_("Could not connect, but syntax is correct"),
                        overwrite=True)
    else:
        self.set_result(_("Found MX mail host %(host)s") % {'host': host},
                        overwrite=True)
def close_connection (self):
"""
Close a possibly opened SMTP connection.
"""
if self.url_connection is None:
# no connection is open
return
connection = self.url_connection
self.url_connection = None
try:
connection.quit()
except (smtplib.SMTPException, socket.error):
# ignore close errors
# socket.error is raised for example on timeouts
log.debug(LOG_CHECK, "MX host %r, preference %d", host, preference)
pass
self.set_result(_("Valid mail address syntax"))
def set_cache_keys (self):
"""

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,40 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Mixin class for URLs that pool connections.
"""
class PooledConnection (object):
    """Mixin providing pooled connection retrieval for URL checkers."""

    def get_pooled_connection(self, scheme, host, port, create_connection):
        """Fetch a connection for (scheme, host, port) from the
        aggregate's pool, blocking on a pool lock until one is free,
        and store it in self.url_connection."""
        pool_get = self.aggregate.connections.get
        connection = pool_get(scheme, host, port, create_connection)
        while hasattr(connection, 'acquire'):
            # The pool handed back a lock object instead of a
            # connection: wait (without polling) until another
            # connection is released by acquiring that lock.
            connection.acquire()
            # Release immediately; the next connections.get() call
            # acquires it again.
            connection.release()
            connection = pool_get(scheme, host, port, create_connection)
        self.url_connection = connection

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -20,7 +20,6 @@ Handle uncheckable URLs.
import re
from . import urlbase
from .const import WARN_IGNORE_URL
# from http://www.iana.org/assignments/uri-schemes.html
ignored_schemes_permanent = r"""
@ -124,7 +123,7 @@ ignored_schemes_other = r"""
"""
ignored_schemes = "^(%s%s%s%s):" % (
ignored_schemes = "^(%s%s%s%s)$" % (
ignored_schemes_permanent,
ignored_schemes_provisional,
ignored_schemes_historical,
@ -132,7 +131,7 @@ ignored_schemes = "^(%s%s%s%s):" % (
)
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
is_unknown_url = ignored_schemes_re.search
is_unknown_scheme = ignored_schemes_re.match
class UnknownUrl (urlbase.UrlBase):
@ -140,19 +139,16 @@ class UnknownUrl (urlbase.UrlBase):
def local_check (self):
"""Only logs that this URL is unknown."""
if self.extern[0] and self.extern[1]:
self.add_info(_("Outside of domain filter, checked only syntax."))
elif self.ignored():
self.add_warning(_("%(scheme)s URL ignored.") %
{"scheme": self.scheme.capitalize()},
tag=WARN_IGNORE_URL)
if self.ignored():
self.add_info(_("%(scheme)s URL ignored.") %
{"scheme": self.scheme.capitalize()})
else:
self.set_result(_("URL is unrecognized or has invalid syntax"),
valid=False)
def ignored (self):
"""Return True if this URL scheme is ignored."""
return ignored_schemes_re.search(self.url)
return is_unknown_scheme(self.scheme)
def can_get_content (self):
"""Unknown URLs have no content.

View file

@ -26,21 +26,19 @@ import time
import errno
import socket
import select
from cStringIO import StringIO
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
strformat, LinkCheckerError, url as urlutil, trace, clamav, winutil, geoip,
fileutil, get_link_pat)
from .. import (log, LOG_CHECK, LOG_CACHE,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat, parser)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND,
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE,
WARN_URL_WHITESPACE,
WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH,
WARN_SYNTAX_HTML, WARN_SYNTAX_CSS,
ExcList, ExcSyntaxList, ExcNoCacheList)
# helper alias
@ -71,17 +69,6 @@ def url_norm (url, encoding=None):
raise LinkCheckerError(msg)
def getXmlText (parent, tag):
    """Return the concatenated text-node content of the first <tag>
    child element of the given parent element."""
    elem = parent.getElementsByTagName(tag)[0]
    # The DOM API scatters text over multiple child nodes; join them.
    texts = [node.data for node in elem.childNodes
             if node.nodeType == node.TEXT_NODE]
    return ''.join(texts)
class UrlBase (object):
"""An URL with additional information like validity etc."""
@ -103,8 +90,8 @@ class UrlBase (object):
"text/vnd.wap.wml": "wml",
}
# Set maximum file size for downloaded files in bytes.
MaxFilesizeBytes = 1024*1024*5
# Read in 16kb chunks
ReadChunkBytes = 1024*16
def __init__ (self, base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=-1, column=-1,
@ -173,8 +160,6 @@ class UrlBase (object):
self.urlparts = None
# the scheme, host, port and anchor part of url
self.scheme = self.host = self.port = self.anchor = None
# list of parsed anchors
self.anchors = []
# the result message string and flag
self.result = u""
self.has_result = False
@ -190,8 +175,6 @@ class UrlBase (object):
self.modified = None
# download time
self.dltime = -1
# download size
self.dlsize = -1
# check time
self.checktime = 0
# connection object
@ -211,8 +194,6 @@ class UrlBase (object):
self.do_check_content = True
# MIME content type
self.content_type = None
# number of URLs in page content
self.num_urls = 0
def set_result (self, msg, valid=True, overwrite=False):
"""
@ -229,6 +210,8 @@ class UrlBase (object):
log.warn(LOG_CHECK, "Empty result for %s", self)
self.result = msg
self.valid = valid
# free content data
self.data = None
def get_title (self):
"""Return title of page the URL refers to.
@ -246,30 +229,6 @@ class UrlBase (object):
self.title = title
return self.title
def set_title_from_content (self):
    """Set the title of the page this URL refers to from its page
    content, using the TitleFinder SAX handler. No-op for invalid
    URLs or when the handler cannot be constructed."""
    if not self.valid:
        return
    try:
        handler = linkparse.TitleFinder()
    except tuple(ExcList):
        # best effort: a missing title is not an error
        return
    parser = htmlsax.parser(handler)
    handler.parser = parser
    if self.charset:
        parser.encoding = self.charset
    # parse
    try:
        parser.feed(self.get_content())
        parser.flush()
    except linkparse.StopParse as msg:
        # handler stops parsing as soon as a title was found
        log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
    # break cyclic dependencies
    handler.parser = None
    parser.handler = None
    if handler.title:
        self.title = handler.title
def is_parseable (self):
"""
Return True iff content of this url is parseable.
@ -287,15 +246,15 @@ class UrlBase (object):
return False
def is_http (self):
"""
Return True for http:// URLs.
"""
"""Return True for http:// URLs."""
return False
def is_file (self):
"""
Return True for file:// URLs.
"""
"""Return True for file:// URLs."""
return False
def is_directory(self):
"""Return True if current URL represents a directory."""
return False
def is_local(self):
@ -318,45 +277,6 @@ class UrlBase (object):
if s not in self.info:
self.info.append(s)
def copy_from_cache (self, cache_data):
    """
    Fill attributes from cache data.

    Anchor-not-found warnings are not copied when the current anchor
    differs from the cached one; in that case the anchor is rechecked.
    """
    self.url = cache_data["url"]
    self.result = cache_data["result"]
    self.has_result = True
    anchor_changed = (self.anchor != cache_data["anchor"])
    for tag, msg in cache_data["warnings"]:
        # do not copy anchor warnings, since the current anchor
        # might have changed
        if anchor_changed and tag == WARN_URL_ANCHOR_NOT_FOUND:
            continue
        self.add_warning(msg, tag=tag)
    for info in cache_data["info"]:
        self.add_info(info)
    self.valid = cache_data["valid"]
    self.dltime = cache_data["dltime"]
    self.dlsize = cache_data["dlsize"]
    self.anchors = cache_data["anchors"]
    self.content_type = cache_data["content_type"]
    if anchor_changed and self.valid:
        # recheck anchor
        self.check_anchor()
def get_cache_data (self):
    """Assemble and return the dictionary of values that should be
    stored in the URL cache for this object."""
    cache_entry = dict(
        url=self.url,
        result=self.result,
        warnings=self.warnings,
        info=self.info,
        valid=self.valid,
        dltime=self.dltime,
        dlsize=self.dlsize,
        anchors=self.anchors,
        anchor=self.anchor,
        content_type=self.get_content_type(),
    )
    return cache_entry
def set_cache_keys (self):
"""
Set keys for URL checking and content recursion.
@ -367,11 +287,7 @@ class UrlBase (object):
assert isinstance(self.cache_content_key, unicode), self
log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key)
# construct cache key
if self.aggregate.config["anchors"]:
# add anchor to cache key
self.cache_url_key = urlutil.urlunsplit(self.urlparts[:4]+[self.anchor or u""])
else:
self.cache_url_key = self.cache_content_key
self.cache_url_key = self.cache_content_key
assert isinstance(self.cache_url_key, unicode), self
log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key)
@ -442,9 +358,9 @@ class UrlBase (object):
self.url = urlutil.urlunsplit(urlparts)
# split into (modifiable) list
self.urlparts = strformat.url_unicode_split(self.url)
self.build_url_parts()
# and unsplit again
self.url = urlutil.urlunsplit(self.urlparts)
self.build_url_parts()
def build_url_parts (self):
"""Set userinfo, host, port and anchor from self.urlparts.
@ -452,22 +368,28 @@ class UrlBase (object):
"""
# check userinfo@host:port syntax
self.userinfo, host = urllib.splituser(self.urlparts[1])
# set host lowercase
if self.userinfo:
self.urlparts[1] = "%s@%s" % (self.userinfo, host.lower())
else:
self.urlparts[1] = host.lower()
# safe anchor for later checking
self.anchor = self.urlparts[4]
port = urlutil.default_ports.get(self.scheme, 0)
self.host, self.port = urlutil.splitport(host, port=port)
if self.port is None:
host, port = urlutil.splitport(host, port=port)
if port is None:
raise LinkCheckerError(_("URL host %(host)r has invalid port") %
{"host": host})
self.port = port
# set host lowercase
self.host = host.lower()
if self.scheme in scheme_requires_host:
if not self.host:
raise LinkCheckerError(_("URL has empty hostname"))
self.check_obfuscated_ip()
if not self.port or self.port == urlutil.default_ports.get(self.scheme):
host = self.host
else:
host = "%s:%d" % (self.host, self.port)
if self.userinfo:
self.urlparts[1] = "%s@%s" % (self.userinfo, host)
else:
self.urlparts[1] = host
# safe anchor for later checking
self.anchor = self.urlparts[4]
def check_obfuscated_ip (self):
"""Warn if host of this URL is obfuscated IP address."""
@ -476,9 +398,10 @@ class UrlBase (object):
if iputil.is_obfuscated_ip(self.host):
ips = iputil.resolve_host(self.host)
if ips:
self.host = ips[0]
self.add_warning(
_("URL %(url)s has obfuscated IP address %(ip)s") % \
{"url": self.base_url, "ip": ips.pop()},
{"url": self.base_url, "ip": ips[0]},
tag=WARN_URL_OBFUSCATED_IP)
def check (self):
@ -499,19 +422,6 @@ class UrlBase (object):
# close/release possible open connection
self.close_connection()
def add_country_info (self):
    """Add an informational message naming the GeoIP country of the
    host, when a host is set and the lookup yields a result."""
    host = self.host
    if not host:
        return
    country = geoip.get_country(host)
    if country:
        self.add_info(_("URL is located in %(country)s.") %
                      {"country": _(country)})
def add_size_info (self):
    """Store size of URL content from meta info into self.size.
    Must be implemented in subclasses.
    This base implementation is intentionally a no-op."""
    pass
def local_check (self):
"""Local check function can be overridden in subclasses."""
log.debug(LOG_CHECK, "Checking %s", self)
@ -524,35 +434,28 @@ class UrlBase (object):
try:
self.check_connection()
self.add_size_info()
self.add_country_info()
self.aggregate.plugin_manager.run_connection_plugins(self)
except tuple(ExcList) as exc:
value = self.handle_exception()
# make nicer error msg for unknown hosts
if isinstance(exc, socket.error) and exc.args[0] == -2:
value = _('Hostname not found')
# make nicer error msg for bad status line
elif isinstance(exc, httplib.BadStatusLine):
value = _('Bad HTTP response %(line)r') % {"line": str(value)}
elif isinstance(exc, UnicodeError):
# idna.encode(host) failed
value = _('Bad hostname %(host)r: %(msg)s') % {'host': self.host, 'msg': str(value)}
self.set_result(unicode_safe(value), valid=False)
self.checktime = time.time() - check_start
if self.do_check_content:
# check content and recursion
try:
self.check_content()
if self.valid and self.can_get_content():
self.aggregate.plugin_manager.run_content_plugins(self)
if self.allows_recursion():
self.parse_url()
# check content size
self.check_size()
parser.parse_url(self)
except tuple(ExcList):
value = self.handle_exception()
# make nicer error msg for bad status line
if isinstance(value, httplib.BadStatusLine):
value = _('Bad HTTP response %(line)r') % {"line": str(value)}
self.add_warning(_("could not get content: %(msg)s") %
{"msg": str(value)}, tag=WARN_URL_ERROR_GETTING_CONTENT)
self.checktime = time.time() - check_start
def close_connection (self):
"""
@ -595,6 +498,17 @@ class UrlBase (object):
"""
self.url_connection = urllib2.urlopen(self.url)
def add_size_info (self):
    """Warn when the reported content size exceeds the configured
    maximum download size. Should be overridden in subclasses."""
    maxbytes = self.aggregate.config["maxfilesizedownload"]
    if self.size <= maxbytes:
        return
    self.add_warning(
        _("Content size %(size)s is larger than %(maxbytes)s.") %
        dict(size=strformat.strsize(self.size),
             maxbytes=strformat.strsize(maxbytes)),
        tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
def allows_recursion (self):
"""
Return True iff we can recurse into the url's content.
@ -617,6 +531,9 @@ class UrlBase (object):
if self.extern[0]:
log.debug(LOG_CHECK, "... no, extern.")
return False
if self.size > self.aggregate.config["maxfilesizeparse"]:
log.debug(LOG_CHECK, "... no, maximum parse size.")
return False
if not self.content_allows_robots():
log.debug(LOG_CHECK, "... no, robots.")
return False
@ -628,6 +545,7 @@ class UrlBase (object):
Return False if the content of this URL forbids robots to
search for recursive links.
"""
# XXX cleanup
if not self.is_html():
return True
if not (self.is_http() or self.is_file()):
@ -644,63 +562,12 @@ class UrlBase (object):
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
# break cyclic dependencies
handler.parser = None
parser.handler = None
return handler.follow
def get_anchors (self):
"""Store anchors for this URL. Precondition: this URL is
an HTML resource."""
log.debug(LOG_CHECK, "Getting HTML anchors %s", self)
self.find_links(self.add_anchor, tags=linkparse.AnchorTags)
def find_links (self, callback, tags=None):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
# construct parser object
handler = linkparse.LinkFinder(callback, tags=tags)
parser = htmlsax.parser(handler)
if self.charset:
parser.encoding = self.charset
handler.parser = parser
# parse
try:
parser.feed(self.get_content())
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
# break cyclic dependencies
handler.parser = None
parser.handler = None
def add_anchor (self, url, line, column, name, base):
"""Add anchor URL."""
self.anchors.append((url, line, column, name, base))
def check_anchor (self):
"""If URL is valid, parseable and has an anchor, check it.
A warning is logged and True is returned if the anchor is not found.
"""
if not (self.anchor and self.aggregate.config["anchors"] and
self.valid and self.is_html()):
return
log.debug(LOG_CHECK, "checking anchor %r in %s", self.anchor, self.anchors)
enc = lambda anchor: urlutil.url_quote_part(anchor, encoding=self.encoding)
if any(x for x in self.anchors if enc(x[0]) == self.anchor):
return
if self.anchors:
anchornames = sorted(set(u"`%s'" % x[0] for x in self.anchors))
anchors = u", ".join(anchornames)
else:
anchors = u"-"
args = {"name": self.anchor, "anchors": anchors}
msg = u"%s %s" % (_("Anchor `%(name)s' not found.") % args,
_("Available anchors: %(anchors)s.") % args)
self.add_warning(msg, tag=WARN_URL_ANCHOR_NOT_FOUND)
return True
def set_extern (self, url):
"""
Match URL against extern and intern link patterns. If no pattern
@ -728,9 +595,15 @@ class UrlBase (object):
log.debug(LOG_CHECK, "Intern URL %r", url)
self.extern = (0, 0)
return
log.debug(LOG_CHECK, "Explicit extern URL %r", url)
self.extern = (1, 0)
return
if self.aggregate.config['checkextern']:
self.extern = (1, 0)
else:
self.extern = (1, 1)
if self.extern[0] and self.extern[1]:
self.add_info(_("The URL is outside of the domain "
"filter, checked only syntax."))
if not self.has_result:
self.set_result(_("filtered"))
def get_content_type (self):
"""Return content MIME type or empty string.
@ -741,188 +614,35 @@ class UrlBase (object):
def can_get_content (self):
"""Indicate wether url get_content() can be called."""
return True
return self.size <= self.aggregate.config["maxfilesizedownload"]
def get_content (self):
"""Precondition: url_connection is an opened URL."""
if self.data is None:
log.debug(LOG_CHECK, "Get content of %r", self.url)
t = time.time()
self.data, self.dlsize = self.read_content()
self.data = self.read_content()
self.size = len(self.data)
self.dltime = time.time() - t
if self.size == 0:
self.add_warning(_("Content size is zero."),
tag=WARN_URL_CONTENT_SIZE_ZERO)
return self.data
def read_content (self):
"""Return data and data size for this URL.
Can be overridden in subclasses."""
if self.size > self.MaxFilesizeBytes:
raise LinkCheckerError(_("File size too large"))
data = self.url_connection.read(self.MaxFilesizeBytes+1)
if len(data) > self.MaxFilesizeBytes:
raise LinkCheckerError(_("File size too large"))
if not self.is_local():
self.aggregate.add_download_data(self.cache_content_key, data)
return data, len(data)
def read_content(self):
"""Return data for this URL. Can be overridden in subclasses."""
buf = StringIO()
data = self.read_content_chunk()
while data:
if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]:
raise LinkCheckerError(_("File size too large"))
buf.write(data)
data = self.read_content_chunk()
return buf.getvalue()
def check_content (self):
"""Check content data for warnings, syntax errors, viruses etc."""
if not (self.valid and self.can_get_content()):
return
if self.is_html():
self.set_title_from_content()
if self.aggregate.config["anchors"]:
self.get_anchors()
self.check_anchor()
self.check_warningregex()
# is it an intern URL?
if not self.extern[0]:
# check HTML/CSS syntax
if self.aggregate.config["checkhtml"] and self.is_html():
self.check_html()
if self.aggregate.config["checkcss"] and self.is_css():
self.check_css()
# check with clamav
if self.aggregate.config["scanvirus"]:
self.scan_virus()
def check_warningregex (self):
"""Check if content matches a given regular expression."""
config = self.aggregate.config
warningregex = config["warningregex"]
if not (warningregex and self.valid and self.is_parseable()):
return
log.debug(LOG_CHECK, "checking content for warning regex")
try:
content = self.get_content()
curpos = 0
curline = 1
# add warnings for found matches, up to the maximum allowed number
for num, match in enumerate(warningregex.finditer(content)):
# calculate line number for match
curline += content.count('\n', curpos, match.start())
curpos = match.start()
# add a warning message
msg = _("Found %(match)r at line %(line)d in link contents.")
self.add_warning(msg %
{"match": match.group(), "line": curline},
tag=WARN_URL_WARNREGEX_FOUND)
# check for maximum number of warnings
if num >= config["warningregex_max"]:
break
except tuple(ExcList):
value = self.handle_exception()
self.set_result(unicode_safe(value), valid=False)
def check_size (self):
"""Check content size if it is zero or larger than a given
maximum size.
"""
if self.dlsize == 0:
self.add_warning(_("Content size is zero."),
tag=WARN_URL_CONTENT_SIZE_ZERO)
else:
maxbytes = self.aggregate.config["warnsizebytes"]
if maxbytes is not None and self.dlsize >= maxbytes:
self.add_warning(
_("Content size %(dlsize)s is larger than %(maxbytes)s.") %
{"dlsize": strformat.strsize(self.dlsize),
"maxbytes": strformat.strsize(maxbytes)},
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
if self.size != -1 and self.dlsize != -1 and self.dlsize != self.size:
self.add_warning(_("Download size (%(dlsize)d Byte) "
"does not equal content size (%(size)d Byte).") %
{"dlsize": self.dlsize,
"size": self.size},
tag=WARN_URL_CONTENT_SIZE_UNEQUAL)
def check_w3_errors (self, xml, w3type):
"""Add warnings for W3C HTML or CSS errors in xml format.
w3type is either "W3C HTML" or "W3C CSS"."""
from xml.dom.minidom import parseString
dom = parseString(xml)
for error in dom.getElementsByTagName('m:error'):
warnmsg = _("%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s")
attrs = {
"w3type": w3type,
"line": getXmlText(error, "m:line"),
"column": getXmlText(error, "m:col"),
"msg": getXmlText(error, "m:message"),
}
tag = WARN_SYNTAX_HTML if w3type == "W3C HTML" else WARN_SYNTAX_CSS
self.add_warning(warnmsg % attrs, tag=tag)
def check_html (self):
"""Check HTML syntax of this page (which is supposed to be HTML)
with the online W3C HTML validator documented at
http://validator.w3.org/docs/api.html
"""
self.aggregate.check_w3_time()
try:
body = {'fragment': self.get_content(), 'output': 'soap12'}
data = urllib.urlencode(body)
u = urllib2.urlopen('http://validator.w3.org/check', data)
if u.headers.get('x-w3c-validator-status', 'Invalid') == 'Valid':
self.add_info(u"W3C Validator: %s" % _("valid HTML syntax"))
return
self.check_w3_errors(u.read(), "W3C HTML")
except Exception:
# catch _all_ exceptions since we dont want third party module
# errors to propagate into this library
err = str(sys.exc_info()[1])
log.warn(LOG_CHECK,
_("HTML W3C validation caused error: %(msg)s ") %
{"msg": err})
def check_css (self):
"""Check CSS syntax of this page (which is supposed to be CSS)
with the online W3C CSS validator documented at
http://jigsaw.w3.org/css-validator/manual.html#expert
"""
self.aggregate.check_w3_time()
try:
host = 'jigsaw.w3.org'
path = '/css-validator/validator'
params = {
'text': "div {}",
'warning': '2',
'output': 'soap12',
}
fields = params.items()
content_type, body = httputil.encode_multipart_formdata(fields)
h = httplib.HTTPConnection(host)
h.putrequest('POST', path)
h.putheader('Content-Type', content_type)
h.putheader('Content-Length', str(len(body)))
h.endheaders()
h.send(body)
r = h.getresponse(True)
if r.getheader('X-W3C-Validator-Status', 'Invalid') == 'Valid':
self.add_info(u"W3C Validator: %s" % _("valid CSS syntax"))
return
self.check_w3_errors(r.read(), "W3C HTML")
except Exception:
# catch _all_ exceptions since we dont want third party module
# errors to propagate into this library
err = str(sys.exc_info()[1])
log.warn(LOG_CHECK,
_("CSS W3C validation caused error: %(msg)s ") %
{"msg": err})
def scan_virus (self):
"""Scan content for viruses."""
infected, errors = clamav.scan(self.get_content())
for msg in infected:
self.add_warning(u"Virus scan infection: %s" % msg)
for msg in errors:
self.add_warning(u"Virus scan error: %s" % msg)
def parse_url (self):
"""
Parse url content and search for recursive links.
Default parse type is html.
"""
self.parse_html()
self.add_num_url_info()
def read_content_chunk(self):
"""Read one chunk of content from this URL."""
return self.url_connection.read(self.ReadChunkBytes)
def get_user_password (self):
"""Get tuple (user, password) from configured authentication.
@ -933,16 +653,8 @@ class UrlBase (object):
return urllib.splitpasswd(self.userinfo)
return self.aggregate.config.get_user_password(self.url)
def parse_html (self):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
log.debug(LOG_CHECK, "Parsing HTML %s", self)
self.find_links(self.add_url)
def add_url (self, url, line=0, column=0, name=u"", base=None):
"""Queue URL data for checking."""
self.num_urls += 1
if base:
base_ref = urlutil.url_norm(base)[0]
else:
@ -954,108 +666,6 @@ class UrlBase (object):
# Only queue URLs which have a result or are not strict extern.
self.aggregate.urlqueue.put(url_data)
def add_num_url_info(self):
"""Add number of URLs parsed to info."""
if self.num_urls > 0:
attrs = {"num": self.num_urls}
msg = _n("%(num)d URL parsed.", "%(num)d URLs parsed.", self.num_urls)
self.add_info(msg % attrs)
def parse_opera (self):
"""Parse an opera bookmark file."""
log.debug(LOG_CHECK, "Parsing Opera bookmarks %s", self)
from ..bookmarks.opera import parse_bookmark_data
for url, name, lineno in parse_bookmark_data(self.get_content()):
self.add_url(url, line=lineno, name=name)
def parse_chromium (self):
"""Parse a Chromium or Google Chrome bookmark file."""
log.debug(LOG_CHECK, "Parsing Chromium bookmarks %s", self)
from ..bookmarks.chromium import parse_bookmark_data
for url, name in parse_bookmark_data(self.get_content()):
self.add_url(url, name=name)
def parse_safari (self):
"""Parse a Safari bookmark file."""
log.debug(LOG_CHECK, "Parsing Safari bookmarks %s", self)
from ..bookmarks.safari import parse_bookmark_data
for url, name in parse_bookmark_data(self.get_content()):
self.add_url(url, name=name)
def parse_text (self):
"""Parse a text file with one url per line; comment and blank
lines are ignored."""
log.debug(LOG_CHECK, "Parsing text %s", self)
lineno = 0
for line in self.get_content().splitlines():
lineno += 1
line = line.strip()
if not line or line.startswith('#'):
continue
self.add_url(line, line=lineno)
def parse_css (self):
"""
Parse a CSS file for url() patterns.
"""
log.debug(LOG_CHECK, "Parsing CSS %s", self)
lineno = 0
linkfinder = linkparse.css_url_re.finditer
strip_comments = linkparse.strip_c_comments
for line in strip_comments(self.get_content()).splitlines():
lineno += 1
for mo in linkfinder(line):
column = mo.start("url")
url = strformat.unquote(mo.group("url").strip())
self.add_url(url, line=lineno, column=column)
def parse_swf (self):
"""Parse a SWF file for URLs."""
linkfinder = linkparse.swf_url_re.finditer
for mo in linkfinder(self.get_content()):
url = mo.group()
self.add_url(url)
def parse_word (self):
"""Parse a word file for hyperlinks."""
if not winutil.has_word():
return
filename = self.get_temp_filename()
# open word file and parse hyperlinks
try:
app = winutil.get_word_app()
try:
doc = winutil.open_wordfile(app, filename)
if doc is None:
raise winutil.Error("could not open word file %r" % filename)
try:
for link in doc.Hyperlinks:
self.add_url(link.Address, name=link.TextToDisplay)
finally:
winutil.close_wordfile(doc)
finally:
winutil.close_word_app(app)
except winutil.Error, msg:
log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
def parse_wml (self):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
log.debug(LOG_CHECK, "Parsing WML %s", self)
self.find_links(self.add_url, tags=linkparse.WmlTags)
def get_temp_filename (self):
"""Get temporary filename for content to parse."""
# store content in temporary file
fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc',
prefix='lc_')
try:
fd.write(self.get_content())
finally:
fd.close()
return filename
def serialized (self, sep=os.linesep):
"""
Return serialized url check data as unicode string.
@ -1103,7 +713,7 @@ class UrlBase (object):
if pat:
log.debug(LOG_CHECK, "Add intern pattern %r", pat)
self.aggregate.config['internlinks'].append(get_link_pat(pat))
except UnicodeError, msg:
except UnicodeError as msg:
res = _("URL has unparsable domain name: %(domain)s") % \
{"domain": msg}
self.set_result(res, valid=False)
@ -1151,7 +761,7 @@ class UrlBase (object):
Number of seconds needed to check this link, default: zero.
- url_data.dltime: int
Number of seconds needed to download URL content, default: -1
- url_data.dlsize: int
- url_data.size: int
Size of downloaded URL content, default: -1
- url_data.info: list of unicode
Additional information about this URL.
@ -1181,7 +791,7 @@ class UrlBase (object):
domain=(self.urlparts[1] if self.urlparts else u""),
checktime=self.checktime,
dltime=self.dltime,
dlsize=self.dlsize,
size=self.size,
info=self.info,
line=self.line,
column=self.column,
@ -1211,7 +821,7 @@ urlDataAttr = [
'domain',
'checktime',
'dltime',
'dlsize',
'size',
'info',
'modified',
'line',

View file

@ -20,7 +20,7 @@ Utility functions suitable for command line clients.
from __future__ import print_function
import sys
import argparse
from . import checker, fileutil, strformat
from . import checker, fileutil, strformat, plugins
from .director import console
@ -42,6 +42,19 @@ def print_version(exit_code=0):
sys.exit(exit_code)
def print_plugins(folders, exit_code=0):
"""Print available plugins and exit."""
modules = plugins.get_plugin_modules(folders)
pluginclasses = sorted(plugins.get_plugin_classes(modules), key=lambda x: x.__name__)
for pluginclass in pluginclasses:
print(pluginclass.__name__)
doc = strformat.wrap(pluginclass.__doc__, 80)
print(strformat.indent(doc))
print()
sys.exit(exit_code)
def print_usage (msg, exit_code=2):
"""Print a program msg text to stderr and exit."""
program = sys.argv[0]

View file

@ -27,7 +27,7 @@ import urlparse
import shutil
import socket
import _LinkChecker_configdata as configdata
from .. import (log, LOG_CHECK, LOG_ROOT, ansicolor, lognames, clamav,
from .. import (log, LOG_CHECK, LOG_ROOT, ansicolor, lognames,
get_config_dir, fileutil, configdict)
from . import confparse
from ..decorators import memoized
@ -75,6 +75,9 @@ Modules = (
def get_modules_info ():
"""Return list of unicode strings with detected module info."""
lines = []
# requests
import requests
lines.append(u"Requests: %s" % requests.__version__)
# PyQt
try:
from PyQt4 import QtCore
@ -129,53 +132,48 @@ class Configuration (dict):
Initialize the default options.
"""
super(Configuration, self).__init__()
self['trace'] = False
self["verbose"] = False
self["complete"] = False
self["warnings"] = True
self["ignorewarnings"] = []
self['quiet'] = False
self["anchors"] = False
self["externlinks"] = []
self["internlinks"] = []
# on ftp, password is set by Pythons ftplib
## checking options
self["allowedschemes"] = []
self['cookiefile'] = None
self["debugmemory"] = False
self["localwebroot"] = None
self["maxfilesizeparse"] = 1*1024*1024
self["maxfilesizedownload"] = 5*1024*1024
self["maxnumurls"] = None
self["maxrunseconds"] = None
self["maxrequestspersecond"] = 10
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
self["proxy"] = urllib.getproxies()
self["sslverify"] = True
self["threads"] = 100
self["timeout"] = 60
self["aborttimeout"] = 300
self["recursionlevel"] = -1
self["useragent"] = UserAgent
## authentication
self["authentication"] = []
self["loginurl"] = None
self["loginuserfield"] = "login"
self["loginpasswordfield"] = "password"
self["loginextrafields"] = {}
self["proxy"] = urllib.getproxies()
self["recursionlevel"] = -1
self["wait"] = 0
self['sendcookies'] = False
self['storecookies'] = False
self['cookiefile'] = None
self["status"] = False
self["status_wait_seconds"] = 5
## filtering
self["externlinks"] = []
self["ignorewarnings"] = []
self["internlinks"] = []
self["checkextern"] = False
## plugins
self["pluginfolders"] = get_plugin_folders()
self["enabledplugins"] = []
## output
self['trace'] = False
self['quiet'] = False
self["verbose"] = False
self["warnings"] = True
self["fileoutput"] = []
self['output'] = 'text'
self["status"] = False
self["status_wait_seconds"] = 5
self['logger'] = None
self["warningregex"] = None
self["warningregex_max"] = 5
self["warnsizebytes"] = None
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
self["threads"] = 100
# socket timeout in seconds
self["timeout"] = 60
self["checkhtml"] = False
self["checkcss"] = False
self["scanvirus"] = False
self["clamavconf"] = clamav.canonical_clamav_conf()
self["useragent"] = UserAgent
self["debugmemory"] = False
self["localwebroot"] = None
self["sslverify"] = True
self["warnsslcertdaysvalid"] = 14
self["maxrunseconds"] = None
self["maxnumurls"] = None
self["maxconnectionshttp"] = 10
self["maxconnectionshttps"] = 10
self["maxconnectionsftp"] = 2
self.loggers = {}
from ..logger import LoggerClasses
for c in LoggerClasses:
@ -302,29 +300,15 @@ class Configuration (dict):
def sanitize (self):
"Make sure the configuration is consistent."
if self["anchors"]:
self.sanitize_anchors()
if self['logger'] is None:
self.sanitize_logger()
if self['scanvirus']:
self.sanitize_scanvirus()
if self['storecookies'] or self['cookiefile']:
self.sanitize_cookies()
if self['loginurl']:
self.sanitize_loginurl()
self.sanitize_proxies()
self.sanitize_plugins()
# set default socket timeout
socket.setdefaulttimeout(self['timeout'])
def sanitize_anchors (self):
"""Make anchor configuration consistent."""
if not self["warnings"]:
self["warnings"] = True
from ..checker.const import Warnings
self["ignorewarnings"] = Warnings.keys()
if 'url-anchor-not-found' in self["ignorewarnings"]:
self["ignorewarnings"].remove('url-anchor-not-found')
def sanitize_logger (self):
"""Make logger configuration consistent."""
if not self['output']:
@ -332,24 +316,6 @@ class Configuration (dict):
self['output'] = 'text'
self['logger'] = self.logger_new(self['output'])
def sanitize_scanvirus (self):
"""Ensure clamav is installed for virus checking."""
try:
clamav.init_clamav_conf(self['clamavconf'])
except clamav.ClamavError:
log.warn(LOG_CHECK,
_("Clamav could not be initialized"))
self['scanvirus'] = False
def sanitize_cookies (self):
"""Make cookie configuration consistent."""
if not self['sendcookies']:
log.warn(LOG_CHECK, _("activating sendcookies."))
self['sendcookies'] = True
if not self['storecookies']:
log.warn(LOG_CHECK, _("activating storecookies."))
self['storecookies'] = True
def sanitize_loginurl (self):
"""Make login configuration consistent."""
url = self["loginurl"]
@ -377,9 +343,6 @@ class Configuration (dict):
log.warn(LOG_CHECK,
_("disabling login URL %(url)s.") % {"url": url})
self["loginurl"] = None
elif not self['storecookies']:
# login URL implies storing and sending cookies
self['storecookies'] = self['sendcookies'] = True
def sanitize_proxies (self):
"""Try to read additional proxy settings which urllib does not
@ -395,6 +358,39 @@ class Configuration (dict):
if ftp_proxy:
self["proxy"]["ftp"] = ftp_proxy
def sanitize_plugins(self):
"""Ensure each plugin is configurable."""
for plugin in self["enabledplugins"]:
if plugin not in self:
self[plugin] = {}
def get_plugin_folders():
"""Get linkchecker plugin folders. Default is ~/.linkchecker/plugins/."""
folders = []
defaultfolder = normpath("~/.linkchecker/plugins")
if not os.path.exists(defaultfolder) and not Portable:
try:
make_userdir(defaultfolder)
except StandardError as errmsg:
msg = _("could not create plugin directory %(dirname)r: %(errmsg)r")
args = dict(dirname=defaultfolder, errmsg=errmsg)
log.warn(LOG_CHECK, msg % args)
if os.path.exists(defaultfolder):
folders.append(defaultfolder)
return folders
def make_userdir(child):
"""Create a child directory."""
userdir = os.path.dirname(child)
if not os.path.isdir(userdir):
if os.name == 'nt':
# Windows forbids filenames with leading dot unless
# a trailing dot is added.
userdir += "."
os.mkdir(userdir, 0700)
def get_user_config():
"""Get the user configuration filename.
@ -413,13 +409,7 @@ def get_user_config():
not Portable:
# copy the initial configuration to the user configuration
try:
userdir = os.path.dirname(userconf)
if not os.path.isdir(userdir):
if os.name == 'nt':
# Windows forbids filenames with leading dot unless
# a trailing dot is added.
userdir += "."
os.mkdir(userdir, 0700)
make_userdir(userconf)
shutil.copy(initialconf, userconf)
except StandardError as errmsg:
msg = _("could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r")
@ -445,6 +435,7 @@ def get_gconf_http_proxy ():
return "%s:%d" % (host, port)
except StandardError as msg:
log.debug(LOG_CHECK, "error getting HTTP proxy from gconf: %s", msg)
pass
return None
@ -464,6 +455,7 @@ def get_gconf_ftp_proxy ():
return "%s:%d" % (host, port)
except StandardError as msg:
log.debug(LOG_CHECK, "error getting FTP proxy from gconf: %s", msg)
pass
return None
@ -478,6 +470,7 @@ def get_kde_http_proxy ():
return data.get("http_proxy")
except StandardError as msg:
log.debug(LOG_CHECK, "error getting HTTP proxy from KDE: %s", msg)
pass
def get_kde_ftp_proxy ():
@ -491,6 +484,7 @@ def get_kde_ftp_proxy ():
return data.get("ftp_proxy")
except StandardError as msg:
log.debug(LOG_CHECK, "error getting FTP proxy from KDE: %s", msg)
pass
# The following KDE functions are largely ported and ajusted from
# Google Chromium:

View file

@ -17,9 +17,8 @@
"""Parse configuration files"""
import ConfigParser
import re
import os
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins
def read_multiline (value):
@ -53,16 +52,17 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
failed_files = set(files) - set(self.read_ok)
log.warn(LOG_CHECK, "Could not read configuration files %s.", failed_files)
# Read all the configuration parameters from the given files.
self.read_output_config()
self.read_checking_config()
self.read_authentication_config()
self.read_filtering_config()
self.read_output_config()
self.read_plugin_config()
except Exception as msg:
raise LinkCheckerError(
_("Error parsing configuration: %s") % unicode(msg))
def read_string_option (self, section, option, allowempty=False):
"""Read a sring option."""
"""Read a string option."""
if self.has_option(section, option):
value = self.get(section, option)
if not allowempty and not value:
@ -106,11 +106,6 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
if self.getboolean(section, "verbose"):
self.config["verbose"] = True
self.config["warnings"] = True
if self.has_option(section, "complete"):
if self.getboolean(section, "complete"):
self.config["complete"] = True
self.config["verbose"] = True
self.config["warnings"] = True
if self.has_option(section, "quiet"):
if self.getboolean(section, "quiet"):
self.config['output'] = 'none'
@ -141,37 +136,24 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
self.read_int_option(section, "threads", min=-1)
self.config['threads'] = max(0, self.config['threads'])
self.read_int_option(section, "timeout", min=1)
self.read_boolean_option(section, "anchors")
self.read_int_option(section, "aborttimeout", min=1)
self.read_int_option(section, "recursionlevel", min=-1)
if self.has_option(section, "warningregex"):
val = self.get(section, "warningregex")
if val:
self.config["warningregex"] = re.compile(val)
self.read_int_option(section, "warnsizebytes", min=1)
self.read_string_option(section, "nntpserver")
self.read_string_option(section, "useragent")
self.read_int_option(section, "pause", key="wait", min=0)
for name in ("http", "https", "ftp"):
self.read_int_option(section, "maxconnections%s" % name, min=1)
self.read_check_options(section)
def read_check_options (self, section):
"""Read check* options."""
self.read_boolean_option(section, "checkhtml")
self.read_boolean_option(section, "checkcss")
self.read_boolean_option(section, "scanvirus")
self.read_boolean_option(section, "clamavconf")
self.read_int_option(section, "maxrequestspersecond", min=1)
self.read_int_option(section, "maxnumurls", min=0)
self.read_int_option(section, "maxfilesizeparse", min=1)
self.read_int_option(section, "maxfilesizedownload", min=1)
if self.has_option(section, "allowedschemes"):
self.config['allowedschemes'] = [x.strip().lower() for x in \
self.get(section, 'allowedschemes').split(',')]
self.read_boolean_option(section, "debugmemory")
if self.has_option(section, "cookies"):
self.config["sendcookies"] = self.config["storecookies"] = \
self.getboolean(section, "cookies")
self.read_string_option(section, "cookiefile")
self.read_string_option(section, "localwebroot")
try:
self.read_boolean_option(section, "sslverify")
except ValueError:
self.read_string_option(section, "sslverify")
self.read_int_option(section, "warnsslcertdaysvalid", min=1)
self.read_int_option(section, "maxrunseconds", min=0)
def read_authentication_config (self):
@ -198,7 +180,6 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
raise LinkCheckerError(_("invalid login URL `%s'. Only " \
"HTTP and HTTPS URLs are supported.") % val)
self.config["loginurl"] = val
self.config["storecookies"] = self.config["sendcookies"] = True
self.read_string_option(section, "loginuserfield")
self.read_string_option(section, "loginpasswordfield")
# read login extra fields
@ -231,7 +212,7 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
"""
section = "filtering"
if self.has_option(section, "ignorewarnings"):
self.config['ignorewarnings'] = [f.strip() for f in \
self.config['ignorewarnings'] = [f.strip().lower() for f in \
self.get(section, 'ignorewarnings').split(',')]
if self.has_option(section, "ignore"):
for line in read_multiline(self.get(section, "ignore")):
@ -244,3 +225,14 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
if self.has_option(section, "internlinks"):
pat = get_link_pat(self.get(section, "internlinks"))
self.config["internlinks"].append(pat)
self.read_boolean_option(section, "checkextern")
def read_plugin_config(self):
"""Read plugin-specific configuration values."""
folders = self.config["pluginfolders"]
modules = plugins.get_plugin_modules(folders)
for pluginclass in plugins.get_plugin_classes(modules):
section = pluginclass.__name__
if self.has_section(section):
self.config["enabledplugins"].append(section)
self.config[section] = pluginclass.read_config(self)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2012 Bastian Kleineidam
# Copyright (C) 2004-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -15,510 +15,13 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Parsing and storing of cookies. See [1]RFC 2965 and [2]RFC 2109.
The reason for this module is that neither the cookielib nor the Cookie
modules included in the Python standard library provide a usable interface
for programmable cookie handling.
This module provides parsing of cookies for all formats specified by
the above RFCs, plus smart methods handling data conversion and formatting.
And a cookie storage class is provided.
[1] http://www.faqs.org/rfcs/rfc2965.html
[2] http://www.faqs.org/rfcs/rfc2109.html
Parsing of cookies.
"""
import time
import string
import re
import cookielib
import httplib
import requests
from cStringIO import StringIO
from . import strformat
_nulljoin = ''.join
_semispacejoin = '; '.join
_spacejoin = ' '.join
class CookieError (StandardError):
"""Thrown for invalid cookie syntax or conflicting/impossible values."""
pass
_LegalChars = string.ascii_letters + string.digits + "!#$%&'*+-.^_`|~:"
_Translator = {
'\000' : '\\000', '\001' : '\\001', '\002' : '\\002',
'\003' : '\\003', '\004' : '\\004', '\005' : '\\005',
'\006' : '\\006', '\007' : '\\007', '\010' : '\\010',
'\011' : '\\011', '\012' : '\\012', '\013' : '\\013',
'\014' : '\\014', '\015' : '\\015', '\016' : '\\016',
'\017' : '\\017', '\020' : '\\020', '\021' : '\\021',
'\022' : '\\022', '\023' : '\\023', '\024' : '\\024',
'\025' : '\\025', '\026' : '\\026', '\027' : '\\027',
'\030' : '\\030', '\031' : '\\031', '\032' : '\\032',
'\033' : '\\033', '\034' : '\\034', '\035' : '\\035',
'\036' : '\\036', '\037' : '\\037',
# Because of the way browsers really handle cookies (as opposed
# to what the RFC says) we also encode , and ;
',' : '\\054', ';' : '\\073',
'"' : '\\"', '\\' : '\\\\',
'\177' : '\\177', '\200' : '\\200', '\201' : '\\201',
'\202' : '\\202', '\203' : '\\203', '\204' : '\\204',
'\205' : '\\205', '\206' : '\\206', '\207' : '\\207',
'\210' : '\\210', '\211' : '\\211', '\212' : '\\212',
'\213' : '\\213', '\214' : '\\214', '\215' : '\\215',
'\216' : '\\216', '\217' : '\\217', '\220' : '\\220',
'\221' : '\\221', '\222' : '\\222', '\223' : '\\223',
'\224' : '\\224', '\225' : '\\225', '\226' : '\\226',
'\227' : '\\227', '\230' : '\\230', '\231' : '\\231',
'\232' : '\\232', '\233' : '\\233', '\234' : '\\234',
'\235' : '\\235', '\236' : '\\236', '\237' : '\\237',
'\240' : '\\240', '\241' : '\\241', '\242' : '\\242',
'\243' : '\\243', '\244' : '\\244', '\245' : '\\245',
'\246' : '\\246', '\247' : '\\247', '\250' : '\\250',
'\251' : '\\251', '\252' : '\\252', '\253' : '\\253',
'\254' : '\\254', '\255' : '\\255', '\256' : '\\256',
'\257' : '\\257', '\260' : '\\260', '\261' : '\\261',
'\262' : '\\262', '\263' : '\\263', '\264' : '\\264',
'\265' : '\\265', '\266' : '\\266', '\267' : '\\267',
'\270' : '\\270', '\271' : '\\271', '\272' : '\\272',
'\273' : '\\273', '\274' : '\\274', '\275' : '\\275',
'\276' : '\\276', '\277' : '\\277', '\300' : '\\300',
'\301' : '\\301', '\302' : '\\302', '\303' : '\\303',
'\304' : '\\304', '\305' : '\\305', '\306' : '\\306',
'\307' : '\\307', '\310' : '\\310', '\311' : '\\311',
'\312' : '\\312', '\313' : '\\313', '\314' : '\\314',
'\315' : '\\315', '\316' : '\\316', '\317' : '\\317',
'\320' : '\\320', '\321' : '\\321', '\322' : '\\322',
'\323' : '\\323', '\324' : '\\324', '\325' : '\\325',
'\326' : '\\326', '\327' : '\\327', '\330' : '\\330',
'\331' : '\\331', '\332' : '\\332', '\333' : '\\333',
'\334' : '\\334', '\335' : '\\335', '\336' : '\\336',
'\337' : '\\337', '\340' : '\\340', '\341' : '\\341',
'\342' : '\\342', '\343' : '\\343', '\344' : '\\344',
'\345' : '\\345', '\346' : '\\346', '\347' : '\\347',
'\350' : '\\350', '\351' : '\\351', '\352' : '\\352',
'\353' : '\\353', '\354' : '\\354', '\355' : '\\355',
'\356' : '\\356', '\357' : '\\357', '\360' : '\\360',
'\361' : '\\361', '\362' : '\\362', '\363' : '\\363',
'\364' : '\\364', '\365' : '\\365', '\366' : '\\366',
'\367' : '\\367', '\370' : '\\370', '\371' : '\\371',
'\372' : '\\372', '\373' : '\\373', '\374' : '\\374',
'\375' : '\\375', '\376' : '\\376', '\377' : '\\377'
}
def quote(str, LegalChars=_LegalChars):
    r"""Return a cookie-header-safe representation of a string.

    A string consisting only of characters from LegalChars is returned
    unchanged.  Anything else is wrapped in double quotes, with special
    characters backslash-escaped via the module translation table.
    """
    needs_quoting = any(c not in LegalChars for c in str)
    if not needs_quoting:
        return str
    escaped = _nulljoin(_Translator.get(ch, ch) for ch in str)
    return '"%s"' % escaped
# \012-style octal escape inside a quoted cookie value
_OctalPatt = re.compile(r"\\[0-3][0-7][0-7]")
# \x-style single-character escape
_QuotePatt = re.compile(r"[\\].")


def unquote(str):
    """Remove string quoting.

    Strips the surrounding double quotes and decodes backslash escapes:
    octal sequences like \\012 and quoted characters like \\" (see
    RFC 2109).  Strings that are not double-quoted are returned as-is.
    """
    # Too short to be quoted, or not wrapped in double quotes: no
    # special characters possible, return unchanged.
    if len(str) < 2 or str[0] != '"' or str[-1] != '"':
        return str
    # Work on the contents between the quotes.
    inner = str[1:-1]
    pieces = []
    pos = 0
    length = len(inner)
    while 0 <= pos < length:
        octal = _OctalPatt.search(inner, pos)
        quoted = _QuotePatt.search(inner, pos)
        if not octal and not quoted:
            # No remaining escape sequences; keep the rest verbatim.
            pieces.append(inner[pos:])
            break
        oct_start = octal.start(0) if octal else -1
        quo_start = quoted.start(0) if quoted else -1
        if quoted and (not octal or quo_start < oct_start):
            # \x --> x
            pieces.append(inner[pos:quo_start])
            pieces.append(inner[quo_start + 1])
            pos = quo_start + 2
        else:
            # \012 --> chr(0o12)
            pieces.append(inner[pos:oct_start])
            pieces.append(chr(int(inner[oct_start + 1:oct_start + 4], 8)))
            pos = oct_start + 4
    return _nulljoin(pieces)
# Predicate: string contains letter/digit, then a dot, then a letter
# (used to recognize dotted domain names).
has_embedded_dot = re.compile(r"[a-zA-Z0-9]\.[a-zA-Z]").search
# Pattern for finding cookie snatched from Pythons Cookie.py
# Modification: allow whitespace in values.
_LegalCharsPatt = r"[\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=]"
# Verbose (?x) regex matching one key[=value] pair of a cookie header.
# NOTE: the "#" comments inside the triple-quoted strings below are
# regex-mode comments and part of the pattern string itself.
_CookiePattern = re.compile(r"""
(?x) # This is a verbose pattern
(?P<key> # Start of group 'key'
""" + _LegalCharsPatt + r"""+? # Any word of at least one letter
) # End of group 'key'
( # Optional group: there may not be a value.
\s*=\s* # Equal Sign
(?P<val> # Start of group 'val'
"(?:[^\\"]|\\.)*" # Any doublequoted string
| # or
\w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr
| # or
""" + _LegalCharsPatt + r"""* # Any word or empty string
) # End of group 'val'
)? # End of optional value group
\s* # Any number of spaces.
(\s+|;|$) # Ending either at space, semicolon, or EOS.
""")
class HttpCookie (object):
"""A cookie consists of one name-value pair with attributes.
Each attribute consists of a predefined name (see attribute_names)
and a value (which is optional for some attributes)."""
# A mapping from the lowercase variant on the left to the
# appropriate traditional formatting on the right.
attribute_names = {
# Old Netscape attribute
"expires": "expires",
# Defined by RFC 2109
"path": "Path",
"comment": "Comment",
"domain": "Domain",
"max-age": "Max-Age",
"secure": "secure",
"version": "Version",
# Additional attributes defined by RFC 2965
"commenturl": "CommentURL",
"discard": "Discard",
"port": "Port",
# httponly to protect against XSS attacks
"httponly": "httponly",
}
def __init__ (self, name, value, attributes=None):
"""Store name, value and attributes. Also calculates expiration
if given in attributes."""
self.name = name
self.value = value
if attributes is None:
self.attributes = {}
else:
self.attributes = attributes
self.calculate_expiration()
def calculate_expiration (self):
"""If "max-age" or "expires" attributes are given, calculate
the time when this cookie expires.
Stores the time value in self.expires, or None if this cookie
does not expire.
"""
# default: do not expire
self.expire = None
if "max-age" in self.attributes:
now = time.time()
try:
maxage = int(self.attributes["max-age"])
if maxage == 0:
# Expire immediately: subtract 1 to be sure since
# some clocks have only full second precision.
self.expire = now - 1
else:
self.expire = now + maxage
except (ValueError, OverflowError):
# note: even self.now + maxage can overflow
pass
elif "expires" in self.attributes:
expiration_date = self.attributes["expires"]
try:
self.expire = cookielib.http2time(expiration_date)
except ValueError:
# see http://bugs.python.org/issue16181
raise CookieError("Invalid expiration date in %r" % expiration_date)
def is_expired (self, now=None):
"""Return True if this cookie is expired, else False."""
if self.expire is None:
# Does not expire.
return False
if now is None:
now = time.time()
return now > self.expire
def __repr__ (self):
"""Return cookie name, value and attributes as string."""
attrs = "; ".join("%s=%r"%(k, v) for k, v in self.attributes.items())
return "<%s %s=%r; %s>" % (self.__class__.__name__,
self.name, self.value, attrs)
def is_valid_for (self, scheme, host, port, path):
"""Check validity of this cookie against the desired scheme,
host and path."""
if self.check_expired() and \
self.check_domain(host) and \
self.check_port(port) and \
self.check_path(path) and \
self.check_secure(scheme):
return True
return False
def check_expired (self):
"""Return False if cookie is expired, else True."""
return not self.is_expired()
def check_domain (self, domain):
"""Return True if given domain matches this cookie, else False."""
if "domain" not in self.attributes:
return False
cdomain = self.attributes["domain"]
if domain == cdomain:
# equality matches
return True
if "." not in domain and domain == cdomain[1:]:
# "localhost" and ".localhost" match
return True
if not domain.endswith(cdomain):
# any suffix matches
return False
if "." in domain[:-(len(cdomain)+1)]:
# prefix must be dot-free
return False
return True
def check_port (self, port):
"""Return True if given port matches this cookie, else False.
For now, this returns always True."""
return True
def check_path (self, path):
"""Return True if given path matches this cookie, else False."""
if "path" not in self.attributes:
return False
return path.startswith(self.attributes["path"])
def check_secure (self, scheme):
"""Return True if given Scheme is allowed for this cookie, else
False."""
if "secure" in self.attributes:
return scheme == "https"
return True
def set_attribute (self, key, value):
"""Helper method to set attribute values. Called when parsing
cookie data.
The attribute key and value are checked, and CookieError is
raised in these cases."""
if self.attributes is None:
raise CookieError("no NAME=VALUE before attributes found")
key = key.lower()
if key not in self.attribute_names:
raise CookieError("invalid attribute %r" % key)
if value:
value = unquote(value)
else:
value = ""
if key == "domain":
value = value.lower()
if not value.startswith(".") and not has_embedded_dot(value):
if "." in value:
raise CookieError("invalid dot in domain %r" % value)
# supply a leading dot
value = "."+value
if key == "max-age":
try:
if int(value) < 0:
raise ValueError("Negative Max-Age")
except (OverflowError, ValueError):
raise CookieError("invalid Max-Age number: %r" % value)
if key == "port":
ports = value.split(",")
for port in ports:
try:
if not (0 <= int(port) <= 65535):
raise ValueError("Invalid port number")
except (OverflowError, ValueError):
raise CookieError("invalid port number: %r" % port)
self.attributes[key] = value
def parse (self, text, patt=_CookiePattern):
"""Parse cookie data."""
text = strformat.ascii_safe(text.rstrip('\r\n'))
# reset values
self.name = None
self.value = None
self.attributes = None
# Our starting point
i = 0
# Length of string
n = len(text)
while 0 <= i < n:
# Start looking for a key-value pair.
match = patt.search(text, i)
if not match:
# No more key-value pairs.
break
key, value = match.group("key"), match.group("val")
if value is None:
value = ""
i = match.end()
# Parse the key, value in case it's metainfo.
if self.name is None:
# Set name and value.
self.name = key
self.value = unquote(value)
self.attributes = {}
else:
if key.startswith("$"):
key = key[1:]
self.set_attribute(key, value)
if self.name is None:
raise CookieError("missing cookie name in %r" % text)
self.calculate_expiration()
def set_default_attributes (self, scheme, host, path):
"""Set domain and path attributes for given scheme, host and
path."""
scheme = strformat.ascii_safe(scheme)
host = strformat.ascii_safe(host)
path = strformat.ascii_safe(path)
if "domain" not in self.attributes:
self.attributes["domain"] = host.lower()
if "path" not in self.attributes:
i = path.rfind("/")
if i == -1:
path = "/"
else:
path = path[:i]
if not path:
path = "/"
self.attributes["path"] = path
if not self.check_domain(host):
cdomain = self.attributes["domain"]
raise CookieError("domain %r not for cookie %r" % (cdomain, host))
if not self.check_path(path):
cpath = self.attributes["path"]
raise CookieError("domain %r not for cookie %r" % (cpath, path))
if not self.check_secure(scheme):
raise CookieError("no secure scheme %r" % scheme)
def quote (self, key, value):
"""Quote value for given key."""
return quote(value)
def server_header_value (self):
"""Return HTTP header value to send to server."""
parts = ["%s=%s" % (self.name, quote(self.value))]
parts.extend(["%s=%s"% (self.attribute_names[k], self.quote(k, v)) \
for k, v in self.attributes.items()])
return "; ".join(parts)
def client_header_value (self):
"""Return HTTP header value to send to client."""
parts = []
if "version" in self.attributes:
parts.append("$Version=%s" % quote(self.attributes["version"]))
parts.append("%s=%s" % (self.name, quote(self.value)))
parts.extend(["$%s=%s"% (self.attribute_names[k], self.quote(k, v)) \
for k, v in self.attributes.items() if k != "version"])
return "; ".join(parts)
class NetscapeCookie (HttpCookie):
    """Parses RFC 2109 (Netscape) cookies."""

    def __init__ (self, text, scheme, host, path):
        """Parse the raw cookie data and fill in default attributes."""
        self.parse(text)
        self.set_default_attributes(scheme, host, path)

    def server_header_name (self):
        """Return "Set-Cookie" as server header name."""
        return "Set-Cookie"

    def __eq__ (self, other):
        """Two Netscape cookies are equal when the names match
        case-insensitively and domain and path are identical."""
        if not isinstance(other, NetscapeCookie):
            return False
        return (self.name.lower() == other.name.lower()
                and self.attributes['domain'] == other.attributes['domain']
                and self.attributes['path'] == other.attributes['path'])

    def __hash__ (self):
        """Hash over lowercase name, domain and path."""
        return hash((
            self.name.lower(),
            self.attributes['domain'],
            self.attributes['path'],
        ))
class Rfc2965Cookie (HttpCookie):
    """Parses RFC 2965 cookies."""

    def __init__ (self, text, scheme, host, path):
        """Parse the raw cookie data and fill in default attributes."""
        self.parse(text)
        self.set_default_attributes(scheme, host, path)

    def check_port (self, port):
        """Return True if given port matches this cookie, else False.
        Without a "port" attribute every port is accepted."""
        try:
            cport = self.attributes["port"]
        except KeyError:
            return True
        allowed = [int(p) for p in cport.split(",")]
        return port in allowed

    def server_header_name (self):
        """Return "Set-Cookie2" as server header name."""
        return "Set-Cookie2"

    def quote (self, key, value):
        """Quote value for given key; port lists are always quoted."""
        if key == "port":
            return quote(value, LegalChars="")
        return quote(value)

    def __eq__ (self, other):
        """Two RFC 2965 cookies are equal when name and domain match
        case-insensitively and the paths are identical."""
        if not isinstance(other, Rfc2965Cookie):
            return False
        return (self.name.lower() == other.name.lower()
                and (self.attributes['domain'].lower()
                     == other.attributes['domain'].lower())
                and self.attributes['path'] == other.attributes['path'])

    def __hash__ (self):
        """Hash over lowercase name, lowercase domain and path."""
        return hash((
            self.name.lower(),
            self.attributes['domain'].lower(),
            self.attributes['path'],
        ))
def from_file (filename):
@ -545,92 +48,21 @@ def from_file (filename):
def from_headers (strheader):
"""Parse cookie data from a string in HTTP header (RFC 2616) format.
@return: tuple (headers, scheme, host, path)
@return: list of cookies
@raises: ValueError for incomplete or invalid data
"""
res = []
fp = StringIO(strheader)
headers = httplib.HTTPMessage(fp, seekable=True)
if "Host" not in headers:
raise ValueError("Required header 'Host:' missing")
host = headers["Host"]
scheme = headers.get("Scheme", "http")
path= headers.get("Path", "/")
return (headers, scheme, host, path)
## Taken and adapted from the _mechanize package included in Twill.
def cookie_str(cookie):
    """Return string representation of Cookie.

    The part order (name/value first, then path, domain, the optional
    flags, sorted nonstandard attributes, and version last) is fixed.
    """
    parts = [
        (cookie.name, unquote(cookie.value)),
        ("path", cookie.path),
        ("domain", cookie.domain),
    ]
    if cookie.port is not None:
        parts.append(("port", cookie.port))
    #if cookie.path_specified: parts.append(("path_spec", None))
    #if cookie.port_specified: parts.append(("port_spec", None))
    #if cookie.domain_initial_dot: parts.append(("domain_dot", None))
    if cookie.secure:
        parts.append(("secure", None))
    if cookie.httponly:
        parts.append(("httponly", None))
    if cookie.expires:
        parts.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        parts.append(("discard", None))
    if cookie.comment:
        parts.append(("comment", cookie.comment))
    if cookie.comment_url:
        parts.append(("commenturl", cookie.comment_url))
    #if cookie.rfc2109: parts.append(("rfc2109", None))
    # nonstandard attributes are emitted in sorted key order
    for key in sorted(cookie.nonstandard_attr_keys()):
        parts.append((key, str(cookie.get_nonstandard_attr(key))))
    parts.append(("version", str(cookie.version)))
    return join_header_words([parts])
def time2isoz(t=None):
    """Format a seconds-since-epoch value as an ISO-style UTC string.

    Called without an argument, the current time is used.
    The result looks like "YYYY-MM-DD hh:mm:ssZ", representing
    Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z
    """
    if t is None:
        t = time.time()
    parts = time.gmtime(t)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % tuple(parts[:6])
# characters that must be backslash-escaped inside a quoted value
join_escape_re = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single
    header value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'
    """
    segments = []
    for pairs in lists:
        words = []
        for key, value in pairs:
            if value is not None:
                if not re.search(r"^\w+$", value):
                    # escape embedded " and \, then wrap in quotes
                    value = join_escape_re.sub(r"\\\1", value)
                    value = '"%s"' % value
                if key is None:
                    # Netscape cookies may have no name
                    word = value
                else:
                    word = "%s=%s" % (key, value)
            else:
                word = key
            words.append(word)
        if words:
            segments.append("; ".join(words))
    return ", ".join(segments)
for header in headers.getallmatchingheaders("Set-Cookie"):
headervalue = header.split(':', 1)[1]
for pairs in cookielib.split_header_words([headervalue]):
for name, value in pairs:
cookie = requests.cookies.create_cookie(name, value,
domain=host, path=path)
res.append(cookie)
return res

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2012 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -19,13 +19,11 @@ Management of checking a queue of links with several threads.
"""
import os
import thread
import urlparse
from cStringIO import StringIO
from .. import log, LOG_CHECK, LinkCheckerInterrupt, cookies, dummy, \
fileutil, strformat
from ..cache import urlqueue, robots_txt, cookie, connection
import time
from .. import log, LOG_CHECK, LinkCheckerInterrupt, dummy, \
fileutil, strformat, plugins
from ..cache import urlqueue, robots_txt
from . import aggregator, console
from ..httplib2 import HTTPMessage
def visit_loginurl (aggregate):
@ -53,7 +51,7 @@ def visit_loginurl (aggregate):
log.warn(LOG_CHECK, _("Error posting form at login URL %(url)s.") % \
{"url": url})
return
store_cookies(tc.get_browser().cj, aggregate.cookies, url)
#XXX store_cookies(tc.get_browser().cj, aggregate.cookies, url)
resulturl = tc.get_browser().get_url()
log.debug(LOG_CHECK, u"URL after POST is %s" % resulturl)
# add result URL to check list
@ -107,18 +105,6 @@ def search_formname (fieldnames, tc):
return None
def store_cookies (cookiejar, cookiecache, url):
"""Store cookies in cookiejar into the cookiecache."""
cookielst = []
for c in cookiejar:
cookielst.append("Set-Cookie2: %s" % cookies.cookie_str(c))
log.debug(LOG_CHECK, "Store cookies %s", cookielst)
headers = HTTPMessage(StringIO("\r\n".join(cookielst)))
urlparts = urlparse.urlsplit(url)
scheme, host, path = urlparts[0:3]
cookiecache.add(headers, scheme, host, path)
def check_urls (aggregate):
"""Main check function; checks all configured URLs until interrupted
with Ctrl-C.
@ -194,14 +180,17 @@ def abort (aggregate):
break
except KeyboardInterrupt:
log.warn(LOG_CHECK, _("user abort; force shutdown"))
aggregate.logger.end_log_output()
abort_now()
def abort_now ():
"""Force exit of current process without cleanup."""
if os.name == 'posix':
# Unix systems can use sigkill
# Unix systems can use signals
import signal
os.kill(os.getpid(), signal.SIGTERM)
time.sleep(1)
os.kill(os.getpid(), signal.SIGKILL)
elif os.name == 'nt':
# NT has os.abort()
@ -214,8 +203,6 @@ def abort_now ():
def get_aggregate (config):
"""Get an aggregator instance with given configuration."""
_urlqueue = urlqueue.UrlQueue(max_allowed_puts=config["maxnumurls"])
connections = connection.ConnectionPool(config.get_connectionlimits(), wait=config["wait"])
cookies = cookie.CookieJar()
_robots_txt = robots_txt.RobotsTxt()
return aggregator.Aggregate(config, _urlqueue, connections,
cookies, _robots_txt)
plugin_manager = plugins.PluginManager(config)
return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager)

View file

@ -17,54 +17,93 @@
"""
Aggregate needed object instances for checker threads.
"""
import time
import threading
from .. import log, LOG_CHECK, strformat
import thread
import requests
import time
import random
from .. import log, LOG_CHECK, strformat, cookies
from ..decorators import synchronized
from ..cache import urlqueue
from . import logger, status, checker, cleanup
from . import logger, status, checker, interrupt
_w3_time_lock = threading.Lock()
_threads_lock = threading.RLock()
_download_lock = threading.Lock()
_hosts_lock = threading.RLock()
def new_request_session(config):
    """Create a new requests.Session for a checker thread.

    If a cookie file is configured, its cookies are merged into the
    session's cookie jar.
    @param config: the configuration mapping; only "cookiefile" is
        read here
    @return: a configured requests.Session instance
    """
    session = requests.Session()
    # XXX proxies
    if config["cookiefile"]:
        # merge_cookies returns the merged jar, so reassign on each
        # iteration to accumulate every cookie from the file.
        for cookie in cookies.from_file(config["cookiefile"]):
            session.cookies = requests.cookies.merge_cookies(session.cookies, cookie)
    return session
class Aggregate (object):
"""Store thread-safe data collections for checker threads."""
def __init__ (self, config, urlqueue, connections, cookies, robots_txt):
def __init__ (self, config, urlqueue, robots_txt, plugin_manager):
"""Store given link checking objects."""
self.config = config
self.urlqueue = urlqueue
self.connections = connections
self.cookies = cookies
self.robots_txt = robots_txt
self.logger = logger.Logger(config)
self.threads = []
self.last_w3_call = 0
self.downloaded_bytes = 0
self.request_sessions = {}
self.robots_txt = robots_txt
self.plugin_manager = plugin_manager
self.times = {}
requests_per_second = config["maxrequestspersecond"]
self.wait_time_min = 1.0 / requests_per_second
self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
@synchronized(_threads_lock)
def start_threads (self):
"""Spawn threads for URL checking and status printing."""
if self.config["status"]:
t = status.Status(self.urlqueue, self.config.status_logger,
self.config["status_wait_seconds"],
self.config["maxrunseconds"])
self.config["status_wait_seconds"])
t.start()
self.threads.append(t)
if self.config["maxrunseconds"]:
t = interrupt.Interrupt(self.config["maxrunseconds"])
t.start()
self.threads.append(t)
t = cleanup.Cleanup(self.connections)
t.start()
self.threads.append(t)
num = self.config["threads"]
if num > 0:
for dummy in range(num):
t = checker.Checker(self.urlqueue, self.logger)
t.start()
t = checker.Checker(self.urlqueue, self.logger, self.add_request_session)
self.threads.append(t)
t.start()
else:
self.request_sessions[thread.get_ident()] = new_request_session(self.config)
checker.check_url(self.urlqueue, self.logger)
    @synchronized(_threads_lock)
    def add_request_session(self):
        """Create and register a requests session for the current
        thread, keyed by thread.get_ident() in self.request_sessions.
        """
        session = new_request_session(self.config)
        self.request_sessions[thread.get_ident()] = session

    @synchronized(_threads_lock)
    def get_request_session(self):
        """Return the request session registered for the current
        thread.
        Raises KeyError if add_request_session() was not called from
        this thread first.
        """
        return self.request_sessions[thread.get_ident()]

    @synchronized(_hosts_lock)
    def wait_for_host(self, host):
        """Throttle requests to one host.

        If the host's next allowed request time lies in the future,
        sleep until then.  Afterwards schedule the following slot a
        random interval in [wait_time_min, wait_time_max] ahead, so
        requests to one host are spread out with some jitter.
        """
        t = time.time()
        if host in self.times:
            due_time = self.times[host]
            if due_time > t:
                wait = due_time - t
                time.sleep(wait)
                t = time.time()
        wait_time = random.uniform(self.wait_time_min, self.wait_time_max)
        self.times[host] = t + wait_time
@synchronized(_threads_lock)
def print_active_threads (self):
"""Log all currently active threads."""
@ -77,8 +116,8 @@ class Aggregate (object):
first = False
log.info(LOG_CHECK, name[12:])
args = dict(
num=len(self.threads),
timeout=strformat.strduration_long(self.config["timeout"]),
num=len([x for x in self.threads if x.getName().startswith("CheckThread-")]),
timeout=strformat.strduration_long(self.config["aborttimeout"]),
)
log.info(LOG_CHECK, _("%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop.") % args)
@ -98,7 +137,7 @@ class Aggregate (object):
"""Print still-active URLs and empty the URL queue."""
self.print_active_threads()
self.cancel()
timeout = self.config["timeout"]
timeout = self.config["aborttimeout"]
try:
self.urlqueue.join(timeout=timeout)
except urlqueue.Timeout:
@ -118,36 +157,9 @@ class Aggregate (object):
self.cancel()
for t in self.threads:
t.stop()
self.connections.clear()
self.gather_statistics()
@synchronized(_threads_lock)
def is_finished (self):
"""Determine if checking is finished."""
self.remove_stopped_threads()
return self.urlqueue.empty() and not self.threads
@synchronized(_w3_time_lock)
def check_w3_time (self):
"""Make sure the W3C validators are at most called once a second."""
if time.time() - self.last_w3_call < 1:
time.sleep(1)
self.last_w3_call = time.time()
@synchronized(_download_lock)
def add_download_data(self, url, data):
"""Add given downloaded data.
@param url: URL which data belongs to
@ptype url: unicode
@param data: downloaded data
@ptype data: string
"""
self.downloaded_bytes += len(data)
def gather_statistics(self):
"""Gather download and cache statistics and send them to the
logger.
"""
robots_txt_stats = self.robots_txt.hits, self.robots_txt.misses
download_stats = self.downloaded_bytes
self.logger.add_statistics(robots_txt_stats, download_stats)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2011 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -36,14 +36,17 @@ def check_url (urlqueue, logger):
class Checker (task.LoggedCheckedTask):
"""URL check thread."""
def __init__ (self, urlqueue, logger):
def __init__ (self, urlqueue, logger, add_request_session):
"""Store URL queue and logger."""
super(Checker, self).__init__(logger)
self.urlqueue = urlqueue
self.origname = self.getName()
self.add_request_session = add_request_session
def run_checked (self):
"""Check URLs in the queue."""
# construct per-thread HTTP/S requests session
self.add_request_session()
while not self.stopped(0):
self.check_url()

View file

@ -1,40 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2007-2011 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Cleanup task."""
import time
from . import task, console
class Cleanup (task.CheckedTask):
"""Cleanup task performing periodic cleanup of cached connections."""
def __init__ (self, connections):
"""Store urlqueue object."""
super(Cleanup, self).__init__()
self.connections = connections
def run_checked (self):
"""Print periodic status messages."""
self.start_time = time.time()
self.setName("Cleanup")
# clean every 15 seconds
while not self.stopped(15):
self.connections.remove_expired()
def internal_error (self):
"""Print internal error to console."""
console.internal_error()

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2013 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -0,0 +1,46 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Raise KeyboardInterrupt after a configurable timeout, as a portable SIGALRM replacement."""
import time
from . import task
from .. import log, LOG_CHECK, strformat
class Interrupt (task.CheckedTask):
    """Thread that raises KeyboardInterrupt after a specified duration.
    This gives us a portable SIGALRM implementation.
    The duration is checked every 5 seconds.
    """

    # polling interval in seconds
    WaitSeconds = 5

    def __init__ (self, duration):
        """Initialize the task.
        @param duration: raise KeyboardInterrupt after given number of seconds
        @ptype duration: int
        """
        super(Interrupt, self).__init__()
        self.duration = duration

    def run_checked (self):
        """Sleep in WaitSeconds slices and raise KeyboardInterrupt once
        the configured duration has elapsed."""
        self.start_time = time.time()
        self.setName("Interrupt")
        while not self.stopped(self.WaitSeconds):
            elapsed = time.time() - self.start_time
            if elapsed > self.duration:
                log.warn(LOG_CHECK, "Interrupt after %s" % strformat.strduration_long(elapsed))
                raise KeyboardInterrupt()

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2012 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -29,7 +29,6 @@ class Logger (object):
self.loggers = [config['logger']]
self.loggers.extend(config['fileoutput'])
self.verbose = config["verbose"]
self.complete = config["complete"]
self.warnings = config["warnings"]
def start_log_output (self):
@ -46,15 +45,8 @@ class Logger (object):
for logger in self.loggers:
logger.end_output()
def add_statistics(self, robots_txt_stats, download_stats):
"""Add statistics to logger."""
for logger in self.loggers:
logger.add_statistics(robots_txt_stats, download_stats)
def do_print (self, url_data):
"""Determine if URL entry should be logged or not."""
if self.complete:
return True
if self.verbose:
return True
if self.warnings and url_data.warnings:

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2012 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -22,7 +22,7 @@ from . import task
class Status (task.LoggedCheckedTask):
"""Thread that gathers and logs the status periodically."""
def __init__ (self, urlqueue, logger, wait_seconds, max_duration):
def __init__ (self, urlqueue, logger, wait_seconds):
"""Initialize the status logger task.
@param urlqueue: the URL queue
@ptype urlqueue: Urlqueue
@ -30,33 +30,27 @@ class Status (task.LoggedCheckedTask):
@ptype logger: console.StatusLogger
@param wait_seconds: interval in seconds to report status
@ptype wait_seconds: int
@param max_duration: abort checking after given number of seconds
@ptype max_duration: int or None
"""
super(Status, self).__init__(logger)
self.urlqueue = urlqueue
self.wait_seconds = wait_seconds
assert self.wait_seconds >= 1
self.first_wait = True
self.max_duration = max_duration
def run_checked (self):
"""Print periodic status messages."""
self.start_time = time.time()
self.setName("Status")
if not self.first_wait:
wait_seconds = self.wait_seconds
else:
# the first status should be after a second
self.first_wait = False
wait_seconds = 1
# the first status should be after a second
wait_seconds = 1
first_wait = True
while not self.stopped(wait_seconds):
self.log_status()
if first_wait:
wait_seconds = self.wait_seconds
first_wait = False
def log_status (self):
"""Log a status message."""
duration = time.time() - self.start_time
if self.max_duration is not None and duration > self.max_duration:
raise KeyboardInterrupt()
checked, in_progress, queue = self.urlqueue.status()
self.logger.log_status(checked, in_progress, queue, duration)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2011 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -16,7 +16,7 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import thread
from ..decorators import notimplemented
from .. import log, LOG_CHECK, threader
from .. import threader
from . import console
@ -28,7 +28,6 @@ class CheckedTask (threader.StoppableThread):
try:
self.run_checked()
except KeyboardInterrupt:
log.warn(LOG_CHECK, "interrupt did not reach the main thread")
thread.interrupt_main()
except Exception:
self.internal_error()

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2011 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -275,6 +275,12 @@ def is_accessable_by_others(filename):
return mode & (stat.S_IRWXG | stat.S_IRWXO)
def is_writable_by_others(filename):
"""Check if file or directory is world writable."""
mode = os.stat(filename)[stat.ST_MODE]
return mode & stat.S_IWOTH
@memoized
def is_writable(filename):
"""Check if

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2010 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -23,7 +23,7 @@ from PyQt4 import QtCore, QtGui
from .linkchecker_ui_main import Ui_MainWindow
from .properties import set_properties, clear_properties
from .statistics import set_statistics, clear_statistics
from .debug import LinkCheckerDebug, LinkCheckerDebugMemory
from .debug import LinkCheckerDebug
from .logger import SignalLogger, GuiLogHandler, StatusLogger
from .help import HelpWindow
from .options import LinkCheckerOptions
@ -37,7 +37,7 @@ from .settings import Settings
from .recentdocs import RecentDocumentModel
from .projects import openproject, saveproject, loadproject, ProjectExt
from .. import configuration, checker, director, get_link_pat, \
strformat, fileutil, LinkCheckerError, memoryutil
strformat, fileutil, LinkCheckerError
from ..containers import enum
from .. import url as urlutil
from ..checker import httpheaders
@ -99,7 +99,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
# init subdialogs
self.options = LinkCheckerOptions(parent=self)
self.debug = LinkCheckerDebug(parent=self)
self.debugmemory = LinkCheckerDebugMemory(parent=self)
self.checker = CheckerThread(parent=self)
self.contextmenu = ContextMenu(parent=self)
self.editor = EditorWindow(parent=self)
@ -175,8 +174,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
def set_idle ():
"""Set application status to idle."""
self.status = Status.idle
if self.config["debugmemory"]:
self.dump_memory()
self.set_statusmsg(_("Check finished."))
self.controlButton.clicked.disconnect(self.checker.cancel)
self.checker.finished.connect(set_idle)
@ -250,7 +247,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
self.config["threads"] = 1
else:
self.config.reset_loglevel()
self.config["debugmemory"] = data["debugmemory"]
if data["warninglines"]:
lines = data["warninglines"].splitlines()
ro = re.compile(warninglines2regex(lines))
@ -313,7 +309,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
elif status == Status.checking:
self.treeView.setSortingEnabled(False)
self.debug.reset()
self.debugmemory.reset()
self.set_statusmsg(u"Checking site...")
# disable commands
self.menubar.setEnabled(False)
@ -423,7 +418,7 @@ Version 2 or later.
def cancel (self):
"""Note that checking is canceled."""
self.controlButton.setEnabled(False)
duration = strformat.strduration_long(self.config["timeout"])
duration = strformat.strduration_long(self.config["aborttimeout"])
self.set_statusmsg(_(u"Closing active URLs with timeout %s...") % duration)
@QtCore.pyqtSlot()
@ -436,16 +431,6 @@ Version 2 or later.
else:
raise ValueError("Invalid application status %r" % self.status)
def dump_memory (self):
"""Dump memory to temporary file and inform user with a modal
dialog where the file is."""
self.set_statusmsg(_(u"Dumping memory statistics..."))
filename = memoryutil.write_memory_dump()
title = _(u"LinkChecker memory dump written")
message = _(u"The memory dump has been written to `%(filename)s'.")
attrs = dict(filename=filename)
QtGui.QMessageBox.information(self, title, message % attrs)
def get_url (self):
"""Return URL to check from the urlinput widget."""
url = strformat.stripurl(unicode(self.urlinput.text()))
@ -524,9 +509,10 @@ Version 2 or later.
"""View URL source in editor window."""
self.editor.setWindowTitle(u"View %s" % url)
self.editor.setUrl(url)
info, data = urlutil.get_content(url, proxy=self.config["proxy"])
if (info, data) == (None, None):
self.editor.setText(u"An error occurred retreiving URL `%s'." % url)
data, info = urlutil.get_content(url, proxy=self.config["proxy"])
if data is None:
msg = u"An error occurred retreiving URL `%s': %s." % (url, info)
self.editor.setText(msg)
else:
content_type = httpheaders.get_content_type(info)
if not content_type:

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2008-2011 Bastian Kleineidam
# Copyright (C) 2008-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2011 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2012 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -41,23 +41,3 @@ class LinkCheckerDebug (QtGui.QDialog, Ui_DebugDialog):
def getText (self):
"""Get debug info as string."""
return self.textEdit.toPlainText()
class LinkCheckerDebugMemory (QtGui.QDialog, Ui_DebugDialog):
"""Show memory debugging output."""
def __init__ (self, parent=None):
"""Setup the debug memory dialog."""
super(LinkCheckerDebugMemory, self).__init__(parent)
self.setupUi(self)
font = QtGui.QFont("Consolas", 11)
font.setFixedPitch(True)
self.textEdit.document().setDefaultFont(font)
def reset (self):
"""Clear memory info."""
self.textEdit.clear()
def setText (self, text):
"""Set memory debug info."""
return self.textEdit.setPlainText(text)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2011 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -2,8 +2,8 @@
# Form implementation generated from reading ui file 'ui/debug.ui'
#
# Created: Mon Dec 12 19:00:37 2011
# by: PyQt4 UI code generator 4.8.6
# Created: Fri Feb 28 21:24:59 2014
# by: PyQt4 UI code generator 4.9.3
#
# WARNING! All changes made in this file will be lost!
@ -19,7 +19,6 @@ class Ui_DebugDialog(object):
DebugDialog.setObjectName(_fromUtf8("DebugDialog"))
DebugDialog.setWindowModality(QtCore.Qt.ApplicationModal)
DebugDialog.resize(564, 547)
DebugDialog.setWindowTitle(_("LinkChecker debug log"))
self.verticalLayout = QtGui.QVBoxLayout(DebugDialog)
self.verticalLayout.setObjectName(_fromUtf8("verticalLayout"))
self.frame = QtGui.QFrame(DebugDialog)
@ -40,5 +39,5 @@ class Ui_DebugDialog(object):
QtCore.QMetaObject.connectSlotsByName(DebugDialog)
def retranslateUi(self, DebugDialog):
pass
DebugDialog.setWindowTitle(_("LinkChecker debug log"))

View file

@ -2,7 +2,7 @@
# Form implementation generated from reading ui file 'ui/main.ui'
#
# Created: Tue Nov 6 21:47:39 2012
# Created: Fri Feb 28 21:24:58 2014
# by: PyQt4 UI code generator 4.9.3
#
# WARNING! All changes made in this file will be lost!
@ -679,29 +679,6 @@ class Ui_MainWindow(object):
self.stats_url_maxlen.setOpenExternalLinks(True)
self.stats_url_maxlen.setObjectName(_fromUtf8("stats_url_maxlen"))
self.gridLayout_3.addWidget(self.stats_url_maxlen, 1, 1, 1, 1)
self.label_14 = QtGui.QLabel(self.groupBox_2)
sizePolicy = QtGui.QSizePolicy(QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Preferred)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(self.label_14.sizePolicy().hasHeightForWidth())
self.label_14.setSizePolicy(sizePolicy)
self.label_14.setAlignment(QtCore.Qt.AlignRight|QtCore.Qt.AlignTrailing|QtCore.Qt.AlignVCenter)
self.label_14.setObjectName(_fromUtf8("label_14"))
self.gridLayout_3.addWidget(self.label_14, 1, 2, 1, 1)
self.stats_domains = QtGui.QLabel(self.groupBox_2)
sizePolicy = QtGui.QSizePolicy(QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Preferred)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(self.stats_domains.sizePolicy().hasHeightForWidth())
self.stats_domains.setSizePolicy(sizePolicy)
self.stats_domains.setMinimumSize(QtCore.QSize(30, 0))
self.stats_domains.setFrameShape(QtGui.QFrame.StyledPanel)
self.stats_domains.setFrameShadow(QtGui.QFrame.Sunken)
self.stats_domains.setText(_fromUtf8(""))
self.stats_domains.setTextFormat(QtCore.Qt.RichText)
self.stats_domains.setOpenExternalLinks(True)
self.stats_domains.setObjectName(_fromUtf8("stats_domains"))
self.gridLayout_3.addWidget(self.stats_domains, 1, 3, 1, 1)
self.verticalLayout_2.addWidget(self.groupBox_2)
self.horizontalLayout.addWidget(self.statistics)
self.verticalLayout.addLayout(self.horizontalLayout)
@ -831,7 +808,6 @@ class Ui_MainWindow(object):
self.label_18.setText(_("Min. length"))
self.label_20.setText(_("Avg. length"))
self.label_19.setText(_("Max. length"))
self.label_14.setText(_("Domains"))
self.menuEdit.setTitle(_("&Edit"))
self.menuFile.setTitle(_("&File"))
self.menuHelp.setTitle(_("&Help"))

View file

@ -2,8 +2,8 @@
# Form implementation generated from reading ui file 'ui/options.ui'
#
# Created: Sun Jun 10 11:51:42 2012
# by: PyQt4 UI code generator 4.9.1
# Created: Fri Feb 28 21:24:59 2014
# by: PyQt4 UI code generator 4.9.3
#
# WARNING! All changes made in this file will be lost!
@ -28,6 +28,7 @@ class Ui_Options(object):
self.widget = QtGui.QWidget(self.groupBox_2)
self.widget.setObjectName(_fromUtf8("widget"))
self.formLayout = QtGui.QFormLayout(self.widget)
self.formLayout.setFieldGrowthPolicy(QtGui.QFormLayout.ExpandingFieldsGrow)
self.formLayout.setMargin(0)
self.formLayout.setObjectName(_fromUtf8("formLayout"))
self.label = QtGui.QLabel(self.widget)
@ -60,14 +61,6 @@ class Ui_Options(object):
self.debug.setText(_fromUtf8(""))
self.debug.setObjectName(_fromUtf8("debug"))
self.formLayout.setWidget(2, QtGui.QFormLayout.FieldRole, self.debug)
self.label_7 = QtGui.QLabel(self.widget)
self.label_7.setToolTip(_fromUtf8(""))
self.label_7.setObjectName(_fromUtf8("label_7"))
self.formLayout.setWidget(3, QtGui.QFormLayout.LabelRole, self.label_7)
self.debugmemory = QtGui.QCheckBox(self.widget)
self.debugmemory.setText(_fromUtf8(""))
self.debugmemory.setObjectName(_fromUtf8("debugmemory"))
self.formLayout.setWidget(3, QtGui.QFormLayout.FieldRole, self.debugmemory)
self.verticalLayout.addWidget(self.widget)
spacerItem = QtGui.QSpacerItem(20, 10, QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Expanding)
self.verticalLayout.addItem(spacerItem)
@ -143,7 +136,6 @@ class Ui_Options(object):
self.label_2.setText(_("Verbose output"))
self.verbose.setToolTip(_("Log all checked URLs once. Default is to log only errors and warnings."))
self.label_4.setText(_("Debug"))
self.label_7.setText(_("Debug memory usage"))
self.label_5.setText(_("Warn when one of these strings are found (one per line):"))
self.label_6.setText(_("Ignore URLs matching one of these patterns (one per line):"))
self.groupBox.setTitle(_("Configuration file"))

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2012 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -19,7 +19,7 @@ import os
from PyQt4 import QtGui
from .linkchecker_ui_options import Ui_Options
from .editor import EditorWindow
from ..fileutil import is_writable, has_module
from ..fileutil import is_writable
from .. import configuration
@ -46,11 +46,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
self.recursionlevel.setValue(-1)
self.verbose.setChecked(False)
self.debug.setChecked(False)
self.debugmemory.setChecked(False)
if not has_module("meliae"):
self.debugmemory.setEnabled(False)
from ..memoryutil import MemoryDebugMsg
self.debugmemory.setToolTip(MemoryDebugMsg)
self.warninglines.setPlainText(u"")
self.ignorelines.setPlainText(u"")
@ -69,7 +64,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
"""Return option data as dictionary."""
return dict(
debug=self.debug.isChecked(),
debugmemory=self.debugmemory.isChecked(),
verbose=self.verbose.isChecked(),
recursionlevel=self.recursionlevel.value(),
warninglines=unicode(self.warninglines.toPlainText()),
@ -80,8 +74,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
"""Set GUI options from given data."""
if data.get("debug") is not None:
self.debug.setChecked(data["debug"])
if data.get("debugmemory") is not None:
self.debugmemory.setChecked(data["debugmemory"])
if data.get("verbose") is not None:
self.verbose.setChecked(data["verbose"])
if data.get("recursionlevel") is not None:

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -57,9 +57,6 @@ class ProjectParser (confparse.LCConfigParser):
return
data = {}
option = "debug"
if self.has_option(section, option):
data[option] = self.getboolean(section, option)
option = "debugmemory"
if self.has_option(section, option):
data[option] = self.getboolean(section, option)
option = "verbose"

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -38,8 +38,8 @@ def set_properties (widget, data):
widget.prop_dltime.setText(_("%.3f seconds") % data.dltime)
else:
widget.prop_dltime.setText(u"")
if data.dlsize >= 0:
widget.prop_size.setText(strformat.strsize(data.dlsize))
if data.size >= 0:
widget.prop_size.setText(strformat.strsize(data.size))
else:
widget.prop_size.setText(u"")
if data.modified:

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -85,10 +85,10 @@ class Settings (object):
def read_options (self):
"""Return stored GUI options."""
data = dict(debug=None, debugmemory=None, verbose=None,
data = dict(debug=None, verbose=None,
recursionlevel=None, warninglines=None, ignorelines=None)
self.settings.beginGroup('output')
for key in ("debug", "debugmemory", "verbose"):
for key in ("debug", "verbose"):
if self.settings.contains(key):
data[key] = self.settings.value(key).toBool()
self.settings.endGroup()
@ -116,7 +116,7 @@ class Settings (object):
def save_options (self, data):
"""Save GUI options."""
self.settings.beginGroup('output')
for key in ("debug", "debugmemory", "verbose"):
for key in ("debug", "verbose"):
self.settings.setValue(key, QtCore.QVariant(data[key]))
self.settings.endGroup()
self.settings.beginGroup('checking')

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2011 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -19,7 +19,6 @@ from ..logger import ContentTypes
def set_statistics (widget, statistics):
"""Set statistic information in given widget."""
widget.stats_domains.setText(u"%d" % len(statistics.domains))
widget.stats_url_minlen.setText(u"%d" % statistics.min_url_length)
widget.stats_url_maxlen.setText(u"%d" % statistics.max_url_length)
widget.stats_url_avglen.setText(u"%d" % statistics.avg_url_length)
@ -38,7 +37,6 @@ def set_statistics (widget, statistics):
def clear_statistics (widget):
"""Reset statistic information in given widget."""
widget.stats_domains.setText(u"")
widget.stats_url_minlen.setText(u"")
widget.stats_url_maxlen.setText(u"")
widget.stats_url_avglen.setText(u"")

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1402,53 +1402,6 @@
</property>
</widget>
</item>
<item row="1" column="2">
<widget class="QLabel" name="label_14">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Domains</string>
</property>
<property name="alignment">
<set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
</property>
</widget>
</item>
<item row="1" column="3">
<widget class="QLabel" name="stats_domains">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="minimumSize">
<size>
<width>30</width>
<height>0</height>
</size>
</property>
<property name="frameShape">
<enum>QFrame::StyledPanel</enum>
</property>
<property name="frameShadow">
<enum>QFrame::Sunken</enum>
</property>
<property name="text">
<string/>
</property>
<property name="textFormat">
<enum>Qt::RichText</enum>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View file

@ -29,6 +29,9 @@
<item>
<widget class="QWidget" name="widget" native="true">
<layout class="QFormLayout" name="formLayout">
<property name="fieldGrowthPolicy">
<enum>QFormLayout::ExpandingFieldsGrow</enum>
</property>
<item row="0" column="0">
<widget class="QLabel" name="label">
<property name="toolTip">
@ -104,23 +107,6 @@
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="label_7">
<property name="toolTip">
<string extracomment="When checking finishes, write a memory dump to a temporary file. The memory dump is written both when checking finishes normally and when checking gets canceled."/>
</property>
<property name="text">
<string>Debug memory usage</string>
</property>
</widget>
</item>
<item row="3" column="1">
<widget class="QCheckBox" name="debugmemory">
<property name="text">
<string/>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2011 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2008-2009 Bastian Kleineidam
# Copyright (C) 2008-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2001-2010 Bastian Kleineidam
# Copyright (C) 2001-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -201,9 +201,7 @@ class LinkFinder (TagFinder):
def start_element (self, tag, attrs):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d",
self.parser.lineno(), self.parser.column(),
self.parser.last_lineno(), self.parser.last_column())
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
if tag == "base" and not self.base_ref:
self.base_ref = unquote(attrs.get_true("href", u''))
tagattrs = self.tags.get(tag, [])
@ -282,7 +280,6 @@ class LinkFinder (TagFinder):
return
for u in urls:
assert isinstance(u, unicode) or u is None, repr(u)
log.debug(LOG_CHECK,
u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
log.debug(LOG_CHECK, u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
self.callback(u, self.parser.last_lineno(),
self.parser.last_column(), name, base)

File diff suppressed because it is too large Load diff

View file

@ -1,86 +1,9 @@
# -*- coding: iso-8859-1 -*-
# Various HTTP utils with a free license
from cStringIO import StringIO
from . import gzip2 as gzip
from . import httplib2 as httplib
from . import log, LOG_CHECK, fileutil
import re
import zlib
import urllib
import urllib2
from . import fileutil
import base64
###########################################################################
# urlutils.py - Simplified urllib handling
#
# Written by Chris Lawrence <lawrencc@debian.org>
# (C) 1999-2002 Chris Lawrence
#
# This program is freely distributable per the following license:
#
## Permission to use, copy, modify, and distribute this software and its
## documentation for any purpose and without fee is hereby granted,
## provided that the above copyright notice appears in all copies and that
## both that copyright notice and this permission notice appear in
## supporting documentation.
##
## I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL I
## BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
## DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
## SOFTWARE.
def decode (page):
"""Gunzip or deflate a compressed page."""
log.debug(LOG_CHECK, "page info %d %s", page.code, str(page.info()))
encoding = page.info().get("Content-Encoding")
if encoding in ('gzip', 'x-gzip', 'deflate'):
# cannot seek in socket descriptors, so must get content now
content = page.read()
try:
if encoding == 'deflate':
fp = StringIO(zlib.decompress(content))
else:
fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
except zlib.error as msg:
log.debug(LOG_CHECK, "uncompressing had error "
"%s, assuming non-compressed content", str(msg))
fp = StringIO(content)
# remove content-encoding header
headers = httplib.HTTPMessage(StringIO(""))
ceheader = re.compile(r"(?i)content-encoding:")
for h in page.info().keys():
if not ceheader.match(h):
headers[h] = page.info()[h]
newpage = urllib.addinfourl(fp, headers, page.geturl())
newpage.code = page.code
newpage.msg = page.msg
return newpage
return page
class HttpWithGzipHandler (urllib2.HTTPHandler):
"""Support gzip encoding."""
def http_open (self, req):
"""Send request and decode answer."""
return decode(urllib2.HTTPHandler.http_open(self, req))
if hasattr(httplib, 'HTTPS'):
class HttpsWithGzipHandler (urllib2.HTTPSHandler):
"""Support gzip encoding."""
def https_open (self, req):
"""Send request and decode answer."""
return decode(urllib2.HTTPSHandler.https_open(self, req))
# end of urlutils.py routines
###########################################################################
def encode_multipart_formdata(fields, files=None):
"""
From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -172,7 +172,7 @@ def get_configuration(form, out):
config["logger"] = config.logger_new('html', fd=out, encoding=HTML_ENCODING)
config["threads"] = 2
if "anchors" in form:
config["anchors"] = True
config["enabledplugins"].append("AnchorCheck")
if "errors" not in form:
config["verbose"] = True
# avoid checking of local files or other nasty stuff
@ -246,15 +246,16 @@ def format_error (why):
@return: HTML page content
@rtype: unicode
"""
return _("""<html><head>
return _("""<!DOCTYPE HTML>
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>LinkChecker Online Error</title></head>
<body text=#192c83 bgcolor=#fff7e5 link=#191c83 vlink=#191c83 alink=#191c83>
<blockquote>
<b>Error: %s</b><br>
<b>Error: %s</b><br/>
The LinkChecker Online script has encountered an error. Please ensure
that your provided URL link begins with <code>http://</code> and
contains only these characters: <code>A-Za-z0-9./_~-</code><br><br>
contains only these characters: <code>A-Za-z0-9./_~-</code><br/><br/>
Errors are logged.
</blockquote>
</body>

Some files were not shown because too many files have changed in this diff Show more