mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-17 06:20:27 +00:00
Introduce check plugins, use Python requests for http/s connections, and some code cleanups and improvements.
This commit is contained in:
parent
adc17fbe77
commit
7b34be590b
194 changed files with 4817 additions and 8903 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -36,3 +36,4 @@ Changelog.linkchecker*
|
|||
/todo
|
||||
/alexa*.log
|
||||
/testresults.txt
|
||||
/linkchecker.prof
|
||||
|
|
|
|||
6
Makefile
6
Makefile
|
|
@ -18,11 +18,11 @@ DEBORIGFILE:=$(DEBUILDDIR)/$(LAPPNAME)_$(VERSION).orig.tar.xz
|
|||
DEBPACKAGEDIR:=$(DEBUILDDIR)/$(APPNAME)-$(VERSION)
|
||||
FILESCHECK_URL:=http://localhost/~calvin/
|
||||
SRCDIR:=${HOME}/src
|
||||
PY_FILES_DIRS:=linkcheck tests *.py linkchecker linkchecker-nagios linkchecker-gui cgi-bin config doc
|
||||
PY_FILES_DIRS:=linkcheck tests *.py linkchecker linkchecker-nagios linkchecker-gui cgi-bin config doc/examples
|
||||
MYPY_FILES_DIRS:=linkcheck/HtmlParser linkcheck/checker \
|
||||
linkcheck/cache linkcheck/configuration linkcheck/director \
|
||||
linkcheck/htmlutil linkcheck/logger linkcheck/network \
|
||||
linkcheck/bookmarks \
|
||||
linkcheck/bookmarks linkcheck/plugins linkcheck/parser \
|
||||
linkcheck/gui/__init__.py \
|
||||
linkcheck/gui/checker.py \
|
||||
linkcheck/gui/contextmenu.py \
|
||||
|
|
@ -192,7 +192,7 @@ filescheck: localbuild
|
|||
done
|
||||
|
||||
update-copyright:
|
||||
update-copyright --holder="Bastian Kleineidam"
|
||||
update-copyright --holder="Bastian Kleineidam" $(PY_FILES_DIRS)
|
||||
|
||||
releasecheck: check update-certificates
|
||||
@if egrep -i "xx\.|xxxx|\.xx" doc/changelog.txt > /dev/null; then \
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ create table linksdb (
|
|||
name varchar(256),
|
||||
checktime int,
|
||||
dltime int,
|
||||
dlsize int,
|
||||
size int,
|
||||
cached int,
|
||||
level int not null,
|
||||
modified varchar(256)
|
||||
|
|
|
|||
|
|
@ -131,32 +131,18 @@
|
|||
#threads=100
|
||||
# connection timeout in seconds
|
||||
#timeout=60
|
||||
# check anchors?
|
||||
#anchors=0
|
||||
# Time to wait for checks to finish after the user aborts the first time
|
||||
# (with Ctrl-C or the abort button).
|
||||
#aborttimeout=300
|
||||
# The recursion level determines how many times links inside pages are followed.
|
||||
#recursionlevel=1
|
||||
# supply a regular expression for which warnings are printed if found
|
||||
# in any HTML files.
|
||||
#warningregex=(Oracle DB Error|Page Not Found|badsite\.example\.com)
|
||||
# Basic NNTP server. Overrides NNTP_SERVER environment variable.
|
||||
# warn if size info exceeds given maximum of bytes
|
||||
#warnsizebytes=2000
|
||||
#nntpserver=
|
||||
# check HTML or CSS syntax with the W3C online validator
|
||||
#checkhtml=1
|
||||
#checkcss=1
|
||||
# scan URL content for viruses with ClamAV
|
||||
#scanvirus=1
|
||||
# ClamAV config file
|
||||
#clamavconf=/etc/clamav/clamd.conf
|
||||
# Send and store cookies
|
||||
#cookies=1
|
||||
# parse a cookiefile for initial cookie data
|
||||
#cookiefile=/path/to/cookies.txt
|
||||
# User-Agent header string to send to HTTP web servers
|
||||
# Note that robots.txt are always checked with the original User-Agent.
|
||||
#useragent=Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
|
||||
# Pause the given number of seconds between two subsequent connection
|
||||
# requests to the same host.
|
||||
#pause=0
|
||||
# When checking finishes, write a memory dump to a temporary file.
|
||||
# The memory dump is written both when checking finishes normally
|
||||
# and when checking gets canceled.
|
||||
|
|
@ -175,22 +161,16 @@
|
|||
# Check SSL certificates. Set to an absolute pathname for a custom
|
||||
# CA cert bundle to use. Set to zero to disable SSL certificate verification.
|
||||
#sslverify=1
|
||||
# Check that SSL certificates are at least the given number of days valid.
|
||||
# The number must not be negative.
|
||||
# If the number of days is zero a warning is printed only for certificates
|
||||
# that are already expired.
|
||||
# The default number of days is 14.
|
||||
#sslcertwarndays=14
|
||||
# Stop checking new URLs after the given number of seconds. Same as if the
|
||||
# user hits Ctrl-C after X seconds.
|
||||
#maxrunseconds=600
|
||||
# Maximum number of URLs to check. New URLs will not be queued after the
|
||||
# given number of URLs is checked.
|
||||
#maxnumurls=153
|
||||
# Maximum number of connections to one single host for different connection types.
|
||||
#maxconnectionshttp=10
|
||||
#maxconnectionshttps=10
|
||||
#maxconnectionsftp=2
|
||||
# Maximum number of requests per second to one host.
|
||||
#maxrequestspersecond=10
|
||||
# Allowed URL schemes as a comma-separated list.
|
||||
#allowedschemes=http,https
|
||||
|
||||
##################### filtering options ##########################
|
||||
[filtering]
|
||||
|
|
@ -211,11 +191,12 @@
|
|||
# recognized warnings). Add a comma-separated list of warnings here
|
||||
# that prevent a valid URL from being logged. Note that the warning
|
||||
# will be logged in invalid URLs.
|
||||
#ignorewarnings=url-unicode-domain,anchor-not-found
|
||||
#ignorewarnings=url-unicode-domain
|
||||
# Regular expression to add more URLs recognized as internal links.
|
||||
# Default is that URLs given on the command line are internal.
|
||||
|
||||
#internlinks=^http://www\.example\.net/
|
||||
# Check external links
|
||||
#checkextern=1
|
||||
|
||||
|
||||
##################### password authentication ##########################
|
||||
|
|
@ -247,3 +228,30 @@
|
|||
#loginextrafields=
|
||||
# name1:value1
|
||||
# name 2:value 2
|
||||
|
||||
############################ Plugins ###################################
|
||||
#
|
||||
# uncomment sections to enable plugins
|
||||
|
||||
# Check HTML anchors
|
||||
#[AnchorCheck]
|
||||
|
||||
# Add country info to URLs
|
||||
#[LocationInfo]
|
||||
|
||||
# Run W3C syntax checks
|
||||
#[CssSyntaxCheck]
|
||||
#[HtmlSyntaxCheck]
|
||||
|
||||
# Search for regular expression in page contents
|
||||
#[RegexCheck]
|
||||
#warningregex=Oracle Error
|
||||
|
||||
# Search for viruses in page contents
|
||||
#[VirusCheck]
|
||||
#clamavconf=/etc/clamav/clam.conf
|
||||
|
||||
# Check that SSL certificates are at least the given number of days valid.
|
||||
#[SslCertificateCheck]
|
||||
#sslcertwarndays=14
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,34 @@
|
|||
8.7 "" (released xx.xx.2014)
|
||||
|
||||
Features:
|
||||
- checking: Support connection and content check plugins.
|
||||
- checking: Move lots of custom checks like Antivirus and syntax
|
||||
checks into plugins (see upgrading.txt for more info).
|
||||
- checking: Add options to limit the number of requests per second,
|
||||
allowed URL schemes and maximum file or download size.
|
||||
|
||||
Changes:
|
||||
- checking: Use the Python requests module for HTTP and HTTPS requests.
|
||||
- logging: Removed download, domains and robots.txt statistics.
|
||||
- logging: HTML output is now in HTML5.
|
||||
- checking: Removed 301 warning since 301 redirects are used
|
||||
a lot without updating the old URL links.
|
||||
- checking: Disallowed access by robots.txt is an info now, not
|
||||
a warning. Otherwise it produces a lot of warnings which
|
||||
is counter-productive.
|
||||
- checking: Do not check SMTP connections for mailto: URLs anymore.
|
||||
It resulted in lots of false warnings since spam prevention
|
||||
usually disallows direct SMTP connections from unrecognized
|
||||
client IPs.
|
||||
- checking: Only internal URLs are checked as default. To check
|
||||
external urls use --check-extern.
|
||||
|
||||
Fixes:
|
||||
- logging: Status was printed every second regardless of the
|
||||
configured wait time.
|
||||
- checking: Several speed and memory usage improvements.
|
||||
|
||||
|
||||
8.6 "About Time" (released 8.1.2014)
|
||||
|
||||
Changes:
|
||||
|
|
|
|||
|
|
@ -41,16 +41,15 @@ Antivirusprüfung
|
|||
.IP \(bu
|
||||
ein Kommandozeilenprogramm, GUI und web interface
|
||||
.SH BEISPIELE
|
||||
Der häufigste Gebrauchsfall prüft die angegebene Domäne rekursiv,
|
||||
inklusive aller einzelnen nach außen zeigenden Verknüpfungen:
|
||||
\fBlinkchecker http://www.example.net/\fP
|
||||
The most common use checks the given domain recursively:
|
||||
\fBlinkchecker http://www.example.com/\fP
|
||||
.br
|
||||
Beachten Sie dass dies die komplette Domäne überprüft, welche aus mehreren
|
||||
tausend URLs bestehen kann. Benutzen Sie die Option \fB\-r\fP, um die
|
||||
Rekursionstiefe zu beschränken.
|
||||
.br
|
||||
Prüfe keine \fBmailto:\fP URLs. Alle anderen Verknüpfungen werden wie üblich geprüft:
|
||||
\fBlinkchecker \-\-ignore\-url=^mailto: mysite.example.org\fP
|
||||
Don't check URLs with \fB/secret\fP in its name. All other links are checked as usual:
|
||||
\fBlinkchecker \-\-ignore\-url=/secret mysite.example.com\fP
|
||||
.br
|
||||
Überprüfung einer lokalen HTML Datei unter Unix:
|
||||
\fBlinkchecker ../bla.html\fP
|
||||
|
|
@ -61,8 +60,8 @@ Prüfe keine \fBmailto:\fP URLs. Alle anderen Verknüpfungen werden wie üblich
|
|||
Sie können den \fBhttp://\fP URL Anteil weglassen wenn die Domäne mit \fBwww.\fP beginnt:
|
||||
\fBlinkchecker www.example.com\fP
|
||||
.br
|
||||
Sie können den \fBftp://\fP URL Anteil weglassen wenn die Domäne mit \fBftp.\fP beginnt:
|
||||
\fBlinkchecker \-r0 ftp.example.org\fP
|
||||
You can skip the \fBftp://\fP url part if the domain starts with \fBftp.\fP:
|
||||
\fBlinkchecker \-r0 ftp.example.com\fP
|
||||
.br
|
||||
Erzeuge einen Sitemap Graphen und konvertiere ihn mit dem graphviz dot Programm:
|
||||
\fBlinkchecker \-odot \-v www.example.com | dot \-Tps > sitemap.ps\fP
|
||||
|
|
@ -88,19 +87,12 @@ positive Nummer an.
|
|||
.TP
|
||||
\fB\-V\fP, \fB\-\-version\fP
|
||||
Gebe die Version aus und beende das Programm.
|
||||
.TP
|
||||
\fB\-\-list\-plugins\fP
|
||||
Print available check plugins and exit.
|
||||
.
|
||||
.SS Ausgabeoptionen
|
||||
.TP
|
||||
\fB\-\-check\-css\fP
|
||||
Prüfe Syntax von CSS URLs mit dem W3C Online Validator.
|
||||
.TP
|
||||
\fB\-\-check\-html\fP
|
||||
Prüfe Syntax von HTML URLs mit dem W3C Online Validator.
|
||||
.TP
|
||||
\fB\-\-complete\fP
|
||||
Gebe alle geprüften URLs aus. Standard ist es, doppelte URLs nur einmal
|
||||
auszugeben.
|
||||
.TP
|
||||
\fB\-D\fP\fINAME\fP, \fB\-\-debug=\fP\fINAME\fP
|
||||
Gebe Testmeldungen aus für den angegebenen Logger. Verfügbare Logger sind
|
||||
\fBcmdline\fP, \fBchecking\fP,\fBcache\fP, \fBgui\fP, \fBdns\fP und \fBall\fP. Die Angabe
|
||||
|
|
@ -144,12 +136,6 @@ lokalen Spracheinstellung. Gültige Enkodierungen sind aufgelistet unter
|
|||
Keine Ausgabe, ein Alias für \fB\-o none\fP. Dies ist nur in Verbindung mit
|
||||
\fB\-F\fP nützlich.
|
||||
.TP
|
||||
\fB\-\-scan\-virus\fP
|
||||
Prüfe Inhalt von URLs auf Viren mit ClamAV.
|
||||
.TP
|
||||
\fB\-\-trace\fP
|
||||
Trace\-Information ausgeben.
|
||||
.TP
|
||||
\fB\-v\fP, \fB\-\-verbose\fP
|
||||
Gebe alle geprüften URLs aus. Standard ist es, nur fehlerhafte URLs und
|
||||
Warnungen auszugeben.
|
||||
|
|
@ -168,27 +154,15 @@ werden können, zum Beispiel "(Diese Seite ist umgezogen|Oracle
|
|||
Applikationsfehler)".
|
||||
.br
|
||||
Siehe Abschnitt \fBREGULAR EXPRESSIONS\fP für weitere Infos.
|
||||
.TP
|
||||
\fB\-\-warning\-size\-bytes=\fP\fINUMMER\fP
|
||||
Gebe eine Warnung aus, wenn die Inhaltsgröße bekannt ist und die angegebene
|
||||
Anzahl von Bytes übersteigt.
|
||||
.
|
||||
.SS "Optionen zum Prüfen"
|
||||
.TP
|
||||
\fB\-a\fP, \fB\-\-anchors\fP
|
||||
Prüfe HTTP Ankerverweise. Standard ist, Ankerverweise nicht zu prüfen. Diese
|
||||
Option aktiviert die Ausgabe der Warnung \fBurl\-anchor\-not\-found\fP.
|
||||
.TP
|
||||
\fB\-C\fP, \fB\-\-cookies\fP
|
||||
Akzeptiere und sende HTTP Cookies nach der RFC 2109. Lediglich Cookies, die
|
||||
zum ursprünglichen Server zurückgesendet werden, werden akzeptiert.
|
||||
Gesendete und akzeptierte Cookies werden als zusätzlicheLoginformation
|
||||
aufgeführt.
|
||||
.TP
|
||||
\fB\-\-cookiefile=\fP\fIDATEINAME\fP
|
||||
Lese eine Datei mit Cookie\-Daten. Das Cookie Datenformat wird weiter unten
|
||||
erklärt.
|
||||
.TP
|
||||
\fB\-\-check\-extern\fP
|
||||
Check external URLs.
|
||||
.TP
|
||||
\fB\-\-ignore\-url=\fP\fIREGEX\fP
|
||||
URLs welche dem angegebenen regulären Ausdruck entsprechen werden ignoriert
|
||||
und nicht geprüft.
|
||||
|
|
@ -215,11 +189,6 @@ Liest ein Passwort von der Kommandozeile und verwende es für HTTP und FTP
|
|||
Autorisierung. Für FTP ist das Standardpasswort \fBanonymous@\fP. Für HTTP gibt
|
||||
es kein Standardpasswort. Siehe auch \fB\-u\fP.
|
||||
.TP
|
||||
\fB\-P\fP\fINUMMER\fP, \fB\-\-pause=\fP\fINUMMER\fP
|
||||
Pausiere die angegebene Anzahl von Sekunden zwischen zwei aufeinander
|
||||
folgenden Verbindungen zum demselben Rechner. Standard ist keine Pause
|
||||
zwischen Verbindungen.
|
||||
.TP
|
||||
\fB\-r\fP\fINUMMER\fP, \fB\-\-recursion\-level=\fP\fINUMMER\fP
|
||||
Prüfe rekursiv alle URLs bis zu der angegebenen Tiefe. Eine negative Tiefe
|
||||
bewirkt unendliche Rekursion. Standard Tiefe ist unendlich.
|
||||
|
|
@ -301,17 +270,13 @@ Eine Cookie\-Datei enthält Standard HTTP\-Header (RFC 2616) mit den folgenden
|
|||
möglichen Namen:
|
||||
.
|
||||
.TP
|
||||
\fBScheme\fP (optional)
|
||||
Setzt das Schema für das die Cookies gültig sind; Standardschema ist
|
||||
\fBhttp\fP.
|
||||
.TP
|
||||
\fBHost\fP (erforderlich)
|
||||
Setzt die Domäne für die die Cookies gültig sind.
|
||||
.TP
|
||||
\fBPath\fP (optional)
|
||||
Gibt den Pfad für den die Cookies gültig sind; Standardpfad ist \fB/\fP.
|
||||
.TP
|
||||
\fBSet\-cookie\fP (optional)
|
||||
\fBSet\-cookie\fP (required)
|
||||
Setzt den Cookie Name/Wert. Kann mehrmals angegeben werden.
|
||||
.PP
|
||||
Mehrere Einträge sind durch eine Leerzeile zu trennen.
|
||||
|
|
@ -325,7 +290,6 @@ Das untige Beispiel sendet zwei Cookies zu allen URLs die mit
|
|||
Set\-cookie: ID="smee"
|
||||
Set\-cookie: spam="egg"
|
||||
|
||||
Scheme: https
|
||||
Host: example.org
|
||||
Set\-cookie: baggage="elitist"; comment="hologram"
|
||||
|
||||
|
|
@ -362,12 +326,10 @@ beschrieben.
|
|||
.
|
||||
.TP
|
||||
HTTP Verknüpfungen (\fBhttp:\fP, \fBhttps:\fP)
|
||||
Nach Verbinden zu dem gegebenen HTTP\-Server wird der eingegebene Pfad oder
|
||||
Query angefordert. Alle Umleitungen werden verfolgt, und falls ein
|
||||
Benutzer/Passwort angegeben wurde werden diese falls notwendig als
|
||||
Authorisierung benutzt. Permanent umgezogene Webseiten werden als Warnung
|
||||
ausgegeben. Alle finalen HTTP Statuscodes, die nicht dem Muster 2xx
|
||||
entsprechen, werden als Fehler ausgegeben.
|
||||
After connecting to the given HTTP server the given path or query is
|
||||
requested. All redirections are followed, and if user/password is given it
|
||||
will be used as authorization when necessary. All final HTTP status codes
|
||||
other than 2xx are errors.
|
||||
.
|
||||
Der Inhalt von HTML\-Seiten wird rekursiv geprüft.
|
||||
.TP
|
||||
|
|
@ -418,6 +380,19 @@ Nicht unterstützte Links (\*(lqjavascript:\*(lq, etc.)
|
|||
Die komplette Liste von erkannten, aber nicht unterstützten Links ist in der
|
||||
Quelldatei \fBlinkcheck/checker/unknownurl.py\fP. Die bekanntesten davon dürften JavaScript\-Links sein.
|
||||
|
||||
.SH PLUGINS
|
||||
There are two plugin types: connection and content plugins.
|
||||
.
|
||||
Connection plugins are run after a successful connection to the URL host.
|
||||
.
|
||||
Content plugins are run if the URL type has content (mailto: URLs have no
|
||||
content for example) and if the check is not forbidden (ie. by HTTP
|
||||
robots.txt).
|
||||
.
|
||||
See \fBlinkchecker \-\-list\-plugins\fP for a list of plugins and their
|
||||
documentation. All plugins are enabled via the \fBlinkcheckerrc\fP(5)
|
||||
configuration file.
|
||||
|
||||
.SH Rekursion
|
||||
Bevor eine URL rekursiv geprüft wird, hat diese mehrere Bedingungen zu
|
||||
erfüllen. Diese werden in folgender Reihenfolge geprüft:
|
||||
|
|
|
|||
|
|
@ -14,52 +14,14 @@ in einem INI\-Format geschrieben.
|
|||
Die Standarddatei ist \fB~/.linkchecker/linkcheckerrc\fP unter Unix\-,
|
||||
\fB%HOMEPATH%\e.linkchecker\elinkcheckerrc\fP unter Windows\-Systemen.
|
||||
.SH EIGENSCHAFTEN
|
||||
|
||||
.SS [checking]
|
||||
.TP
|
||||
\fBanchors=\fP[\fB0\fP|\fB1\fP]
|
||||
Prüfe HTTP Ankerverweise. Standard ist, Ankerverweise nicht zu prüfen. Diese
|
||||
Option aktiviert die Ausgabe der Warnung \fBurl\-anchor\-not\-found\fP.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-anchors\fP
|
||||
.TP
|
||||
\fBcheckcss=\fP[\fB0\fP|\fB1\fP]
|
||||
Prüfe Syntax von CSS URLs mit dem W3C Online Validator.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-check\-css\fP
|
||||
.TP
|
||||
\fBcheckhtml=\fP[\fB0\fP|\fB1\fP]
|
||||
Prüfe Syntax von HTML URLs mit dem W3C Online Validator.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-check\-html\fP
|
||||
.TP
|
||||
\fBclamavconf=\fP\fIDateiname\fP
|
||||
Dateiname von \fBclamd.conf\fP Konfigurationsdatei.
|
||||
.br
|
||||
Kommandozeilenoption: keine
|
||||
.TP
|
||||
\fBcookiefile=\fP\fIDateiname\fP
|
||||
Lese eine Datei mit Cookie\-Daten. Das Cookie Datenformat wird in
|
||||
linkchecker(1) erklärt.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-cookiefile\fP
|
||||
.TP
|
||||
\fBcookies=\fP[\fB0\fP|\fB1\fP]
|
||||
Akzeptiere und sende HTTP cookies.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-cookies\fP
|
||||
.TP
|
||||
\fBdebugmemory=\fP[\fB0\fP|\fB1\fP]
|
||||
Schreibe einen Speicherabzug in eine temporäre Datei wenn die Prüfung
|
||||
endet. Der Speicherabzug wird sowohl beim normalen Beenden der Prüfung als
|
||||
auch wenn die Prüfung abgebrochen wird geschrieben.
|
||||
.br
|
||||
Der Speicherabzug funktioniert nur falls das Paket python\-meliae installiert
|
||||
ist. Andernfalls wird eine Warnung angezeigt mit dem Hinweis dieses Paket zu
|
||||
installieren.
|
||||
.br
|
||||
Kommandozeilenoption: keine
|
||||
.TP
|
||||
\fBlocalwebroot=\fP\fISTRING\fP
|
||||
Beim Prüfen von absoluten URLs in lokalen Dateien wird das angegebene
|
||||
Wurzelverzeichnis als Basis\-URL benutzt.
|
||||
|
|
@ -78,23 +40,12 @@ korrekte Syntax des Links geprüft.
|
|||
.br
|
||||
Kommandozeilenoption: \fB\-\-nntp\-server\fP
|
||||
.TP
|
||||
\fBpause=\fP\fINUMBER\fP
|
||||
Pausiere die angegebene Anzahl von Sekunden zwischen zwei aufeinander
|
||||
folgenden Verbindungen zum demselben Rechner.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-pause\fP
|
||||
.TP
|
||||
\fBrecursionlevel=\fP\fINUMBER\fP
|
||||
Prüfe rekursiv alle URLs bis zu der angegebenen Tiefe. Eine negative Tiefe
|
||||
bewirkt unendliche Rekursion. Standard Tiefe ist unendlich.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-recursion\-level\fP
|
||||
.TP
|
||||
\fBscanvirus=\fP[\fB0\fP|\fB1\fP]
|
||||
Prüfe Inhalt von URLs auf Viren mit ClamAV.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-scan\-virus\fP
|
||||
.TP
|
||||
\fBthreads=\fP\fINUMBER\fP
|
||||
Generiere nicht mehr als die angegebene Anzahl von Threads. Standard Anzahl
|
||||
von Threads ist 100. Um Threads zu deaktivieren, geben Sie eine nicht
|
||||
|
|
@ -108,6 +59,12 @@ Setze den Timeout für TCP\-Verbindungen in Sekunden. Der Standard Timeout ist
|
|||
.br
|
||||
Kommandozeilenoption: \fB\-\-timeout\fP
|
||||
.TP
|
||||
\fBaborttimeout=\fP\fINUMBER\fP
|
||||
Time to wait for checks to finish after the user aborts the first time (with
|
||||
Ctrl\-C or the abort button). The default abort timeout is 300 seconds.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-timeout\fP
|
||||
.TP
|
||||
\fBuseragent=\fP\fISTRING\fP
|
||||
Gibt den User\-Agent an, der zu HTTP\-Servern geschickt wird,
|
||||
z.B. "Mozilla/4.0". Der Standard ist "LinkChecker/X.Y", wobei X.Y die
|
||||
|
|
@ -115,23 +72,6 @@ aktuelle Version von LinkChecker ist.
|
|||
.br
|
||||
Kommandozeilenoption: \fB\-\-user\-agent\fP
|
||||
.TP
|
||||
\fBwarningregex=\fP=\fIREGEX\fP
|
||||
Definieren Sie einen regulären Ausdruck der eine Warnung ausgibt falls er
|
||||
auf den Inhalt einer geprüften URL zutrifft. Dies gilt nur für gültige
|
||||
Seiten deren Inhalt wir bekommen können.
|
||||
.br
|
||||
Benutzen Sie dies, um nach Seiten zu suchen, welche bestimmte Fehler
|
||||
enthalten, zum Beispiel "Diese Seite wurde entfernt" oder "Oracle
|
||||
Applikationsfehler".
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-warning\-regex\fP
|
||||
.TP
|
||||
\fBwarnsizebytes=\fP\fINUMBER\fP
|
||||
Gebe eine Warnung aus, wenn die Inhaltsgröße bekannt ist und die angegebene
|
||||
Anzahl von Bytes übersteigt.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-warning\-size\-bytes\fP
|
||||
.TP
|
||||
\fBsslverify=\fP[\fB0\fP|\fB1\fP|\fIdateiname\fP]
|
||||
Falls der Wert Null ist werden SSL Zertifikate nicht überprüft. Falls er auf
|
||||
Eins gesetzt wird (der Standard) werden SSL Zertifikate mit der gelieferten
|
||||
|
|
@ -140,15 +80,6 @@ zur Prüfung verwendet.
|
|||
.br
|
||||
Kommandozeilenoption: keine
|
||||
.TP
|
||||
\fBwarnsslcertdaysvalid=\fP\fINUMBER\fP
|
||||
Prüfe ob SSL\-Zertifikate mindestens die angegebene Anzahl an Tagen gültig
|
||||
sind. Die Anzahl darf nicht negativ sein. Falls die Anzahl Null ist wird
|
||||
eine Warnung nur für Zertifikate ausgegeben, die schon abgelaufen sind.
|
||||
.br
|
||||
The Standardanzahl an Tagen ist 14.
|
||||
.br
|
||||
Kommandozeilenoption: keine
|
||||
.TP
|
||||
\fBmaxrunseconds=\fP\fINUMBER\fP
|
||||
Hört nach der angegebenen Anzahl von Sekunden auf, neue URLs zu prüfen. Dies
|
||||
ist dasselbe als wenn der Benutzer nach der gegebenen Anzahl von Sekunden
|
||||
|
|
@ -167,26 +98,11 @@ Standard ist alle URLs anzunehmen und zu prüfen.
|
|||
.br
|
||||
Kommandozeilenoption: keine
|
||||
.TP
|
||||
\fBmaxconnectionshttp=\fP\fINUMBER\fP
|
||||
Maximale Anzahl an HTTP\-Verbindungen.
|
||||
.br
|
||||
Der Standard ist 10.
|
||||
.br
|
||||
Kommandozeilenoption: keine
|
||||
\fBmaxrequestspersecond=\fP\fINUMBER\fP
|
||||
Limit the maximum number of requests per second to one host.
|
||||
.TP
|
||||
\fBmaxconnectionshttps=\fP\fINUMBER\fP
|
||||
Maximale Anzahl an HTTPS\-Verbindungen.
|
||||
.br
|
||||
Der Standard ist 10.
|
||||
.br
|
||||
Kommandozeilenoption: keine
|
||||
.TP
|
||||
\fBmaxconnectionsftp=\fP\fINUMBER\fP
|
||||
Maximale Anzahl an FTP\-Verbindungen.
|
||||
.br
|
||||
Der Standard ist 2.
|
||||
.br
|
||||
Kommandozeilenoption: keine
|
||||
\fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP...]
|
||||
Allowed URL schemes as comma\-separated list.
|
||||
.SS [filtering]
|
||||
.TP
|
||||
\fBignore=\fP\fIREGEX\fP (MULTILINE)
|
||||
|
|
@ -212,6 +128,11 @@ Prüfe URLs die auf den regulären Ausdruck zutreffen, aber führe keine
|
|||
Rekursion durch.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-no\-follow\-url\fP
|
||||
.TP
|
||||
\fBcheckextern=\fP[\fB0\fP|\fB1\fP]
|
||||
Check external links. Default is to check internal links only.
|
||||
.br
|
||||
Command line option: \fB\-\-checkextern\fP
|
||||
.SS [authentication]
|
||||
.TP
|
||||
\fBentry=\fP\fIREGEX\fP \fIBENUTZER\fP [\fIPASSWORT\fP] (MULTILINE)
|
||||
|
|
@ -232,9 +153,8 @@ wird Authentifizierung für http[s] und ftp Verknüpfungen benutzt.
|
|||
Kommandozeilenoption: \fB\-u\fP, \fB\-p\fP
|
||||
.TP
|
||||
\fBloginurl=\fP\fIURL\fP
|
||||
Eine Anmelde\-URL, die vor der Prüfung besucht wird. Benötigt einen Eintrag
|
||||
zur Authentifizierung und impliziert die Benutzung von Cookies, weil die
|
||||
meisten Anmeldungen heutzutage Cookies benutzen.
|
||||
A login URL to be visited before checking. Also needs authentication data
|
||||
set for it.
|
||||
.TP
|
||||
\fBloginuserfield=\fP\fINAME\fP
|
||||
Der Name für das Benutzer CGI\-Feld. Der Standardname ist \fBlogin\fP.
|
||||
|
|
@ -247,12 +167,6 @@ Optional zusätzliche CGI Namen/Werte\-Paare. Die Default\-Werte werden
|
|||
automatisch übermittelt.
|
||||
.SS [output]
|
||||
.TP
|
||||
\fBcomplete=\fP[\fB0\fP|\fB1\fP]
|
||||
Falls gesetzt, gebe alle geprüften URLs aus, sogar Duplikate. Standard ist
|
||||
es, URLs nur einmal auszugeben.
|
||||
.br
|
||||
Kommandozeilenoption: \fB\-\-complete\fP
|
||||
.TP
|
||||
\fBdebug=\fP\fISTRING\fP[\fB,\fP\fISTRING\fP...]
|
||||
Gebe Testmeldungen aus für den angegebenen Logger. Verfügbare Logger sind
|
||||
\fBcmdline\fP, \fBchecking\fP,\fBcache\fP, \fBgui\fP, \fBdns\fP, \fBthread\fP und \fBall\fP. Die
|
||||
|
|
@ -528,6 +442,52 @@ ignoriert, müssen aber eingerückt sein.
|
|||
|
||||
[filtering]
|
||||
ignorewarnings=http\-moved\-permanent
|
||||
|
||||
.SH PLUGINS
|
||||
All plugins have a separate section. If the section appears in the
|
||||
configuration file the plugin is enabled. Some plugins read extra options
|
||||
in their section.
|
||||
|
||||
.SS [AnchorCheck]
|
||||
Checks validity of HTML anchors.
|
||||
|
||||
.SS [LocationInfo]
|
||||
Adds the country and if possible city name of the URL host as info. Needs
|
||||
GeoIP or pygeoip and a local country or city lookup DB installed.
|
||||
|
||||
.SS [RegexCheck]
|
||||
Define a regular expression which prints a warning if it matches any content
|
||||
of the checked link. This applies only to valid pages, so we can get their
|
||||
content.
|
||||
|
||||
Use this to check for pages that contain some form of error message, for
|
||||
example 'This page has moved' or 'Oracle Application error'.
|
||||
|
||||
Man beachte, dass mehrere Werte in dem regulären Ausdruck kombiniert
|
||||
werden können, zum Beispiel "(Diese Seite ist umgezogen|Oracle
|
||||
Applikationsfehler)".
|
||||
|
||||
.SS [SslCertificateCheck]
|
||||
Check SSL certificate expiration date. Only internal https: links will be
|
||||
checked. A domain will only be checked once to avoid duplicate warnings.
|
||||
.TP
|
||||
\fBsslcertwarndays=\fP\fINUMBER\fP
|
||||
Configures the expiration warning time in days.
|
||||
|
||||
.SS [HtmlSyntaxCheck]
|
||||
Check the syntax of HTML pages with the online W3C HTML validator. See
|
||||
http://validator.w3.org/docs/api.html.
|
||||
|
||||
.SS [CssSyntaxCheck]
|
||||
Check the syntax of HTML pages with the online W3C CSS validator. See
|
||||
http://jigsaw.w3.org/css\-validator/manual.html#expert.
|
||||
|
||||
.SS [VirusCheck]
|
||||
Checks the page content for virus infections with clamav. A local clamav
|
||||
daemon must be installed.
|
||||
.TP
|
||||
\fBclamavconf=\fP\fIDateiname\fP
|
||||
Dateiname von \fBclamd.conf\fP Konfigurationsdatei.
|
||||
.
|
||||
.SH WARNUNGEN
|
||||
Die folgenden Warnungen werden vom Konfigurationseintrag 'ignorewarnings'
|
||||
|
|
@ -543,57 +503,21 @@ Der file: Pfad ist nicht derselbe wie der Systempfad.
|
|||
\fBftp\-missing\-slash\fP
|
||||
Der ftp: URL fehlt ein abschließender Schrägstrich.
|
||||
.TP
|
||||
\fBhttp\-auth\-unknonwn\fP
|
||||
Nicht unterstützte HTTP Authentifizierungsmethode.
|
||||
.TP
|
||||
\fBhttp\-cookie\-store\-error\fP
|
||||
Ein Fehler trat auf während des Speicherns eines Cookies.
|
||||
.TP
|
||||
\fBhttp\-decompress\-error\fP
|
||||
Ein Fehler trat beim Dekomprimieren des URL Inhalts auf.
|
||||
.TP
|
||||
\fBhttp\-empty\-content\fP
|
||||
Die URL besitzt keinen Inhalt.
|
||||
.TP
|
||||
\fBhttp\-moved\-permanent\fP
|
||||
Die URL wurde dauerhaft verschoben.
|
||||
.TP
|
||||
\fBhttp\-robots\-denied\fP
|
||||
Die http: URL\-Überprüfung wurde verweigert.
|
||||
.TP
|
||||
\fBhttp\-unsupported\-encoding\fP
|
||||
Der URL\-Inhalt ist in einer unbekannten Kodierung verfasst.
|
||||
.TP
|
||||
\fBhttp\-wrong\-redirect\fP
|
||||
Die URL wurde zu einem anderen URL\-Typ umgeleitet.
|
||||
.TP
|
||||
\fBhttps\-certificate\-error\fP
|
||||
Das SSL\-Zertifikat ist ungültig oder abgelaufen.
|
||||
.TP
|
||||
\fBignore\-url\fP
|
||||
Die URL wurde ignoriert.
|
||||
.TP
|
||||
\fBmail\-no\-connection\fP
|
||||
Es konnte keine Verbindung zu einem MX\-Rechner hergestellt werden.
|
||||
.TP
|
||||
\fBmail\-no\-mx\-host\fP
|
||||
Der MX Mail\-Rechner konnte nicht gefunden werden.
|
||||
.TP
|
||||
\fBmail\-unverified\-address\fP
|
||||
Die mailto: Addresse konnte nicht überprüft werden.
|
||||
.TP
|
||||
\fBnntp\-no\-newsgroup\fP
|
||||
Die NNTP Nachrichtengruppe konnte nicht gefunden werden.
|
||||
.TP
|
||||
\fBnntp\-no\-server\fP
|
||||
Es wurde kein NNTP Server gefunden.
|
||||
.TP
|
||||
\fBurl\-anchor\-not\-found\fP
|
||||
URL Anker wurde nicht gefunden.
|
||||
.TP
|
||||
\fBurl\-content\-size\-unequal\fP
|
||||
Der URL Inhaltsgrößenangabe und die Download\-Größe sind unterschiedlich.
|
||||
.TP
|
||||
\fBurl\-content\-size\-zero\fP
|
||||
Der URL Inhaltsgrößenangabe ist Null.
|
||||
.TP
|
||||
|
|
@ -609,9 +533,6 @@ Konnte den Inhalt der URL nicht bekommen.
|
|||
\fBurl\-obfuscated\-ip\fP
|
||||
Die IP\-Adresse ist verschleiert.
|
||||
.TP
|
||||
\fBurl\-warnregex\-found\fP
|
||||
Der reguläre Ausdruck für Warnungen wurde in den URL Inhalten gefunden.
|
||||
.TP
|
||||
\fBurl\-whitespace\fP
|
||||
Die URL %(url)s enthält Leerzeichen am Anfang oder Ende.
|
||||
|
||||
|
|
|
|||
|
|
@ -33,15 +33,14 @@ Antivirus check
|
|||
.IP \(bu
|
||||
a command line, GUI and web interface
|
||||
.SH EXAMPLES
|
||||
The most common use checks the given domain recursively, plus any
|
||||
URL pointing outside of the domain:
|
||||
\fBlinkchecker http://www.example.net/\fP
|
||||
The most common use checks the given domain recursively:
|
||||
\fBlinkchecker http://www.example.com/\fP
|
||||
.br
|
||||
Beware that this checks the whole site which can have thousands of URLs.
|
||||
Use the \fB\-r\fP option to restrict the recursion depth.
|
||||
.br
|
||||
Don't check \fBmailto:\fP URLs. All other links are checked as usual:
|
||||
\fBlinkchecker \-\-ignore\-url=^mailto: mysite.example.org\fP
|
||||
Don't check URLs with \fB/secret\fP in its name. All other links are checked as usual:
|
||||
\fBlinkchecker \-\-ignore\-url=/secret mysite.example.com\fP
|
||||
.br
|
||||
Checking a local HTML file on Unix:
|
||||
\fBlinkchecker ../bla.html\fP
|
||||
|
|
@ -53,7 +52,7 @@ You can skip the \fBhttp://\fP url part if the domain starts with \fBwww.\fP:
|
|||
\fBlinkchecker www.example.com\fP
|
||||
.br
|
||||
You can skip the \fBftp://\fP url part if the domain starts with \fBftp.\fP:
|
||||
\fBlinkchecker \-r0 ftp.example.org\fP
|
||||
\fBlinkchecker \-r0 ftp.example.com\fP
|
||||
.br
|
||||
Generate a sitemap graph and convert it with the graphviz dot utility:
|
||||
\fBlinkchecker \-odot \-v www.example.com | dot \-Tps > sitemap.ps\fP
|
||||
|
|
@ -77,18 +76,12 @@ of threads is 100. To disable threading specify a non-positive number.
|
|||
.TP
|
||||
\fB\-V\fP, \fB\-\-version\fP
|
||||
Print version and exit.
|
||||
.TP
|
||||
\fB\-\-list\-plugins\fP
|
||||
Print available check plugins and exit.
|
||||
.
|
||||
.SS Output options
|
||||
.TP
|
||||
\fB\-\-check\-css\fP
|
||||
Check syntax of CSS URLs with the W3C online validator.
|
||||
.TP
|
||||
\fB\-\-check\-html\fP
|
||||
Check syntax of HTML URLs with the W3C online validator.
|
||||
.TP
|
||||
\fB\-\-complete\fP
|
||||
Log all URLs, including duplicates. Default is to log duplicate URLs only once.
|
||||
.TP
|
||||
\fB\-D\fP\fISTRING\fP, \fB\-\-debug=\fP\fISTRING\fP
|
||||
Print debugging output for the given logger.
|
||||
Available loggers are \fBcmdline\fP, \fBchecking\fP,
|
||||
|
|
@ -139,12 +132,6 @@ that of your locale. Valid encodings are listed at
|
|||
Quiet operation, an alias for \fB\-o none\fP.
|
||||
This is only useful with \fB\-F\fP.
|
||||
.TP
|
||||
\fB\-\-scan\-virus\fP
|
||||
Scan content of URLs for viruses with ClamAV.
|
||||
.TP
|
||||
\fB\-\-trace\fP
|
||||
Print tracing information.
|
||||
.TP
|
||||
\fB\-v\fP, \fB\-\-verbose\fP
|
||||
Log all checked URLs. Default is to log only errors and warnings.
|
||||
.TP
|
||||
|
|
@ -160,27 +147,15 @@ Note that multiple values can be combined in the regular expression,
|
|||
for example "(This page has moved|Oracle Application error)".
|
||||
.br
|
||||
See section \fBREGULAR EXPRESSIONS\fP for more info.
|
||||
.TP
|
||||
\fB\-\-warning\-size\-bytes=\fP\fINUMBER\fP
|
||||
Print a warning if content size info is available and exceeds the given
|
||||
number of \fIbytes\fP.
|
||||
.
|
||||
.SS Checking options
|
||||
.TP
|
||||
\fB\-a\fP, \fB\-\-anchors\fP
|
||||
Check HTTP anchor references. Default is not to check anchors.
|
||||
This option enables logging of the warning \fBurl\-anchor\-not\-found\fP.
|
||||
.TP
|
||||
\fB\-C\fP, \fB\-\-cookies\fP
|
||||
Accept and send HTTP cookies according to RFC 2109. Only cookies
|
||||
which are sent back to the originating server are accepted.
|
||||
Sent and accepted cookies are provided as additional logging
|
||||
information.
|
||||
.TP
|
||||
\fB\-\-cookiefile=\fP\fIFILENAME\fP
|
||||
Read a file with initial cookie data. The cookie data
|
||||
format is explained below.
|
||||
.TP
|
||||
\fB\-\-check\-extern
|
||||
Check external URLs.
|
||||
.TP
|
||||
\fB\-\-ignore\-url=\fP\fIREGEX\fP
|
||||
URLs matching the given regular expression will be ignored and not checked.
|
||||
.br
|
||||
|
|
@ -206,10 +181,6 @@ Read a password from console and use it for HTTP and FTP authorization.
|
|||
For FTP the default password is \fBanonymous@\fP. For HTTP there is
|
||||
no default password. See also \fB\-u\fP.
|
||||
.TP
|
||||
\fB\-P\fP\fINUMBER\fP, \fB\-\-pause=\fP\fINUMBER\fP
|
||||
Pause the given number of seconds between two subsequent connection
|
||||
requests to the same host. Default is no pause between requests.
|
||||
.TP
|
||||
\fB\-r\fP\fINUMBER\fP, \fB\-\-recursion\-level=\fP\fINUMBER\fP
|
||||
Check recursively all links up to given depth.
|
||||
A negative depth will enable infinite recursion.
|
||||
|
|
@ -291,16 +262,13 @@ A cookie file contains standard HTTP header (RFC 2616) data with the
|
|||
following possible names:
|
||||
.
|
||||
.TP
|
||||
\fBScheme\fP (optional)
|
||||
Sets the scheme the cookies are valid for; default scheme is \fBhttp\fP.
|
||||
.TP
|
||||
\fBHost\fP (required)
|
||||
Sets the domain the cookies are valid for.
|
||||
.TP
|
||||
\fBPath\fP (optional)
|
||||
Gives the path the cookies are value for; default path is \fB/\fP.
|
||||
.TP
|
||||
\fBSet-cookie\fP (optional)
|
||||
\fBSet-cookie\fP (required)
|
||||
Set cookie name/value. Can be given more than once.
|
||||
.PP
|
||||
Multiple entries are separated by a blank line.
|
||||
|
|
@ -314,7 +282,6 @@ with \fBhttps://example.org/\fP:
|
|||
Set-cookie: ID="smee"
|
||||
Set-cookie: spam="egg"
|
||||
|
||||
Scheme: https
|
||||
Host: example.org
|
||||
Set-cookie: baggage="elitist"; comment="hologram"
|
||||
|
||||
|
|
@ -353,7 +320,6 @@ After connecting to the given HTTP server the given path
|
|||
or query is requested. All redirections are followed, and
|
||||
if user/password is given it will be used as authorization
|
||||
when necessary.
|
||||
Permanently moved pages issue a warning.
|
||||
All final HTTP status codes other than 2xx are errors.
|
||||
.
|
||||
HTML page contents are checked for recursion.
|
||||
|
|
@ -412,6 +378,20 @@ Unsupported links (``javascript:``, etc.)
|
|||
in the \fBlinkcheck/checker/unknownurl.py\fP source file.
|
||||
The most prominent of them should be JavaScript links.
|
||||
|
||||
.SH PLUGINS
|
||||
There are two plugin types: connection and content plugins.
|
||||
.
|
||||
Connection plugins are run after a successful connection to the
|
||||
URL host.
|
||||
.
|
||||
Content plugins are run if the URL type has content
|
||||
(mailto: URLs have no content for example) and if the check is not
|
||||
forbidden (ie. by HTTP robots.txt).
|
||||
.
|
||||
See \fBlinkchecker \-\-list\-plugins\fP for a list of plugins and
|
||||
their documentation. All plugins are enabled via the \fBlinkcheckerrc\fP(5)
|
||||
configuration file.
|
||||
|
||||
.SH RECURSION
|
||||
Before descending recursively into a URL, it has to fulfill several
|
||||
conditions. They are checked in this order:
|
||||
|
|
|
|||
|
|
@ -9,51 +9,14 @@ The file is written in an INI-style format.
|
|||
The default file location is \fB~/.linkchecker/linkcheckerrc\fP on Unix,
|
||||
\fB%HOMEPATH%\\.linkchecker\\linkcheckerrc\fP on Windows systems.
|
||||
.SH SETTINGS
|
||||
|
||||
.SS \fB[checking]\fP
|
||||
.TP
|
||||
\fBanchors=\fP[\fB0\fP|\fB1\fP]
|
||||
Check HTTP anchor references. Default is not to check anchors.
|
||||
This option enables logging of the warning \fBurl\-anchor\-not\-found\fP.
|
||||
.br
|
||||
Command line option: \fB\-\-anchors\fP
|
||||
.TP
|
||||
\fBcheckcss=\fP[\fB0\fP|\fB1\fP]
|
||||
Check syntax of CSS URLs with the W3C online validator.
|
||||
.br
|
||||
Command line option: \fB\-\-check\-css\fP
|
||||
.TP
|
||||
\fBcheckhtml=\fP[\fB0\fP|\fB1\fP]
|
||||
Check syntax of HTML URLs with the W3C online validator.
|
||||
.br
|
||||
Command line option: \fB\-\-check\-html\fP
|
||||
.TP
|
||||
\fBclamavconf=\fP\fIfilename\fP
|
||||
Filename of \fBclamd.conf\fP config file.
|
||||
.br
|
||||
Command line option: none
|
||||
.TP
|
||||
\fBcookiefile=\fP\fIfilename\fP
|
||||
Read a file with initial cookie data. The cookie data
|
||||
format is explained in linkchecker(1).
|
||||
.br
|
||||
Command line option: \fB\-\-cookiefile\fP
|
||||
.TP
|
||||
\fBcookies=\fP[\fB0\fP|\fB1\fP]
|
||||
Accept and send HTTP cookies.
|
||||
.br
|
||||
Command line option: \fB\-\-cookies\fP
|
||||
.TP
|
||||
\fBdebugmemory=\fP[\fB0\fP|\fB1\fP]
|
||||
When checking finishes, write a memory dump to a temporary file.
|
||||
The memory dump is written both when checking finishes normally
|
||||
and when checking gets canceled.
|
||||
.br
|
||||
The memory dump only works if the python-meliae package is installed.
|
||||
Otherwise a warning is printed to install it.
|
||||
.br
|
||||
Command line option: none
|
||||
.TP
|
||||
\fBlocalwebroot=\fP\fISTRING\fP
|
||||
When checking absolute URLs inside local files, the given root directory
|
||||
is used as base URL.
|
||||
|
|
@ -71,12 +34,6 @@ only the syntax of the link is checked.
|
|||
.br
|
||||
Command line option: \fB\-\-nntp\-server\fP
|
||||
.TP
|
||||
\fBpause=\fP\fINUMBER\fP
|
||||
Pause the given number of seconds between two subsequent connection
|
||||
requests to the same host.
|
||||
.br
|
||||
Command line option: \fB\-\-pause\fP
|
||||
.TP
|
||||
\fBrecursionlevel=\fP\fINUMBER\fP
|
||||
Check recursively all links up to given depth.
|
||||
A negative depth will enable infinite recursion.
|
||||
|
|
@ -84,11 +41,6 @@ Default depth is infinite.
|
|||
.br
|
||||
Command line option: \fB\-\-recursion\-level\fP
|
||||
.TP
|
||||
\fBscanvirus=\fP[\fB0\fP|\fB1\fP]
|
||||
Scan content of URLs for viruses with ClamAV.
|
||||
.br
|
||||
Command line option: \fB\-\-scan\-virus\fP
|
||||
.TP
|
||||
\fBthreads=\fP\fINUMBER\fP
|
||||
Generate no more than the given number of threads. Default number
|
||||
of threads is 100. To disable threading specify a non-positive number.
|
||||
|
|
@ -101,6 +53,13 @@ is 60 seconds.
|
|||
.br
|
||||
Command line option: \fB\-\-timeout\fP
|
||||
.TP
|
||||
\fBaborttimeout=\fP\fINUMBER\fP
|
||||
Time to wait for checks to finish after the user aborts the first time
|
||||
(with Ctrl-C or the abort button).
|
||||
The default abort timeout is 300 seconds.
|
||||
.br
|
||||
Command line option: \fB\-\-timeout\fP
|
||||
.TP
|
||||
\fBuseragent=\fP\fISTRING\fP
|
||||
Specify the User-Agent string to send to the HTTP server, for example
|
||||
"Mozilla/4.0". The default is "LinkChecker/X.Y" where X.Y is the current
|
||||
|
|
@ -108,22 +67,6 @@ version of LinkChecker.
|
|||
.br
|
||||
Command line option: \fB\-\-user\-agent\fP
|
||||
.TP
|
||||
\fBwarningregex=\fP=\fIREGEX\fP
|
||||
Define a regular expression which prints a warning if it matches any
|
||||
content of the checked link.
|
||||
This applies only to valid pages, so we can get their content.
|
||||
.br
|
||||
Use this to check for pages that contain some form of error, for example
|
||||
"This page has moved" or "Oracle Application Server error".
|
||||
.br
|
||||
Command line option: \fB\-\-warning\-regex\fP
|
||||
.TP
|
||||
\fBwarnsizebytes=\fP\fINUMBER\fP
|
||||
Print a warning if content size info is available and exceeds the given
|
||||
number of \fIbytes\fP.
|
||||
.br
|
||||
Command line option: \fB\-\-warning\-size\-bytes\fP
|
||||
.TP
|
||||
\fBsslverify=\fP[\fB0\fP|\fB1\fP|\fIfilename\fP]
|
||||
If set to zero disables SSL certificate checking.
|
||||
If set to one (the default) enables SSL certificate checking with
|
||||
|
|
@ -132,16 +75,6 @@ will be used as the certificate file.
|
|||
.br
|
||||
Command line option: none
|
||||
.TP
|
||||
\fBwarnsslcertdaysvalid=\fP\fINUMBER\fP
|
||||
Check that SSL certificates are at least the given number of days valid.
|
||||
The number must not be negative.
|
||||
If the number of days is zero a warning is printed only for certificates
|
||||
that are already expired.
|
||||
.br
|
||||
The default number of days is 14.
|
||||
.br
|
||||
Command line option: none
|
||||
.TP
|
||||
\fBmaxrunseconds=\fP\fINUMBER\fP
|
||||
Stop checking new URLs after the given number of seconds. Same as if the
|
||||
user stops (by hitting Ctrl-C or clicking the abort buttin in the GUI)
|
||||
|
|
@ -159,26 +92,11 @@ The default is to queue and check all URLs.
|
|||
.br
|
||||
Command line option: none
|
||||
.TP
|
||||
\fBmaxconnectionshttp=\fP\fINUMBER\fP
|
||||
Maximum number of connections to HTTP servers.
|
||||
.br
|
||||
The default is 10.
|
||||
.br
|
||||
Command line option: none
|
||||
\fBmaxrequestspersecond=\fP\fINUMBER\fP
|
||||
Limit the maximum number of requests per second to one host.
|
||||
.TP
|
||||
\fBmaxconnectionshttps=\fP\fINUMBER\fP
|
||||
Maximum number of connections to HTTPS servers.
|
||||
.br
|
||||
The default is 10.
|
||||
.br
|
||||
Command line option: none
|
||||
.TP
|
||||
\fBmaxconnectionsftp=\fP\fINUMBER\fP
|
||||
Maximum number of connections to FTP servers.
|
||||
.br
|
||||
The default is 2.
|
||||
.br
|
||||
Command line option: none
|
||||
\fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP...]
|
||||
Allowed URL schemes as comma-separated list.
|
||||
.SS \fB[filtering]\fP
|
||||
.TP
|
||||
\fBignore=\fP\fIREGEX\fP (MULTILINE)
|
||||
|
|
@ -203,6 +121,11 @@ Check but do not recurse into URLs matching the given regular
|
|||
expressions.
|
||||
.br
|
||||
Command line option: \fB\-\-no\-follow\-url\fP
|
||||
.TP
|
||||
\fBcheckextern=\fP[\fB0\fP|\fB1\fP]
|
||||
Check external links. Default is to check internal links only.
|
||||
.br
|
||||
Command line option: \fB\-\-checkextern\fP
|
||||
.SS \fB[authentication]\fP
|
||||
.TP
|
||||
\fBentry=\fP\fIREGEX\fP \fIUSER\fP [\fIPASS\fP] (MULTILINE)
|
||||
|
|
@ -224,8 +147,7 @@ Command line option: \fB\-u\fP, \fB\-p\fP
|
|||
.TP
|
||||
\fBloginurl=\fP\fIURL\fP
|
||||
A login URL to be visited before checking. Also needs authentication
|
||||
data set for it, and implies using cookies because most logins use
|
||||
cookies nowadays.
|
||||
data set for it.
|
||||
.TP
|
||||
\fBloginuserfield=\fP\fISTRING\fP
|
||||
The name of the user CGI field. Default name is \fBlogin\fP.
|
||||
|
|
@ -238,12 +160,6 @@ Optionally any additional CGI name/value pairs. Note that the default
|
|||
values are submitted automatically.
|
||||
.SS \fB[output]\fP
|
||||
.TP
|
||||
\fBcomplete=\fP[\fB0\fP|\fB1\fP]
|
||||
If set log all checked URLs, even duplicates. Default is to log
|
||||
duplicate URLs only once.
|
||||
.br
|
||||
Command line option: \fB\-\-complete\fP
|
||||
.TP
|
||||
\fBdebug=\fP\fISTRING\fP[\fB,\fP\fISTRING\fP...]
|
||||
Print debugging output for the given loggers.
|
||||
Available loggers are \fBcmdline\fP, \fBchecking\fP,
|
||||
|
|
@ -524,6 +440,53 @@ though they must still be indented.
|
|||
|
||||
[filtering]
|
||||
ignorewarnings=http-moved-permanent
|
||||
|
||||
.SH PLUGINS
|
||||
All plugins have a separate section. If the section
|
||||
appears in the configuration file the plugin is enabled.
|
||||
Some plugins read extra options in their section.
|
||||
|
||||
.SS \fB[AnchorCheck]\fP
|
||||
Checks validity of HTML anchors.
|
||||
|
||||
.SS \fB[LocationInfo]\fP
|
||||
Adds the country and if possible city name of the URL host as info.
|
||||
Needs GeoIP or pygeoip and a local country or city lookup DB installed.
|
||||
|
||||
.SS \fB[RegexCheck]\fP
|
||||
Define a regular expression which prints a warning if it matches
|
||||
any content of the checked link. This applies only to valid pages,
|
||||
so we can get their content.
|
||||
|
||||
Use this to check for pages that contain some form of error
|
||||
message, for example 'This page has moved' or 'Oracle
|
||||
Application error'.
|
||||
|
||||
Note that multiple values can be combined in the regular expression,
|
||||
for example "(This page has moved|Oracle Application error)".
|
||||
|
||||
.SS \fB[SslCertificateCheck]\fP
|
||||
Check SSL certificate expiration date. Only internal https: links
|
||||
will be checked. A domain will only be checked once to avoid duplicate
|
||||
warnings.
|
||||
.TP
|
||||
\fBsslcertwarndays=\fP\fINUMBER\fP
|
||||
Configures the expiration warning time in days.
|
||||
|
||||
.SS \fB[HtmlSyntaxCheck]\fP
|
||||
Check the syntax of HTML pages with the online W3C HTML validator.
|
||||
See http://validator.w3.org/docs/api.html.
|
||||
|
||||
.SS \fB[CssSyntaxCheck]\fP
|
||||
Check the syntax of HTML pages with the online W3C CSS validator.
|
||||
See http://jigsaw.w3.org/css-validator/manual.html#expert.
|
||||
|
||||
.SS \fB[VirusCheck]\fP
|
||||
Checks the page content for virus infections with clamav.
|
||||
A local clamav daemon must be installed.
|
||||
.TP
|
||||
\fBclamavconf=\fP\fIfilename\fP
|
||||
Filename of \fBclamd.conf\fP config file.
|
||||
.
|
||||
.SH WARNINGS
|
||||
The following warnings are recognized in the 'ignorewarnings' config
|
||||
|
|
@ -539,57 +502,21 @@ The file: path is not the same as the system specific path.
|
|||
\fBftp-missing-slash\fP
|
||||
The ftp: URL is missing a trailing slash.
|
||||
.TP
|
||||
\fBhttp-auth-unknonwn\fP
|
||||
Unsupported HTTP authentication method.
|
||||
.TP
|
||||
\fBhttp-cookie-store-error\fP
|
||||
An error occurred while storing a cookie.
|
||||
.TP
|
||||
\fBhttp-decompress-error\fP
|
||||
An error occurred while decompressing the URL content.
|
||||
.TP
|
||||
\fBhttp-empty-content\fP
|
||||
The URL had no content.
|
||||
.TP
|
||||
\fBhttp-moved-permanent\fP
|
||||
The URL has moved permanently.
|
||||
.TP
|
||||
\fBhttp-robots-denied\fP
|
||||
The http: URL checking has been denied.
|
||||
.TP
|
||||
\fBhttp-unsupported-encoding\fP
|
||||
The URL content is encoded with an unknown encoding.
|
||||
.TP
|
||||
\fBhttp-wrong-redirect\fP
|
||||
The URL has been redirected to an URL of a different type.
|
||||
.TP
|
||||
\fBhttps-certificate-error\fP
|
||||
The SSL certificate is invalid or expired.
|
||||
.TP
|
||||
\fBignore-url\fP
|
||||
The URL has been ignored.
|
||||
.TP
|
||||
\fBmail-no-connection\fP
|
||||
No connection to a MX host could be established.
|
||||
.TP
|
||||
\fBmail-no-mx-host\fP
|
||||
The mail MX host could not be found.
|
||||
.TP
|
||||
\fBmail-unverified-address\fP
|
||||
The mailto: address could not be verified.
|
||||
.TP
|
||||
\fBnntp-no-newsgroup\fP
|
||||
The NNTP newsgroup could not be found.
|
||||
.TP
|
||||
\fBnntp-no-server\fP
|
||||
No NNTP server was found.
|
||||
.TP
|
||||
\fBurl-anchor-not-found\fP
|
||||
URL anchor was not found.
|
||||
.TP
|
||||
\fBurl-content-size-unequal\fP
|
||||
The URL content size and download size are unequal.
|
||||
.TP
|
||||
\fBurl-content-size-zero\fP
|
||||
The URL content size is zero.
|
||||
.TP
|
||||
|
|
@ -605,9 +532,6 @@ Could not get the content of the URL.
|
|||
\fBurl-obfuscated-ip\fP
|
||||
The IP is obfuscated.
|
||||
.TP
|
||||
\fBurl-warnregex-found\fP
|
||||
The warning regular expression was found in the URL contents.
|
||||
.TP
|
||||
\fBurl-whitespace\fP
|
||||
The URL contains leading or trailing whitespace.
|
||||
|
||||
|
|
|
|||
|
|
@ -50,7 +50,9 @@ First, install the required software.
|
|||
On Debian or Ubuntu systems, install the package qt4-dev-tools.
|
||||
On Redhat systems, install the package qt-devel.
|
||||
|
||||
4. *Optional, for bash-completion:*
|
||||
4. Python requests module from https://pypi.python.org/pypi/requests
|
||||
|
||||
5. *Optional, for bash-completion:*
|
||||
argcomplete Python module from https://pypi.python.org/pypi/argcomplete
|
||||
|
||||
6. *Optional, for displaying country codes:*
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,5 +1,43 @@
|
|||
Upgrading
|
||||
=========
|
||||
Migrating from 8.x to 9.0
|
||||
-------------------------
|
||||
The Python requests module is now required.
|
||||
|
||||
Several checks have been moved to plugins (see below).
|
||||
Plugins have to be enabled in the configuration file.
|
||||
|
||||
The following commandline and configuration options have been deprecated
|
||||
and do not have any effect:
|
||||
--anchors, anchors: moved to plugin AnchorCheck
|
||||
--check-css, checkcss: moved to plugin CssSyntaxCheck
|
||||
--check-html, checkhtml: moved to plugin HtmlSyntaxCheck
|
||||
--complete: feature removed
|
||||
--cookies, sendcookies, storecookies: cookies are sent/stored per default
|
||||
--pause, wait: replaced with numrequestspersecond
|
||||
--scan-virus, scanvirus: moved to plugin VirusCheck
|
||||
--warning-regex: moved to plugin RegexCheck
|
||||
--warning-size-bytes, warnsizebytes: feature removed
|
||||
warnsslcertdaysvalid: moved to plugin SslCertificationCheck
|
||||
|
||||
The "html" logger generates HTML5 documents now.
|
||||
|
||||
The following warnings have been removed:
|
||||
- http-auth-unauthorized: removed
|
||||
- http-auth-unknonwn: removed
|
||||
- http-decompress-error: removed
|
||||
- http-robots-denied: downgraded to info
|
||||
- http-moved-permanent: downgraded to info
|
||||
- http-unsupported-encoding: removed
|
||||
- https-certificate-error: is an error now
|
||||
- mail-unverified-address: removed
|
||||
- mail-no-connection: removed
|
||||
- syntax-css: moved to plugin
|
||||
- syntax-html: moved to plugin
|
||||
- url-anchor-not-found: moved to plugin
|
||||
- url-content-size-unequal: removed
|
||||
- url-warnregex-found: moved to plugin
|
||||
|
||||
Migrating from 8.4 to 8.5
|
||||
--------------------------
|
||||
Custom output loggers have been changed.
|
||||
|
|
|
|||
|
|
@ -21,8 +21,9 @@ Features
|
|||
- honors robots.txt exclusion protocol
|
||||
- Cookie support
|
||||
- HTML5 support
|
||||
- HTML and CSS syntax check
|
||||
- Antivirus check
|
||||
- [Plugin support](plugins.html)
|
||||
allowing custom page checks. Currently available are
|
||||
HTML and CSS syntax checks, Antivirus checks, and more.
|
||||
- Different interfaces: command line, GUI and web interface
|
||||
- ... and a lot more check options documented in the
|
||||
[manual page](man1/linkchecker.1.html).
|
||||
|
|
|
|||
11
doc/web/content/plugins.md
Normal file
11
doc/web/content/plugins.md
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
title: Plugin support
|
||||
---
|
||||
|
||||
Plugin documentation
|
||||
=====================
|
||||
|
||||
Standard plugins
|
||||
=================
|
||||
|
||||
Custom plugins
|
||||
===============
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -2612,7 +2612,7 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
|
|||
#define YY_MORE_ADJ 0
|
||||
#define YY_RESTORE_YY_MORE_OFFSET
|
||||
#line 1 "htmllex.l"
|
||||
/* Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
/* Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -2951,6 +2951,10 @@ int yyget_lineno (yyscan_t yyscanner );
|
|||
|
||||
void yyset_lineno (int line_number ,yyscan_t yyscanner );
|
||||
|
||||
int yyget_column (yyscan_t yyscanner );
|
||||
|
||||
void yyset_column (int column_no ,yyscan_t yyscanner );
|
||||
|
||||
/* %if-bison-bridge */
|
||||
|
||||
YYSTYPE * yyget_lval (yyscan_t yyscanner );
|
||||
|
|
@ -3132,7 +3136,7 @@ YY_DECL
|
|||
|
||||
|
||||
/*********************** EOF ************************/
|
||||
#line 3135 "htmllex.c"
|
||||
#line 3139 "htmllex.c"
|
||||
|
||||
yylval = yylval_param;
|
||||
|
||||
|
|
@ -4683,7 +4687,7 @@ YY_RULE_SETUP
|
|||
#line 1091 "htmllex.l"
|
||||
ECHO;
|
||||
YY_BREAK
|
||||
#line 4686 "htmllex.c"
|
||||
#line 4690 "htmllex.c"
|
||||
|
||||
case YY_END_OF_BUFFER:
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
/* Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
/* Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2009 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@
|
|||
/* Line 268 of yacc.c */
|
||||
#line 1 "htmlparse.y"
|
||||
|
||||
/* Copyright (C) 2000-2011 Bastian Kleineidam
|
||||
/* Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
%{
|
||||
/* Copyright (C) 2000-2011 Bastian Kleineidam
|
||||
/* Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
/* Copyright (C) 2000-2010 Bastian Kleineidam
|
||||
/* Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -68,12 +68,14 @@ LOG_CHECK = "linkcheck.check"
|
|||
LOG_CACHE = "linkcheck.cache"
|
||||
LOG_GUI = "linkcheck.gui"
|
||||
LOG_THREAD = "linkcheck.thread"
|
||||
LOG_PLUGIN = "linkcheck.plugin"
|
||||
lognames = {
|
||||
"cmdline": LOG_CMDLINE,
|
||||
"checking": LOG_CHECK,
|
||||
"cache": LOG_CACHE,
|
||||
"gui": LOG_GUI,
|
||||
"thread": LOG_THREAD,
|
||||
"plugin": LOG_PLUGIN,
|
||||
"all": LOG_ROOT,
|
||||
}
|
||||
lognamelist = ", ".join(repr(name) for name in lognames)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2010-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
2
linkcheck/cache/__init__.py
vendored
2
linkcheck/cache/__init__.py
vendored
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2009 Bastian Kleineidam
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
223
linkcheck/cache/connection.py
vendored
223
linkcheck/cache/connection.py
vendored
|
|
@ -1,223 +0,0 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2005-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Store and retrieve open connections.
|
||||
"""
|
||||
|
||||
import time
|
||||
from .. import log, LOG_CACHE
|
||||
from ..decorators import synchronized
|
||||
from ..lock import get_lock, get_semaphore
|
||||
from ..containers import enum
|
||||
|
||||
_lock = get_lock("connection")
|
||||
_wait_lock = get_lock("connwait")
|
||||
|
||||
ConnectionTypes = ("ftp", "http", "https")
|
||||
ConnectionState = enum("available", "busy")
|
||||
|
||||
|
||||
def get_connection_id(connection):
|
||||
"""Return unique id for connection object."""
|
||||
return id(connection)
|
||||
|
||||
|
||||
def is_expired(curtime, conn_data):
|
||||
"""Test if connection is expired."""
|
||||
return (curtime+5.0) >= conn_data[2]
|
||||
|
||||
|
||||
class ConnectionPool (object):
|
||||
"""Thread-safe cache, storing a set of connections for URL retrieval."""
|
||||
|
||||
def __init__ (self, limits, wait=0):
|
||||
"""
|
||||
Initialize an empty connection dictionary which will have the form:
|
||||
{(type, host, port) -> (lock, {id -> [connection, state, expiration time]})}
|
||||
|
||||
Connection can be any open connection object (HTTP, FTP, ...).
|
||||
State is of type ConnectionState (either 'available' or 'busy').
|
||||
Expiration time is the point of time in seconds when this
|
||||
connection will be timed out.
|
||||
|
||||
The type is the connection type and an either 'ftp' or 'http'.
|
||||
The host is the hostname as string, port the port number as an integer.
|
||||
|
||||
For each type, the maximum number of connections to one single host is defined
|
||||
in limits.
|
||||
"""
|
||||
# open connections
|
||||
self.connections = {}
|
||||
# {host -> due time}
|
||||
self.times = {}
|
||||
# {host -> wait}
|
||||
self.host_waits = {}
|
||||
if wait < 0:
|
||||
raise ValueError("negative wait value %d" % wait)
|
||||
self.wait = wait
|
||||
# {connection type -> max number of connections to one host}
|
||||
self.limits = limits
|
||||
|
||||
@synchronized(_wait_lock)
|
||||
def host_wait (self, host, wait):
|
||||
"""Set a host specific time to wait between requests."""
|
||||
if wait < 0:
|
||||
raise ValueError("negative wait value %d" % wait)
|
||||
self.host_waits[host] = wait
|
||||
|
||||
@synchronized(_wait_lock)
|
||||
def wait_for_host (self, host):
|
||||
"""Honor wait time for given host."""
|
||||
t = time.time()
|
||||
if host in self.times:
|
||||
due_time = self.times[host]
|
||||
if due_time > t:
|
||||
wait = due_time - t
|
||||
log.debug(LOG_CACHE,
|
||||
"waiting for %.01f seconds on connection to %s", wait, host)
|
||||
time.sleep(wait)
|
||||
t = time.time()
|
||||
self.times[host] = t + self.host_waits.get(host, self.wait)
|
||||
|
||||
def _add (self, type, host, port, create_connection):
|
||||
"""Add connection to the pool with given parameters.
|
||||
|
||||
@param type: the connection scheme (eg. http)
|
||||
@ptype type: string
|
||||
@param host: the hostname
|
||||
@ptype host: string
|
||||
@param port: the port number
|
||||
@ptype port: int
|
||||
@param create_connection: function to create a new connection object
|
||||
@ptype create_connection: callable
|
||||
@return: newly created connection
|
||||
@rtype: HTTP(S)Connection or FTPConnection
|
||||
"""
|
||||
self.wait_for_host(host)
|
||||
connection = create_connection(type, host, port)
|
||||
cid = get_connection_id(connection)
|
||||
expiration = None
|
||||
conn_data = [connection, 'busy', expiration]
|
||||
key = (type, host, port)
|
||||
if key in self.connections:
|
||||
lock, entries = self.connections[key]
|
||||
entries[cid] = conn_data
|
||||
else:
|
||||
lock = get_semaphore("%s:%d" % (host, port), self.limits[type])
|
||||
lock.acquire()
|
||||
log.debug(LOG_CACHE, "Acquired lock for %s://%s:%d" % key)
|
||||
entries = {cid: conn_data}
|
||||
self.connections[key] = (lock, entries)
|
||||
return connection
|
||||
|
||||
@synchronized(_lock)
def get (self, type, host, port, create_connection):
    """Get open connection if available or create a new one.

    @param type: connection type
    @ptype type: ConnectionType
    @param host: hostname
    @ptype host: string
    @param port: port number
    @ptype port: int
    @param create_connection: factory called to open a new connection
        when none can be reused
    @ptype create_connection: callable
    @return: Open connection object or None if none is available.
    @rtype None or FTPConnection or HTTP(S)Connection
    """
    assert type in ConnectionTypes, 'invalid type %r' % type
    # 65536 == 2**16
    assert 0 < port < 65536, 'invalid port number %r' % port
    key = (type, host, port)
    if key not in self.connections:
        # no pool entry for this target yet: create connection and slot
        return self._add(type, host, port, create_connection)
    lock, entries = self.connections[key]
    # non-blocking acquire: failure means the per-host connection
    # limit is currently reached
    if not lock.acquire(False):
        log.debug(LOG_CACHE, "wait for %s connection to %s:%d",
          type, host, port)
        # NOTE(review): returns the semaphore object although the
        # docstring promises None -- presumably the caller blocks on
        # the returned lock; confirm against the call sites.
        return lock
    log.debug(LOG_CACHE, "Acquired lock for %s://%s:%d" % key)
    # either a connection is available or a new one can be created
    t = time.time()
    delete_entries = []
    try:
        for id, conn_data in entries.items():
            # conn_data is [connection, state, expiration]
            if conn_data[1] == ConnectionState.available:
                if is_expired(t, conn_data):
                    # expired: schedule for removal, keep scanning
                    delete_entries.append(id)
                else:
                    # reusable: mark busy so no other caller grabs it
                    conn_data[1] = ConnectionState.busy
                    log.debug(LOG_CACHE,
                      "reusing connection %s timing out in %.01f seconds",
                      key, (conn_data[2] - t))
                    return conn_data[0]
    finally:
        # drop expired entries even when returning from inside the loop
        for id in delete_entries:
            del entries[id]
    # make a new connection
    return self._add(type, host, port, create_connection)
|
||||
|
||||
@synchronized(_lock)
def release (self, type, host, port, connection, expiration=None):
    """Hand a used connection back to the pool.

    With an expiration time the connection is marked reusable until
    then, otherwise it is dropped from the pool. The per-host
    semaphore slot is released in both cases.
    """
    key = (type, host, port)
    if key not in self.connections:
        log.warn(LOG_CACHE, "Release unknown connection %s://%s:%d", type, host, port)
        return
    lock, entries = self.connections[key]
    conn_id = get_connection_id(connection)
    if conn_id not in entries:
        log.warn(LOG_CACHE, "Release unknown connection %s://%s:%d from entries %s", type, host, port, entries.keys())
        return
    log.debug(LOG_CACHE, "Release lock for %s://%s:%d and expiration %s", type, host, port, expiration)
    # if the connection is reusable, set it to available, else delete it
    if expiration is None:
        del entries[conn_id]
    else:
        entry = entries[conn_id]
        entry[1] = ConnectionState.available
        entry[2] = expiration
    lock.release()
|
||||
|
||||
@synchronized(_lock)
def remove_expired (self):
    """Remove expired or soon to be expired connections from this pool.

    Connections expiring within the next five seconds are closed and
    dropped as well, since they would most likely time out while
    being reused.
    """
    t = time.time()
    for lock, entries in self.connections.values():
        # Collect ids first: the entries dict must not be mutated
        # while it is being iterated.
        # BUGFIX: was a list combined with .add(), which raised
        # AttributeError on the first expired connection.
        delete_entries = set()
        for id, conn_data in entries.items():
            # NOTE(review): compares against the string 'available'
            # while other methods use ConnectionState.available --
            # confirm both denote the same value.
            if conn_data[1] == 'available' and (t+5.0) >= conn_data[2]:
                try_close(conn_data[0])
                delete_entries.add(id)
        for id in delete_entries:
            del entries[id]
            # NOTE(review): releasing the semaphore here assumes an
            # available connection still holds a slot -- verify
            # against release(), which also releases the lock.
            lock.release()
            log.debug(LOG_CACHE, "released lock for id %s", id)
|
||||
|
||||
@synchronized(_lock)
def clear (self):
    """Forget every pooled connection, busy ones included.

    All underlying connections are closed (close errors ignored)
    before the pool bookkeeping is wiped.
    """
    entry_dicts = [entries for _unused, entries in self.connections.values()]
    for entries in entry_dicts:
        for entry in entries.values():
            # entry[0] is the connection object itself
            try_close(entry[0])
    self.connections.clear()
|
||||
|
||||
|
||||
def try_close (connection):
    """Close and remove a connection (not thread-safe, internal use only)."""
    try:
        connection.close()
    except Exception:
        # best effort: an error raised while closing must never propagate
        pass
|
||||
83
linkcheck/cache/cookie.py
vendored
83
linkcheck/cache/cookie.py
vendored
|
|
@ -1,83 +0,0 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Store and retrieve cookies.
|
||||
"""
|
||||
from .. import log, LOG_CACHE, cookies
|
||||
from ..decorators import synchronized
|
||||
from ..lock import get_lock
|
||||
|
||||
|
||||
_lock = get_lock("cookie")
|
||||
|
||||
class CookieJar (object):
    """Cookie storage, implementing the cookie handling policy."""

    def __init__ (self):
        """Initialize empty cookie cache."""
        # Store all cookies in a set; equal cookies replace each other.
        self.cache = set()

    @synchronized(_lock)
    def add (self, headers, scheme, host, path):
        """Parse cookie values from the given headers and add them to
        the cache. Expired cookies are dropped, equal existing cookies
        are replaced by the fresh one.

        @return: list of error messages for unparseable cookies
        @rtype: list of strings
        """
        errors = []
        # RFC 2109 (Netscape) cookie type
        self._parse_headers(headers, "Set-Cookie", cookies.NetscapeCookie,
            "Invalid cookie %r for %s:%s%s: %s", scheme, host, path, errors)
        # RFC 2965 cookie type
        self._parse_headers(headers, "Set-Cookie2", cookies.Rfc2965Cookie,
            "Invalid cookie2 %r for %s:%s%s: %s", scheme, host, path, errors)
        return errors

    def _parse_headers (self, headers, header_name, cookie_class, errfmt,
                        scheme, host, path, errors):
        """Parse all headers matching header_name with cookie_class and
        store the resulting cookies; error messages for invalid cookies
        are formatted with errfmt and appended to errors.
        (Internal helper; caller holds the jar lock.)"""
        for h in headers.getallmatchingheaders(header_name):
            # header line is "Name: value"; only the value is parsed
            value = h.split(':', 1)[1]
            try:
                cookie = cookie_class(value, scheme, host, path)
                # replace an equal, previously stored cookie
                if cookie in self.cache:
                    self.cache.remove(cookie)
                if not cookie.is_expired():
                    self.cache.add(cookie)
            except cookies.CookieError as msg:
                errors.append(errfmt % (h, scheme, host, path, msg))

    @synchronized(_lock)
    def get (self, scheme, host, port, path):
        """Cookie cache getter function. Return ordered list of cookies
        which match the given host, port and path.
        Cookies with more specific paths are listed first."""
        # local renamed from 'cookies' to avoid shadowing the imported
        # cookies module
        matching = [x for x in self.cache if x.check_expired() and
                    x.is_valid_for(scheme, host, port, path)]
        # order cookies with more specific (ie. longer) paths first
        matching.sort(key=lambda c: len(c.attributes['path']), reverse=True)
        log.debug(LOG_CACHE, "Found %d cookies for host %r path %r",
                  len(matching), host, path)
        return matching

    @synchronized(_lock)
    def __str__ (self):
        """Return stored cookies as string."""
        return "<CookieJar with %s>" % self.cache
|
||||
2
linkcheck/cache/robots_txt.py
vendored
2
linkcheck/cache/robots_txt.py
vendored
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
67
linkcheck/cache/urlqueue.py
vendored
67
linkcheck/cache/urlqueue.py
vendored
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -23,9 +23,6 @@ from time import time as _time
|
|||
from .. import log, LOG_CACHE
|
||||
|
||||
|
||||
LARGE_QUEUE_THRESHOLD = 1000
|
||||
FRONT_CHUNK_SIZE = 100
|
||||
|
||||
class Timeout (StandardError):
|
||||
"""Raised by join()"""
|
||||
pass
|
||||
|
|
@ -55,8 +52,8 @@ class UrlQueue (object):
|
|||
self.all_tasks_done = threading.Condition(self.mutex)
|
||||
self.unfinished_tasks = 0
|
||||
self.finished_tasks = 0
|
||||
self.in_progress = {}
|
||||
self.seen = {}
|
||||
self.in_progress = 0
|
||||
self.seen = set()
|
||||
self.shutdown = False
|
||||
# Each put() decreases the number of allowed puts.
|
||||
# This way we can restrict the number of URLs that are checked.
|
||||
|
|
@ -103,24 +100,29 @@ class UrlQueue (object):
|
|||
if remaining <= 0.0:
|
||||
raise Empty()
|
||||
self.not_empty.wait(remaining)
|
||||
url_data = self.queue.popleft()
|
||||
if url_data.has_result:
|
||||
# Already checked and copied from cache.
|
||||
pass
|
||||
else:
|
||||
key = url_data.cache_url_key
|
||||
assert key is not None
|
||||
self.in_progress[key] = url_data
|
||||
return url_data
|
||||
self.in_progress += 1
|
||||
return self.queue.popleft()
|
||||
|
||||
def put (self, item):
|
||||
"""Put an item into the queue.
|
||||
Block if necessary until a free slot is available.
|
||||
"""
|
||||
if self.put_denied(item):
|
||||
return
|
||||
with self.mutex:
|
||||
self._put(item)
|
||||
self.not_empty.notify()
|
||||
|
||||
def put_denied(self, url_data):
|
||||
"""Determine if put() will not append the item on the queue.
|
||||
@return True (reliable) or False (unreliable)
|
||||
"""
|
||||
if self.shutdown or self.allowed_puts == 0:
|
||||
return True
|
||||
if url_data.cache_url_key is not None and url_data.cache_url_key in self.seen:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _put (self, url_data):
|
||||
"""Put URL in queue, increase number of unfished tasks."""
|
||||
if self.shutdown:
|
||||
|
|
@ -133,17 +135,16 @@ class UrlQueue (object):
|
|||
self.allowed_puts -= 1
|
||||
log.debug(LOG_CACHE, "queueing %s", url_data)
|
||||
key = url_data.cache_url_key
|
||||
# cache key is None for URLs with invalid syntax
|
||||
assert key is not None or url_data.has_result, "invalid cache key in %s" % url_data
|
||||
if key in self.seen:
|
||||
self.seen[key] += 1
|
||||
if key is not None:
|
||||
# do not check duplicate URLs
|
||||
if key is not None:
|
||||
if key in self.seen:
|
||||
# don't check duplicate URLs
|
||||
return
|
||||
else:
|
||||
self.seen[key] = 0
|
||||
self.queue.append(url_data)
|
||||
self.seen.add(key)
|
||||
self.unfinished_tasks += 1
|
||||
if url_data.has_result:
|
||||
self.queue.appendleft(url_data)
|
||||
else:
|
||||
self.queue.append(url_data)
|
||||
|
||||
def task_done (self, url_data):
|
||||
"""
|
||||
|
|
@ -163,17 +164,11 @@ class UrlQueue (object):
|
|||
with self.all_tasks_done:
|
||||
log.debug(LOG_CACHE, "task_done %s", url_data)
|
||||
# check for aliases (eg. through HTTP redirections)
|
||||
if hasattr(url_data, "aliases"):
|
||||
for key in url_data.aliases:
|
||||
if key in self.seen:
|
||||
self.seen[key] += 1
|
||||
else:
|
||||
self.seen[key] = 0
|
||||
key = url_data.cache_url_key
|
||||
if key in self.in_progress:
|
||||
del self.in_progress[key]
|
||||
if hasattr(url_data, "aliases") and url_data.aliases:
|
||||
self.seen.update(url_data.aliases)
|
||||
self.finished_tasks += 1
|
||||
self.unfinished_tasks -= 1
|
||||
self.in_progress -= 1
|
||||
if self.unfinished_tasks <= 0:
|
||||
if self.unfinished_tasks < 0:
|
||||
raise ValueError('task_done() called too many times')
|
||||
|
|
@ -216,7 +211,5 @@ class UrlQueue (object):
|
|||
|
||||
def status (self):
|
||||
"""Get tuple (finished tasks, in progress, queue size)."""
|
||||
with self.mutex:
|
||||
return (self.finished_tasks,
|
||||
len(self.in_progress), len(self.queue))
|
||||
|
||||
# no need to acquire self.mutex since the numbers are unreliable anyways.
|
||||
return (self.finished_tasks, self.in_progress, len(self.queue))
|
||||
|
|
|
|||
|
|
@ -101,43 +101,46 @@ def get_url_from (base_url, recursion_level, aggregate,
|
|||
base_ref = strformat.unicode_safe(base_ref)
|
||||
name = strformat.unicode_safe(name)
|
||||
url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
|
||||
scheme = None
|
||||
if not (url or name):
|
||||
# use filename as base url, with slash as path seperator
|
||||
name = base_url.replace("\\", "/")
|
||||
if parent_content_type == 'application/x-httpd-php' and \
|
||||
'<?' in base_url and '?>' in base_url and url.startswith('file:'):
|
||||
# ignore but warn about URLs from local PHP files with execution directives
|
||||
elif ":" in url:
|
||||
scheme = url.split(":", 1)[0].lower()
|
||||
allowed_schemes = aggregate.config["allowedschemes"]
|
||||
# ignore local PHP files with execution directives
|
||||
local_php = (parent_content_type == 'application/x-httpd-php' and
|
||||
'<?' in base_url and '?>' in base_url and scheme == 'file')
|
||||
if local_php or (allowed_schemes and scheme not in allowed_schemes):
|
||||
klass = ignoreurl.IgnoreUrl
|
||||
else:
|
||||
assume_local_file = recursion_level == 0
|
||||
klass = get_urlclass_from(url, assume_local_file=assume_local_file)
|
||||
assume_local_file = (recursion_level == 0)
|
||||
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
|
||||
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
|
||||
return klass(base_url, recursion_level, aggregate,
|
||||
parent_url=parent_url, base_ref=base_ref,
|
||||
line=line, column=column, name=name, extern=extern)
|
||||
|
||||
|
||||
def get_urlclass_from (url, assume_local_file=False):
|
||||
"""Return checker class for given URL. If URL does not start
|
||||
with a URL scheme and assume_local_file is True, assume that
|
||||
the given URL is a local file."""
|
||||
if url.startswith("http:"):
|
||||
def get_urlclass_from (scheme, assume_local_file=False):
|
||||
"""Return checker class for given URL scheme. If the scheme
|
||||
cannot be matched and assume_local_file is True, assume a local file.
|
||||
"""
|
||||
if scheme in ("http", "https"):
|
||||
klass = httpurl.HttpUrl
|
||||
elif url.startswith("ftp:"):
|
||||
elif scheme == "ftp":
|
||||
klass = ftpurl.FtpUrl
|
||||
elif url.startswith("file:"):
|
||||
elif scheme == "file":
|
||||
klass = fileurl.FileUrl
|
||||
elif url.startswith("telnet:"):
|
||||
elif scheme == "telnet":
|
||||
klass = telneturl.TelnetUrl
|
||||
elif url.startswith("mailto:"):
|
||||
elif scheme == "mailto":
|
||||
klass = mailtourl.MailtoUrl
|
||||
elif url.startswith("https:"):
|
||||
klass = httpsurl.HttpsUrl
|
||||
elif url.startswith(("nntp:", "news:", "snews:")):
|
||||
elif scheme in ("nntp", "news", "snews"):
|
||||
klass = nntpurl.NntpUrl
|
||||
elif url.startswith('dns:'):
|
||||
elif scheme == "dns":
|
||||
klass = dnsurl.DnsUrl
|
||||
elif unknownurl.is_unknown_url(url):
|
||||
elif scheme and unknownurl.is_unknown_scheme(scheme):
|
||||
klass = unknownurl.UnknownUrl
|
||||
elif assume_local_file:
|
||||
klass = fileurl.FileUrl
|
||||
|
|
@ -168,4 +171,4 @@ def get_index_html (urls):
|
|||
|
||||
# all the URL classes
|
||||
from . import (fileurl, unknownurl, ftpurl, httpurl, dnsurl,
|
||||
httpsurl, mailtourl, telneturl, nntpurl, ignoreurl)
|
||||
mailtourl, telneturl, nntpurl, ignoreurl)
|
||||
|
|
|
|||
|
|
@ -21,8 +21,8 @@ import socket
|
|||
import select
|
||||
import nntplib
|
||||
import ftplib
|
||||
import httplib as orighttplib
|
||||
from .. import LinkCheckerError, httplib2 as httplib
|
||||
import requests
|
||||
from .. import LinkCheckerError
|
||||
from dns.exception import DNSException
|
||||
|
||||
# Catch these exception on syntax checks.
|
||||
|
|
@ -45,9 +45,8 @@ ExcCacheList = [
|
|||
nntplib.error_perm,
|
||||
nntplib.error_proto,
|
||||
EOFError,
|
||||
# http error
|
||||
httplib.error,
|
||||
orighttplib.error,
|
||||
# http errors
|
||||
requests.exceptions.RequestException,
|
||||
# ftp errors
|
||||
ftplib.error_reply,
|
||||
ftplib.error_temp,
|
||||
|
|
@ -75,39 +74,25 @@ ExcList = ExcCacheList + ExcNoCacheList
|
|||
|
||||
# some constants
|
||||
URL_MAX_LENGTH = 2000
|
||||
URL_WARN_LENGTH = 255
|
||||
URL_WARN_LENGTH = 1024
|
||||
|
||||
# the warnings
|
||||
WARN_URL_EFFECTIVE_URL = "url-effective-url"
|
||||
WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content"
|
||||
WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found"
|
||||
WARN_URL_WARNREGEX_FOUND = "url-warnregex-found"
|
||||
WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
|
||||
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
|
||||
WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip"
|
||||
WARN_URL_TOO_LONG = "url-too-long"
|
||||
WARN_URL_WHITESPACE = "url-whitespace"
|
||||
WARN_FILE_MISSING_SLASH = "file-missing-slash"
|
||||
WARN_FILE_SYSTEM_PATH = "file-system-path"
|
||||
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
|
||||
WARN_HTTP_ROBOTS_DENIED = "http-robots-denied"
|
||||
WARN_HTTP_MOVED_PERMANENT = "http-moved-permanent"
|
||||
WARN_HTTP_EMPTY_CONTENT = "http-empty-content"
|
||||
WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
|
||||
WARN_HTTP_DECOMPRESS_ERROR = "http-decompress-error"
|
||||
WARN_HTTP_UNSUPPORTED_ENCODING = "http-unsupported-encoding"
|
||||
WARN_HTTP_AUTH_UNKNOWN = "http-auth-unknonwn"
|
||||
WARN_HTTP_AUTH_UNAUTHORIZED = "http-auth-unauthorized"
|
||||
WARN_HTTPS_CERTIFICATE = "https-certificate-error"
|
||||
WARN_IGNORE_URL = "ignore-url"
|
||||
WARN_MAIL_NO_MX_HOST = "mail-no-mx-host"
|
||||
WARN_MAIL_UNVERIFIED_ADDRESS = "mail-unverified-address"
|
||||
WARN_MAIL_NO_CONNECTION = "mail-no-connection"
|
||||
WARN_NNTP_NO_SERVER = "nntp-no-server"
|
||||
WARN_NNTP_NO_NEWSGROUP = "nntp-no-newsgroup"
|
||||
WARN_SYNTAX_HTML = "syntax-html"
|
||||
WARN_SYNTAX_CSS = "syntax-css"
|
||||
|
||||
# registered warnings
|
||||
Warnings = {
|
||||
|
|
@ -115,41 +100,20 @@ Warnings = {
|
|||
_("The effective URL is different from the original."),
|
||||
WARN_URL_ERROR_GETTING_CONTENT:
|
||||
_("Could not get the content of the URL."),
|
||||
WARN_URL_ANCHOR_NOT_FOUND: _("URL anchor was not found."),
|
||||
WARN_URL_WARNREGEX_FOUND:
|
||||
_("The warning regular expression was found in the URL contents."),
|
||||
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
|
||||
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),
|
||||
WARN_URL_TOO_LONG: _("The URL is longer than the recommended size."),
|
||||
WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."),
|
||||
WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."),
|
||||
WARN_FILE_SYSTEM_PATH:
|
||||
_("The file: path is not the same as the system specific path."),
|
||||
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
|
||||
WARN_HTTP_ROBOTS_DENIED: _("The http: URL checking has been denied."),
|
||||
WARN_HTTP_MOVED_PERMANENT: _("The URL has moved permanently."),
|
||||
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
|
||||
WARN_HTTP_COOKIE_STORE_ERROR:
|
||||
_("An error occurred while storing a cookie."),
|
||||
WARN_HTTP_DECOMPRESS_ERROR:
|
||||
_("An error occurred while decompressing the URL content."),
|
||||
WARN_HTTP_UNSUPPORTED_ENCODING:
|
||||
_("The URL content is encoded with an unknown encoding."),
|
||||
WARN_HTTP_AUTH_UNKNOWN:
|
||||
_("Unsupported HTTP authentication method."),
|
||||
WARN_HTTP_AUTH_UNAUTHORIZED:
|
||||
_("Unauthorized access without HTTP authentication."),
|
||||
WARN_HTTPS_CERTIFICATE: _("The SSL certificate is invalid or expired."),
|
||||
WARN_IGNORE_URL: _("The URL has been ignored."),
|
||||
WARN_MAIL_NO_MX_HOST: _("The mail MX host could not be found."),
|
||||
WARN_MAIL_UNVERIFIED_ADDRESS:
|
||||
_("The mailto: address could not be verified."),
|
||||
WARN_MAIL_NO_CONNECTION:
|
||||
_("No connection to a MX host could be established."),
|
||||
WARN_NNTP_NO_SERVER: _("No NNTP server was found."),
|
||||
WARN_NNTP_NO_NEWSGROUP: _("The NNTP newsgroup could not be found."),
|
||||
WARN_URL_OBFUSCATED_IP: _("The IP is obfuscated."),
|
||||
WARN_SYNTAX_HTML: _("HTML syntax error."),
|
||||
WARN_SYNTAX_CSS: _("CSS syntax error."),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -25,7 +25,7 @@ import urllib
|
|||
import urllib2
|
||||
from datetime import datetime
|
||||
|
||||
from . import urlbase, get_index_html, get_url_from
|
||||
from . import urlbase, get_index_html
|
||||
from .. import log, LOG_CHECK, fileutil, LinkCheckerError, url as urlutil
|
||||
from ..bookmarks import firefox
|
||||
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
|
||||
|
|
@ -163,8 +163,6 @@ class FileUrl (urlbase.UrlBase):
|
|||
return
|
||||
filename = self.get_os_filename()
|
||||
self.size = fileutil.get_size(filename)
|
||||
if self.dlsize == -1:
|
||||
self.dlsize = self.size
|
||||
self.modified = datetime.utcfromtimestamp(fileutil.get_mtime(filename))
|
||||
|
||||
def check_connection (self):
|
||||
|
|
@ -203,16 +201,13 @@ class FileUrl (urlbase.UrlBase):
|
|||
def read_content (self):
|
||||
"""Return file content, or in case of directories a dummy HTML file
|
||||
with links to the files."""
|
||||
if self.size > self.MaxFilesizeBytes:
|
||||
raise LinkCheckerError(_("File size too large"))
|
||||
if self.is_directory():
|
||||
data = get_index_html(get_files(self.get_os_filename()))
|
||||
if isinstance(data, unicode):
|
||||
data = data.encode("iso8859-1", "ignore")
|
||||
size = len(data)
|
||||
else:
|
||||
data, size = super(FileUrl, self).read_content()
|
||||
return data, size
|
||||
data = super(FileUrl, self).read_content()
|
||||
return data
|
||||
|
||||
def is_html (self):
|
||||
"""Check if file is a HTML file."""
|
||||
|
|
@ -272,27 +267,6 @@ class FileUrl (urlbase.UrlBase):
|
|||
log.debug(LOG_CHECK, "File with content type %r is not parseable.", ctype)
|
||||
return False
|
||||
|
||||
def parse_url (self):
|
||||
"""Parse file contents for new links to check."""
|
||||
if self.is_directory():
|
||||
self.parse_html()
|
||||
elif firefox.has_sqlite and firefox.extension.search(self.url):
|
||||
self.parse_firefox()
|
||||
else:
|
||||
mime = self.get_content_type()
|
||||
key = self.ContentMimetypes[mime]
|
||||
getattr(self, "parse_"+key)()
|
||||
self.add_num_url_info()
|
||||
|
||||
def parse_firefox (self):
|
||||
"""Parse a Firefox3 bookmark file."""
|
||||
log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self)
|
||||
filename = self.get_os_filename()
|
||||
for url, name in firefox.parse_bookmark_file(filename):
|
||||
url_data = get_url_from(url, self.recursion_level+1,
|
||||
self.aggregate, parent_url=self.url, name=name)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
|
||||
def get_content_type (self):
|
||||
"""Return URL content type, or an empty string if content
|
||||
type could not be found."""
|
||||
|
|
@ -326,6 +300,5 @@ class FileUrl (urlbase.UrlBase):
|
|||
webroot = self.aggregate.config["localwebroot"]
|
||||
if webroot and url and url.startswith(u"/"):
|
||||
url = webroot + url[1:]
|
||||
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.",
|
||||
webroot, url)
|
||||
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
|
||||
super(FileUrl, self).add_url(url, line=line, column=column, name=name, base=base)
|
||||
|
|
|
|||
|
|
@ -22,11 +22,11 @@ import ftplib
|
|||
from cStringIO import StringIO
|
||||
|
||||
from .. import log, LOG_CHECK, LinkCheckerError, fileutil
|
||||
from . import proxysupport, httpurl, internpaturl, get_index_html, pooledconnection
|
||||
from . import proxysupport, httpurl, internpaturl, get_index_html
|
||||
from .const import WARN_FTP_MISSING_SLASH
|
||||
|
||||
|
||||
class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledconnection.PooledConnection):
|
||||
class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
||||
"""
|
||||
Url link with ftp scheme.
|
||||
"""
|
||||
|
|
@ -70,14 +70,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
|
|||
|
||||
def login (self):
|
||||
"""Log into ftp server and check the welcome message."""
|
||||
def create_connection(scheme, host, port):
|
||||
"""Create a new ftp connection."""
|
||||
connection = ftplib.FTP(timeout=self.aggregate.config["timeout"])
|
||||
if log.is_debug(LOG_CHECK):
|
||||
connection.set_debuglevel(1)
|
||||
return connection
|
||||
scheme, host, port = self.get_netloc()
|
||||
self.get_pooled_connection(scheme, host, port, create_connection)
|
||||
self.url_connection = ftplib.FTP(timeout=self.aggregate.config["timeout"])
|
||||
if log.is_debug(LOG_CHECK):
|
||||
self.url_connection.set_debuglevel(1)
|
||||
try:
|
||||
self.url_connection.connect(self.host, self.port)
|
||||
_user, _password = self.get_user_password()
|
||||
|
|
@ -92,6 +87,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
|
|||
# note that the info may change every time a user logs in,
|
||||
# so don't add it to the url_data info.
|
||||
log.debug(LOG_CHECK, "FTP info %s", info)
|
||||
pass
|
||||
else:
|
||||
raise LinkCheckerError(_("Got no answer from FTP server"))
|
||||
except EOFError as msg:
|
||||
|
|
@ -105,6 +101,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
|
|||
features = self.url_connection.sendcmd("FEAT")
|
||||
except ftplib.error_perm as msg:
|
||||
log.debug(LOG_CHECK, "Ignoring error when getting FTP features: %s" % msg)
|
||||
pass
|
||||
else:
|
||||
log.debug(LOG_CHECK, "FTP features %s", features)
|
||||
if " UTF-8" in features.splitlines():
|
||||
|
|
@ -176,7 +173,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
|
|||
"""See if URL target is parseable for recursion."""
|
||||
if self.is_directory():
|
||||
return True
|
||||
ctype = self.get_content_type(self.get_content)
|
||||
ctype = self.get_content_type()
|
||||
if ctype in self.ContentMimetypes:
|
||||
return True
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", ctype)
|
||||
|
|
@ -188,20 +185,11 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
|
|||
path = self.urlparts[2]
|
||||
return (not path) or path.endswith('/')
|
||||
|
||||
def parse_url (self):
|
||||
"""Parse URL target for links."""
|
||||
if self.is_directory():
|
||||
self.parse_html()
|
||||
return
|
||||
key = self.ContentMimetypes[self.get_content_type(self.get_content)]
|
||||
getattr(self, "parse_"+key)()
|
||||
self.add_num_url_info()
|
||||
|
||||
def get_content_type (self, read=None):
|
||||
def get_content_type (self):
|
||||
"""Return URL content type, or an empty string if content
|
||||
type could not be found."""
|
||||
if self.content_type is None:
|
||||
self.content_type = fileutil.guess_mimetype(self.url, read=read)
|
||||
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
|
||||
return self.content_type
|
||||
|
||||
def read_content (self):
|
||||
|
|
@ -210,6 +198,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
|
|||
if self.is_directory():
|
||||
self.url_connection.cwd(self.filename)
|
||||
self.files = self.get_files()
|
||||
# XXX limit number of files?
|
||||
data = get_index_html(self.files)
|
||||
else:
|
||||
# download file in BINARY mode
|
||||
|
|
@ -217,20 +206,20 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
|
|||
buf = StringIO()
|
||||
def stor_data (s):
|
||||
"""Helper method storing given data"""
|
||||
self.aggregate.add_download_data(self.cache_content_key, s)
|
||||
# limit the download size
|
||||
if (buf.tell() + len(s)) > self.MaxFilesizeBytes:
|
||||
if (buf.tell() + len(s)) > self.max_size:
|
||||
raise LinkCheckerError(_("FTP file size too large"))
|
||||
buf.write(s)
|
||||
self.url_connection.retrbinary(ftpcmd, stor_data)
|
||||
data = buf.getvalue()
|
||||
buf.close()
|
||||
return data, len(data)
|
||||
return data
|
||||
|
||||
def close_connection (self):
|
||||
"""Release the open connection from the connection pool."""
|
||||
if self.url_connection is None:
|
||||
return
|
||||
scheme, host, port = self.get_netloc()
|
||||
self.aggregate.connections.release(scheme, host, port, self.url_connection)
|
||||
self.url_connection = None
|
||||
if self.url_connection is not None:
|
||||
try:
|
||||
self.url_connection.quit()
|
||||
except Exception:
|
||||
pass
|
||||
self.url_connection = None
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2005-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2005-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,179 +0,0 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Handle https links.
|
||||
"""
|
||||
import time
|
||||
from . import httpurl
|
||||
from .const import WARN_HTTPS_CERTIFICATE
|
||||
from .. import log, LOG_CHECK, strformat
|
||||
|
||||
|
||||
class HttpsUrl (httpurl.HttpUrl):
    """
    Url link with https scheme.

    Extends the plain HTTP checker with SSL certificate validation
    (hostname match and expiration-date warnings).
    """

    def local_check (self):
        """
        Check connection if SSL is supported, else ignore.
        """
        if httpurl.supportHttps:
            super(HttpsUrl, self).local_check()
        else:
            # no SSL support compiled in: record an info, do not fail
            self.add_info(_("%s URL ignored.") % self.scheme.capitalize())

    def get_http_object (self, scheme, host, port):
        """Open a HTTP connection and check the SSL certificate."""
        super(HttpsUrl, self).get_http_object(scheme, host, port)
        self.check_ssl_certificate(self.url_connection.sock, host)

    def check_ssl_certificate(self, ssl_sock, host):
        """Run all SSL certificate checks that have not yet been done.
        OpenSSL already checked the SSL notBefore and notAfter dates.

        Certificate problems are reported as warnings, never as errors.
        """
        if not hasattr(ssl_sock, "getpeercert"):
            # the URL was a HTTPS -> HTTP redirect
            return
        cert = ssl_sock.getpeercert()
        log.debug(LOG_CHECK, "Got SSL certificate %s", cert)
        if not cert:
            # empty certificate info: nothing to validate
            return
        if 'subject' in cert:
            self.check_ssl_hostname(ssl_sock, cert, host)
        else:
            msg = _('certificate did not include "subject" information')
            self.add_ssl_warning(ssl_sock, msg)
        if 'notAfter' in cert:
            self.check_ssl_valid_date(ssl_sock, cert)
        else:
            msg = _('certificate did not include "notAfter" information')
            self.add_ssl_warning(ssl_sock, msg)

    def check_ssl_hostname(self, ssl_sock, cert, host):
        """Check the hostname against the certificate according to
        RFC2818.

        Uses the module-level match_hostname()/CertificateError pair
        copied from Python 3's ssl.py (defined later in this module).
        """
        try:
            match_hostname(cert, host)
        except CertificateError as msg:
            self.add_ssl_warning(ssl_sock, msg)

    def check_ssl_valid_date(self, ssl_sock, cert):
        """Check if the certificate is still valid, or if configured check
        if it's at least a number of days valid.

        Adds a warning when the certificate is expired, has an
        unparseable "notAfter" value, or expires within the configured
        "warnsslcertdaysvalid" number of days.
        """
        import ssl
        checkDaysValid = self.aggregate.config["warnsslcertdaysvalid"]
        try:
            notAfter = ssl.cert_time_to_seconds(cert['notAfter'])
        except ValueError as msg:
            # NOTE(review): user-visible message contains the typo
            # "certficate"; left unchanged here since it is runtime text.
            msg = _('invalid certficate "notAfter" value %r') % cert['notAfter']
            self.add_ssl_warning(ssl_sock, msg)
            return
        curTime = time.time()
        # Calculate seconds until certificate expires. Can be negative if
        # the certificate is already expired.
        secondsValid = notAfter - curTime
        if secondsValid < 0:
            msg = _('certficate is expired on %s') % cert['notAfter']
            self.add_ssl_warning(ssl_sock, msg)
        elif checkDaysValid > 0 and \
            secondsValid < (checkDaysValid * strformat.SECONDS_PER_DAY):
            # still valid, but for fewer days than the configured minimum
            strSecondsValid = strformat.strduration_long(secondsValid)
            msg = _('certificate is only %s valid') % strSecondsValid
            self.add_ssl_warning(ssl_sock, msg)

    def add_ssl_warning(self, ssl_sock, msg):
        """Add a warning message about an SSL certificate error.

        The warning includes the negotiated cipher and protocol and is
        tagged WARN_HTTPS_CERTIFICATE.
        """
        cipher_name, ssl_protocol, secret_bits = ssl_sock.cipher()
        err = _(u"SSL warning: %(msg)s. Cipher %(cipher)s, %(protocol)s.")
        attrs = dict(msg=msg, cipher=cipher_name, protocol=ssl_protocol)
        self.add_warning(err % attrs, tag=WARN_HTTPS_CERTIFICATE)
|
||||
|
||||
|
||||
# Copied from ssl.py in Python 3:
|
||||
# Wrapper module for _ssl, providing some additional facilities
|
||||
# implemented in Python. Written by Bill Janssen.
|
||||
import re
|
||||
|
||||
class CertificateError(ValueError):
|
||||
"""Raised on certificate errors."""
|
||||
pass
|
||||
|
||||
|
||||
def _dnsname_to_pat(dn, max_wildcards=1):
|
||||
"""Convert a DNS certificate name to a hostname matcher."""
|
||||
pats = []
|
||||
for frag in dn.split(r'.'):
|
||||
if frag.count('*') > max_wildcards:
|
||||
# Issue #17980: avoid denials of service by refusing more
|
||||
# than one wildcard per fragment. A survery of established
|
||||
# policy among SSL implementations showed it to be a
|
||||
# reasonable choice.
|
||||
raise CertificateError(
|
||||
"too many wildcards in certificate DNS name: " + repr(dn))
|
||||
if frag == '*':
|
||||
# When '*' is a fragment by itself, it matches a non-empty dotless
|
||||
# fragment.
|
||||
pats.append('[^.]+')
|
||||
else:
|
||||
# Otherwise, '*' matches any dotless fragment.
|
||||
frag = re.escape(frag)
|
||||
pats.append(frag.replace(r'\*', '[^.]*'))
|
||||
return re.compile(r'\A' + r'\.'.join(pats) + r'\Z', re.IGNORECASE)
|
||||
|
||||
|
||||
def match_hostname(cert, hostname):
|
||||
"""Verify that *cert* (in decoded format as returned by
|
||||
SSLSocket.getpeercert()) matches the *hostname*. RFC 2818 rules
|
||||
are mostly followed, but IP addresses are not accepted for *hostname*.
|
||||
|
||||
CertificateError is raised on failure. On success, the function
|
||||
returns nothing.
|
||||
"""
|
||||
if not cert:
|
||||
raise ValueError("empty or no certificate")
|
||||
dnsnames = []
|
||||
san = cert.get('subjectAltName', ())
|
||||
for key, value in san:
|
||||
if key == 'DNS':
|
||||
if _dnsname_to_pat(value).match(hostname):
|
||||
return
|
||||
dnsnames.append(value)
|
||||
if not dnsnames:
|
||||
# The subject is only checked when there is no dNSName entry
|
||||
# in subjectAltName
|
||||
for sub in cert.get('subject', ()):
|
||||
for key, value in sub:
|
||||
# XXX according to RFC 2818, the most specific Common Name
|
||||
# must be used.
|
||||
if key == 'commonName':
|
||||
if _dnsname_to_pat(value).match(hostname):
|
||||
return
|
||||
dnsnames.append(value)
|
||||
if len(dnsnames) > 1:
|
||||
raise CertificateError("hostname %r "
|
||||
"doesn't match either of %s"
|
||||
% (hostname, ', '.join(map(repr, dnsnames))))
|
||||
elif len(dnsnames) == 1:
|
||||
raise CertificateError("hostname %r "
|
||||
"doesn't match %r"
|
||||
% (hostname, dnsnames[0]))
|
||||
else:
|
||||
raise CertificateError("no appropriate commonName or "
|
||||
"subjectAltName fields were found")
|
||||
|
|
@ -18,26 +18,14 @@
|
|||
Handle http links.
|
||||
"""
|
||||
|
||||
import urlparse
|
||||
import os
|
||||
import errno
|
||||
import zlib
|
||||
import socket
|
||||
import rfc822
|
||||
import time
|
||||
import requests
|
||||
from cStringIO import StringIO
|
||||
from datetime import datetime
|
||||
|
||||
from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
|
||||
httplib2 as httplib, LinkCheckerError, httputil, configuration)
|
||||
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
|
||||
get_url_from, pooledconnection)
|
||||
from .. import (log, LOG_CHECK, strformat,
|
||||
url as urlutil, LinkCheckerError)
|
||||
from . import (internpaturl, proxysupport, httpheaders as headers)
|
||||
# import warnings
|
||||
from .const import WARN_HTTP_ROBOTS_DENIED, \
|
||||
WARN_HTTP_MOVED_PERMANENT, \
|
||||
WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
|
||||
WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
|
||||
WARN_HTTP_AUTH_UNKNOWN, WARN_HTTP_AUTH_UNAUTHORIZED
|
||||
from .const import WARN_HTTP_EMPTY_CONTENT
|
||||
|
||||
# assumed HTTP header encoding
|
||||
HEADER_ENCODING = "iso-8859-1"
|
||||
|
|
@ -46,18 +34,7 @@ HTTP_SCHEMAS = ('http://', 'https://')
|
|||
# helper alias
|
||||
unicode_safe = strformat.unicode_safe
|
||||
|
||||
supportHttps = hasattr(httplib, "HTTPSConnection")
|
||||
|
||||
SUPPORTED_ENCODINGS = ('x-gzip', 'gzip', 'deflate')
|
||||
# Accept-Encoding header value
|
||||
ACCEPT_ENCODING = ",".join(SUPPORTED_ENCODINGS)
|
||||
# Accept-Charset header value
|
||||
ACCEPT_CHARSET = "utf-8,ISO-8859-1;q=0.7,*;q=0.3"
|
||||
# Accept mime type header value
|
||||
ACCEPT = "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
|
||||
|
||||
|
||||
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledconnection.PooledConnection):
|
||||
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
||||
"""
|
||||
Url link with http scheme.
|
||||
"""
|
||||
|
|
@ -67,28 +44,16 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
Initialize HTTP specific variables.
|
||||
"""
|
||||
super(HttpUrl, self).reset()
|
||||
self.max_redirects = 5
|
||||
self.has301status = False
|
||||
# flag if connection is persistent
|
||||
self.persistent = False
|
||||
# URLs seen through 301/302 redirections
|
||||
# URLs seen through redirections
|
||||
self.aliases = []
|
||||
# initialize check data
|
||||
self.headers = None
|
||||
self.headers = {}
|
||||
self.auth = None
|
||||
self.cookies = []
|
||||
# temporary data filled when reading redirections
|
||||
self._data = None
|
||||
# flag telling if GET method is allowed; determined by robots.txt
|
||||
self.method_get_allowed = True
|
||||
# HttpResponse object
|
||||
self.response = None
|
||||
|
||||
def allows_robots (self, url):
|
||||
"""
|
||||
Fetch and parse the robots.txt of given url. Checks if LinkChecker
|
||||
can get the requested resource content. HEAD requests however are
|
||||
still allowed.
|
||||
can get the requested resource content.
|
||||
|
||||
@param url: the url to be requested
|
||||
@type url: string
|
||||
|
|
@ -98,9 +63,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
roboturl = self.get_robots_txt_url()
|
||||
user, password = self.get_user_password()
|
||||
rb = self.aggregate.robots_txt
|
||||
callback = self.aggregate.connections.host_wait
|
||||
return rb.allows_url(roboturl, url, self.proxy, user, password,
|
||||
callback=callback)
|
||||
#callback = self.aggregate.connections.host_wait
|
||||
return rb.allows_url(roboturl, self.url, self.proxy, user, password)
|
||||
|
||||
def add_size_info (self):
|
||||
"""Get size of URL content from HTTP header."""
|
||||
|
|
@ -110,8 +74,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
# the content data is always decoded.
|
||||
try:
|
||||
self.size = int(self.getheader("Content-Length"))
|
||||
if self.dlsize == -1:
|
||||
self.dlsize = self.size
|
||||
except (ValueError, OverflowError):
|
||||
pass
|
||||
else:
|
||||
|
|
@ -134,164 +96,56 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
- 5xx: Server Error - The server failed to fulfill an apparently
|
||||
valid request
|
||||
"""
|
||||
self.session = self.aggregate.get_request_session()
|
||||
# set the proxy, so a 407 status after this is an error
|
||||
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
|
||||
self.construct_auth()
|
||||
# check robots.txt
|
||||
if not self.allows_robots(self.url):
|
||||
# remove all previously stored results
|
||||
self.add_warning(
|
||||
_("Access denied by robots.txt, skipping content checks."),
|
||||
tag=WARN_HTTP_ROBOTS_DENIED)
|
||||
self.method_get_allowed = False
|
||||
# first try with HEAD
|
||||
self.method = "HEAD"
|
||||
self.add_info(_("Access denied by robots.txt, checked only syntax."))
|
||||
self.set_result(_("syntax OK"))
|
||||
self.do_check_content = False
|
||||
return
|
||||
# check the http connection
|
||||
self.check_http_connection()
|
||||
# redirections might have changed the URL
|
||||
self.url = urlutil.urlunsplit(self.urlparts)
|
||||
# check response
|
||||
if self.response is not None:
|
||||
self.check_response()
|
||||
self.close_response()
|
||||
request = self.build_request()
|
||||
self.send_request(request)
|
||||
self.follow_redirections(request)
|
||||
self.check_response()
|
||||
|
||||
def check_http_connection (self):
|
||||
"""
|
||||
Check HTTP connection and return get response and a flag
|
||||
if the check algorithm had to fall back to the GET method.
|
||||
def build_request(self):
|
||||
"""Build a prepared request object."""
|
||||
clientheaders = {
|
||||
"User-Agent": self.aggregate.config["useragent"],
|
||||
"DNT": "1",
|
||||
}
|
||||
if (self.parent_url and
|
||||
self.parent_url.lower().startswith(HTTP_SCHEMAS)):
|
||||
clientheaders["Referer"] = self.parent_url
|
||||
kwargs = dict(
|
||||
method='GET',
|
||||
url=self.url,
|
||||
headers=clientheaders,
|
||||
)
|
||||
if self.auth:
|
||||
kwargs['auth'] = self.auth
|
||||
log.debug(LOG_CHECK, "Prepare request with %s", kwargs)
|
||||
request = requests.Request(**kwargs)
|
||||
return self.session.prepare_request(request)
|
||||
|
||||
@return: response or None if url is already handled
|
||||
@rtype: HttpResponse or None
|
||||
"""
|
||||
while True:
|
||||
# XXX refactor this
|
||||
self.close_response()
|
||||
try:
|
||||
self._try_http_response()
|
||||
except httplib.BadStatusLine as msg:
|
||||
# some servers send empty HEAD replies
|
||||
if self.method == "HEAD" and self.method_get_allowed:
|
||||
log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
|
||||
self.fallback_to_get()
|
||||
continue
|
||||
raise
|
||||
except socket.error as msg:
|
||||
# some servers reset the connection on HEAD requests
|
||||
if self.method == "HEAD" and self.method_get_allowed and \
|
||||
msg[0] == errno.ECONNRESET:
|
||||
self.fallback_to_get()
|
||||
continue
|
||||
raise
|
||||
|
||||
uheaders = unicode_safe(self.headers, encoding=HEADER_ENCODING)
|
||||
log.debug(LOG_CHECK, "Headers: %s", uheaders)
|
||||
# proxy enforcement (overrides standard proxy)
|
||||
if self.response.status == 305 and self.headers:
|
||||
oldproxy = (self.proxy, self.proxyauth)
|
||||
newproxy = self.getheader("Location")
|
||||
if newproxy:
|
||||
self.add_info(_("Enforced proxy `%(name)s'.") %
|
||||
{"name": newproxy})
|
||||
self.set_proxy(newproxy)
|
||||
self.close_response()
|
||||
if self.proxy is None:
|
||||
self.set_result(
|
||||
_("Missing 'Location' header with enforced proxy status 305, aborting."),
|
||||
valid=False)
|
||||
return
|
||||
elif not self.proxy:
|
||||
self.set_result(
|
||||
_("Empty 'Location' header value with enforced proxy status 305, aborting."),
|
||||
valid=False)
|
||||
return
|
||||
self._try_http_response()
|
||||
# restore old proxy settings
|
||||
self.proxy, self.proxyauth = oldproxy
|
||||
try:
|
||||
tries = self.follow_redirections()
|
||||
except httplib.BadStatusLine as msg:
|
||||
# some servers send empty HEAD replies
|
||||
if self.method == "HEAD" and self.method_get_allowed:
|
||||
log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
|
||||
self.fallback_to_get()
|
||||
continue
|
||||
raise
|
||||
if tries == -1:
|
||||
log.debug(LOG_CHECK, "already handled")
|
||||
self.close_response()
|
||||
self.do_check_content = False
|
||||
return
|
||||
if tries >= self.max_redirects:
|
||||
if self.method == "HEAD" and self.method_get_allowed:
|
||||
# Microsoft servers tend to recurse HEAD requests
|
||||
self.fallback_to_get()
|
||||
continue
|
||||
self.set_result(_("more than %d redirections, aborting") %
|
||||
self.max_redirects, valid=False)
|
||||
self.close_response()
|
||||
self.do_check_content = False
|
||||
return
|
||||
if self.do_fallback(self.response.status):
|
||||
self.fallback_to_get()
|
||||
continue
|
||||
# user authentication
|
||||
if self.response.status == 401:
|
||||
authenticate = self.getheader('WWW-Authenticate')
|
||||
if authenticate is None:
|
||||
# Either the server intentionally blocked this request,
|
||||
# or there is a form on this page which requires
|
||||
# manual user/password input.
|
||||
# Either way, this is a warning.
|
||||
self.add_warning(_("Unauthorized access without HTTP authentication."),
|
||||
tag=WARN_HTTP_AUTH_UNAUTHORIZED)
|
||||
return
|
||||
if not authenticate.startswith("Basic"):
|
||||
# LinkChecker only supports Basic authorization
|
||||
args = {"auth": authenticate}
|
||||
self.add_warning(
|
||||
_("Unsupported HTTP authentication `%(auth)s', " \
|
||||
"only `Basic' authentication is supported.") % args,
|
||||
tag=WARN_HTTP_AUTH_UNKNOWN)
|
||||
return
|
||||
if not self.auth:
|
||||
self.construct_auth()
|
||||
if self.auth:
|
||||
continue
|
||||
break
|
||||
|
||||
def do_fallback(self, status):
|
||||
"""Check for fallback according to response status.
|
||||
@param status: The HTTP response status
|
||||
@ptype status: int
|
||||
@return: True if checker should use GET, else False
|
||||
@rtype: bool
|
||||
"""
|
||||
if self.method == "HEAD":
|
||||
# Some sites do not support HEAD requests, for example
|
||||
# youtube sends a 404 with HEAD, 200 with GET. Doh.
|
||||
# A 405 "Method not allowed" status should also use GET.
|
||||
if status >= 400:
|
||||
log.debug(LOG_CHECK, "Method HEAD error %d, falling back to GET", status)
|
||||
return True
|
||||
# Other sites send 200 with HEAD, but 404 with GET. Bummer.
|
||||
poweredby = self.getheader('X-Powered-By', u'')
|
||||
server = self.getheader('Server', u'')
|
||||
# Some servers (Zope, Apache Coyote/Tomcat, IIS have wrong
|
||||
# content type with HEAD. This seems to be a common problem.
|
||||
if (poweredby.startswith('Zope') or server.startswith('Zope')
|
||||
or server.startswith('Apache-Coyote')
|
||||
or ('ASP.NET' in poweredby and 'Microsoft-IIS' in server)):
|
||||
return True
|
||||
return False
|
||||
|
||||
def fallback_to_get(self):
|
||||
"""Set method to GET and clear aliases."""
|
||||
self.close_response()
|
||||
self.close_connection()
|
||||
self.method = "GET"
|
||||
self.aliases = []
|
||||
self.urlparts = strformat.url_unicode_split(self.url)
|
||||
self.build_url_parts()
|
||||
def send_request(self, request):
|
||||
"""Send request and store response in self.url_connection."""
|
||||
# throttle the number of requests to each host
|
||||
self.aggregate.wait_for_host(self.urlparts[1])
|
||||
kwargs = dict(
|
||||
stream=True,
|
||||
timeout=self.aggregate.config["timeout"],
|
||||
allow_redirects=False,
|
||||
)
|
||||
if self.scheme == "https" and self.aggregate.config["sslverify"]:
|
||||
kwargs["verify"] = self.aggregate.config["sslverify"]
|
||||
log.debug(LOG_CHECK, "Send request with %s", kwargs)
|
||||
self.url_connection = self.session.send(request, **kwargs)
|
||||
self.headers = self.url_connection.headers
|
||||
|
||||
def construct_auth (self):
|
||||
"""Construct HTTP Basic authentication credentials if there
|
||||
|
|
@ -301,162 +155,34 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
return
|
||||
_user, _password = self.get_user_password()
|
||||
if _user is not None and _password is not None:
|
||||
credentials = httputil.encode_base64("%s:%s" % (_user, _password))
|
||||
self.auth = "Basic " + credentials
|
||||
log.debug(LOG_CHECK, "Using basic authentication")
|
||||
self.auth = (_user, _password)
|
||||
|
||||
def get_content_type (self):
|
||||
"""Return content MIME type or empty string."""
|
||||
if self.content_type is None:
|
||||
if self.headers:
|
||||
self.content_type = headers.get_content_type(self.headers)
|
||||
else:
|
||||
self.content_type = u""
|
||||
if not self.content_type:
|
||||
self.content_type = headers.get_content_type(self.headers)
|
||||
return self.content_type
|
||||
|
||||
def follow_redirections (self, set_result=True):
|
||||
def follow_redirections(self, request):
|
||||
"""Follow all redirections of http response."""
|
||||
log.debug(LOG_CHECK, "follow all redirections")
|
||||
redirected = self.url
|
||||
tries = 0
|
||||
while self.response.status in [301, 302] and self.headers and \
|
||||
tries < self.max_redirects:
|
||||
num = self.follow_redirection(set_result, redirected)
|
||||
if num == -1:
|
||||
return num
|
||||
redirected = urlutil.urlunsplit(self.urlparts)
|
||||
tries += num
|
||||
return tries
|
||||
|
||||
def follow_redirection (self, set_result, redirected):
|
||||
"""Follow one redirection of http response."""
|
||||
newurl = self.getheader("Location",
|
||||
self.getheader("Uri", u""))
|
||||
# make new url absolute and unicode
|
||||
newurl = urlparse.urljoin(redirected, unicode_safe(newurl))
|
||||
log.debug(LOG_CHECK, "Redirected to %r", newurl)
|
||||
self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
|
||||
# norm base url - can raise UnicodeError from url.idna_encode()
|
||||
redirected, is_idn = urlbase.url_norm(newurl)
|
||||
log.debug(LOG_CHECK, "Norm redirected to %r", redirected)
|
||||
urlparts = strformat.url_unicode_split(redirected)
|
||||
if not self.check_redirection_scheme(redirected, urlparts, set_result):
|
||||
return -1
|
||||
if not self.check_redirection_newscheme(redirected, urlparts, set_result):
|
||||
return -1
|
||||
if not self.check_redirection_domain(redirected, urlparts,
|
||||
set_result):
|
||||
return -1
|
||||
if not self.check_redirection_robots(redirected, set_result):
|
||||
return -1
|
||||
num = self.check_redirection_recursion(redirected, set_result)
|
||||
if num != 0:
|
||||
return num
|
||||
if set_result:
|
||||
self.check301status()
|
||||
self.close_response()
|
||||
self.close_connection()
|
||||
# remember redirected url as alias
|
||||
self.aliases.append(redirected)
|
||||
if self.anchor:
|
||||
urlparts[4] = self.anchor
|
||||
# note: urlparts has to be a list
|
||||
self.urlparts = urlparts
|
||||
self.build_url_parts()
|
||||
# store cookies from redirect response
|
||||
self.store_cookies()
|
||||
# new response data
|
||||
self._try_http_response()
|
||||
return 1
|
||||
|
||||
def check_redirection_scheme (self, redirected, urlparts, set_result):
|
||||
"""Return True if redirection scheme is ok, else False."""
|
||||
if urlparts[0] in ('ftp', 'http', 'https'):
|
||||
return True
|
||||
# For security reasons do not allow redirects to protocols
|
||||
# other than HTTP, HTTPS or FTP.
|
||||
if set_result:
|
||||
self.add_warning(
|
||||
_("Redirection to url `%(newurl)s' is not allowed.") %
|
||||
{'newurl': redirected})
|
||||
self.set_result(_("syntax OK"))
|
||||
return False
|
||||
|
||||
def check_redirection_domain (self, redirected, urlparts, set_result):
|
||||
"""Return True if redirection domain is ok, else False."""
|
||||
# XXX does not support user:pass@netloc format
|
||||
if urlparts[1] != self.urlparts[1]:
|
||||
# URL domain changed
|
||||
if self.recursion_level == 0 and urlparts[0] in ('http', 'https'):
|
||||
# Add intern patterns for redirection of URLs given by the
|
||||
# user for HTTP schemes.
|
||||
self.add_intern_pattern(url=redirected)
|
||||
return True
|
||||
# check extern filter again
|
||||
self.extern = None
|
||||
self.set_extern(redirected)
|
||||
if self.extern[0] and self.extern[1]:
|
||||
if set_result:
|
||||
self.check301status()
|
||||
self.add_info(_("The redirected URL is outside of the domain "
|
||||
"filter, checked only syntax."))
|
||||
self.set_result(_("filtered"))
|
||||
return False
|
||||
return True
|
||||
|
||||
def check_redirection_robots (self, redirected, set_result):
|
||||
"""Check robots.txt allowance for redirections. Return True if
|
||||
allowed, else False."""
|
||||
if self.allows_robots(redirected):
|
||||
return True
|
||||
if set_result:
|
||||
self.add_warning(
|
||||
_("Access to redirected URL denied by robots.txt, "
|
||||
"checked only syntax."), tag=WARN_HTTP_ROBOTS_DENIED)
|
||||
self.set_result(_("syntax OK"))
|
||||
return False
|
||||
|
||||
def check_redirection_recursion (self, redirected, set_result):
|
||||
"""Check for recursive redirect. Return zero if no recursion
|
||||
detected, max_redirects for recursion with HEAD request,
|
||||
-1 otherwise."""
|
||||
all_seen = [self.cache_url_key] + self.aliases
|
||||
if redirected not in all_seen:
|
||||
return 0
|
||||
if self.method == "HEAD" and self.method_get_allowed:
|
||||
# Microsoft servers tend to recurse HEAD requests
|
||||
# fall back to the original url and use GET
|
||||
return self.max_redirects
|
||||
if set_result:
|
||||
urls = "\n => ".join(all_seen + [redirected])
|
||||
self.set_result(_("recursive redirection encountered:\n %(urls)s") %
|
||||
{"urls": urls}, valid=False)
|
||||
return -1
|
||||
|
||||
def check_redirection_newscheme (self, redirected, urlparts, set_result):
|
||||
"""Check for HTTP(S)/FTP redirection. Return True for
|
||||
redirection with same scheme, else False."""
|
||||
if urlparts[0] != self.urlparts[0]:
|
||||
# changed scheme
|
||||
newobj = get_url_from(
|
||||
redirected, self.recursion_level, self.aggregate,
|
||||
parent_url=self.parent_url, base_ref=self.base_ref,
|
||||
line=self.line, column=self.column, name=self.name)
|
||||
if set_result:
|
||||
self.set_result(_("syntax OK"))
|
||||
# append new object to queue
|
||||
self.aggregate.urlqueue.put(newobj)
|
||||
return False
|
||||
raise LinkCheckerError(_('Cannot redirect to different scheme without result'))
|
||||
return True
|
||||
|
||||
def check301status (self):
|
||||
"""If response page has been permanently moved add a warning."""
|
||||
if self.response.status == 301 and not self.has301status:
|
||||
self.add_warning(_("HTTP 301 (moved permanent) encountered: you"
|
||||
" should update this link."),
|
||||
tag=WARN_HTTP_MOVED_PERMANENT)
|
||||
self.has301status = True
|
||||
kwargs = dict(
|
||||
stream=True,
|
||||
)
|
||||
response = None
|
||||
for response in self.session.resolve_redirects(self.url_connection, request, **kwargs):
|
||||
newurl = response.url
|
||||
log.debug(LOG_CHECK, "Redirected to %r", newurl)
|
||||
self.aliases.append(newurl)
|
||||
self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
|
||||
urlparts = strformat.url_unicode_split(newurl)
|
||||
if response is not None:
|
||||
self.urlparts = urlparts
|
||||
self.build_url_parts()
|
||||
self.url_connection = response
|
||||
self.headers = response.headers
|
||||
self.url = urlutil.urlunsplit(urlparts)
|
||||
self.scheme = urlparts[0].lower()
|
||||
|
||||
def getheader (self, name, default=None):
|
||||
"""Get decoded header value.
|
||||
|
|
@ -471,271 +197,29 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
|
||||
def check_response (self):
|
||||
"""Check final result and log it."""
|
||||
if self.response.status >= 400:
|
||||
self.set_result(u"%r %s" % (self.response.status, self.response.reason),
|
||||
if self.url_connection.status_code >= 400:
|
||||
self.set_result(u"%d %s" % (self.url_connection.status_code, self.url_connection.reason),
|
||||
valid=False)
|
||||
else:
|
||||
if self.response.status == 204:
|
||||
if self.url_connection.status_code == 204:
|
||||
# no content
|
||||
self.add_warning(self.response.reason,
|
||||
self.add_warning(self.url_connection.reason,
|
||||
tag=WARN_HTTP_EMPTY_CONTENT)
|
||||
# store cookies for valid links
|
||||
self.store_cookies()
|
||||
if self.response.status >= 200:
|
||||
self.set_result(u"%r %s" % (self.response.status, self.response.reason))
|
||||
if self.url_connection.status_code >= 200:
|
||||
self.set_result(u"%r %s" % (self.url_connection.status_code, self.url_connection.reason))
|
||||
else:
|
||||
self.set_result(_("OK"))
|
||||
modified = rfc822.parsedate(self.getheader('Last-Modified', u''))
|
||||
if modified:
|
||||
self.modified = datetime.utcfromtimestamp(time.mktime(modified))
|
||||
|
||||
def _try_http_response (self):
|
||||
"""Try to get a HTTP response object. For persistent
|
||||
connections that the server closed unexpected, a new connection
|
||||
will be opened.
|
||||
"""
|
||||
try:
|
||||
self._get_http_response()
|
||||
except socket.error as msg:
|
||||
if msg.args[0] == 32 and self.persistent:
|
||||
# server closed persistent connection - retry
|
||||
log.debug(LOG_CHECK, "Server closed connection: retry")
|
||||
self.persistent = False
|
||||
self._get_http_response()
|
||||
else:
|
||||
raise
|
||||
except httplib.BadStatusLine as msg:
|
||||
if self.persistent:
|
||||
# server closed connection - retry
|
||||
log.debug(LOG_CHECK, "Empty status line: retry")
|
||||
self.persistent = False
|
||||
self._get_http_response()
|
||||
else:
|
||||
raise
|
||||
|
||||
def _get_http_response (self):
|
||||
"""Send HTTP request and get response object."""
|
||||
scheme, host, port = self.get_netloc()
|
||||
log.debug(LOG_CHECK, "Connecting to %r", host)
|
||||
self.get_http_object(scheme, host, port)
|
||||
self.add_connection_request()
|
||||
self.add_connection_headers()
|
||||
self.response = self.url_connection.getresponse(buffering=True)
|
||||
self.headers = self.response.msg
|
||||
self.content_type = None
|
||||
self.persistent = not self.response.will_close
|
||||
if self.persistent and self.method == "HEAD":
|
||||
# Some servers send page content after a HEAD request,
|
||||
# but only after making the *next* request. This breaks
|
||||
# protocol synchronisation. Workaround here is to close
|
||||
# the connection after HEAD.
|
||||
# Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk)
|
||||
self.persistent = False
|
||||
# Note that for POST method the connection should also be closed,
|
||||
# but this method is never used.
|
||||
# If possible, use official W3C HTTP response name
|
||||
if self.response.status in httplib.responses:
|
||||
self.response.reason = httplib.responses[self.response.status]
|
||||
if self.response.reason:
|
||||
self.response.reason = unicode_safe(self.response.reason)
|
||||
log.debug(LOG_CHECK, "Response: %s %s", self.response.status, self.response.reason)
|
||||
|
||||
def add_connection_request(self):
|
||||
"""Add connection request."""
|
||||
# the anchor fragment is not part of a HTTP URL, see
|
||||
# http://tools.ietf.org/html/rfc2616#section-3.2.2
|
||||
anchor = ''
|
||||
if self.proxy:
|
||||
path = urlutil.urlunsplit((self.urlparts[0], self.urlparts[1],
|
||||
self.urlparts[2], self.urlparts[3], anchor))
|
||||
else:
|
||||
path = urlutil.urlunsplit(('', '', self.urlparts[2],
|
||||
self.urlparts[3], anchor))
|
||||
self.url_connection.putrequest(self.method, path, skip_host=True,
|
||||
skip_accept_encoding=True)
|
||||
|
||||
def add_connection_headers(self):
|
||||
"""Add connection header."""
|
||||
# be sure to use the original host as header even for proxies
|
||||
self.url_connection.putheader("Host", self.urlparts[1])
|
||||
if self.auth:
|
||||
# HTTP authorization
|
||||
self.url_connection.putheader("Authorization", self.auth)
|
||||
if self.proxyauth:
|
||||
self.url_connection.putheader("Proxy-Authorization",
|
||||
self.proxyauth)
|
||||
if (self.parent_url and
|
||||
self.parent_url.lower().startswith(HTTP_SCHEMAS)):
|
||||
self.url_connection.putheader("Referer", self.parent_url)
|
||||
self.url_connection.putheader("User-Agent",
|
||||
self.aggregate.config["useragent"])
|
||||
# prefer compressed content
|
||||
self.url_connection.putheader("Accept-Encoding", ACCEPT_ENCODING)
|
||||
# prefer UTF-8 encoding
|
||||
self.url_connection.putheader("Accept-Charset", ACCEPT_CHARSET)
|
||||
# prefer parseable mime types
|
||||
self.url_connection.putheader("Accept", ACCEPT)
|
||||
# send do-not-track header
|
||||
self.url_connection.putheader("DNT", "1")
|
||||
if self.aggregate.config['sendcookies']:
|
||||
self.send_cookies()
|
||||
self.url_connection.endheaders()
|
||||
|
||||
def store_cookies (self):
|
||||
"""Save cookies from response headers."""
|
||||
if self.aggregate.config['storecookies']:
|
||||
for c in self.cookies:
|
||||
self.add_info(_("Sent Cookie: %(cookie)s.") %
|
||||
{"cookie": c.client_header_value()})
|
||||
errors = self.aggregate.cookies.add(self.headers,
|
||||
self.urlparts[0], self.urlparts[1], self.urlparts[2])
|
||||
if errors:
|
||||
self.add_warning(
|
||||
_("Could not store cookies from headers: %(error)s.") %
|
||||
{'error': "\n".join(errors)},
|
||||
tag=WARN_HTTP_COOKIE_STORE_ERROR)
|
||||
|
||||
def send_cookies (self):
|
||||
"""Add cookie headers to request."""
|
||||
scheme = self.urlparts[0]
|
||||
host = self.urlparts[1]
|
||||
port = urlutil.default_ports.get(scheme, 80)
|
||||
host, port = urlutil.splitport(host, port)
|
||||
path = self.urlparts[2] or u"/"
|
||||
self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
|
||||
if not self.cookies:
|
||||
return
|
||||
# add one cookie header with all cookie data
|
||||
# this is limited by maximum header length
|
||||
headername = "Cookie"
|
||||
headervalue = ""
|
||||
max_value_len = headers.MAX_HEADER_BYTES - len(headername) - 2
|
||||
for c in self.cookies:
|
||||
cookievalue = c.client_header_value()
|
||||
if "version" in c.attributes:
|
||||
# add separate header for explicit versioned cookie
|
||||
if headervalue:
|
||||
self.url_connection.putheader(headername, headervalue)
|
||||
self.url_connection.putheader(headername, cookievalue)
|
||||
headervalue = ""
|
||||
continue
|
||||
if headervalue:
|
||||
cookievalue = "; " + cookievalue
|
||||
if (len(headervalue) + len(cookievalue)) < max_value_len:
|
||||
headervalue += cookievalue
|
||||
else:
|
||||
log.debug(LOG_CHECK, "Discard too-long cookie %r", cookievalue)
|
||||
if headervalue:
|
||||
log.debug(LOG_CHECK, "Sending cookie header %s:%s", headername, headervalue)
|
||||
self.url_connection.putheader(headername, headervalue)
|
||||
|
||||
def get_http_object (self, scheme, host, port):
|
||||
"""
|
||||
Open a HTTP connection.
|
||||
|
||||
@param host: the host to connect to
|
||||
@ptype host: string of the form <host>[:<port>]
|
||||
@param scheme: 'http' or 'https'
|
||||
@ptype scheme: string
|
||||
@return: None
|
||||
"""
|
||||
self.close_connection()
|
||||
def create_connection(scheme, host, port):
|
||||
"""Create a new http or https connection."""
|
||||
kwargs = dict(port=port, strict=True, timeout=self.aggregate.config["timeout"])
|
||||
if scheme == "http":
|
||||
h = httplib.HTTPConnection(host, **kwargs)
|
||||
elif scheme == "https" and supportHttps:
|
||||
devel_dir = os.path.join(configuration.configdata.install_data, "config")
|
||||
sslverify = self.aggregate.config["sslverify"]
|
||||
if sslverify:
|
||||
if sslverify is not True:
|
||||
kwargs["ca_certs"] = sslverify
|
||||
else:
|
||||
kwargs["ca_certs"] = configuration.get_share_file(devel_dir, 'ca-certificates.crt')
|
||||
h = httplib.HTTPSConnection(host, **kwargs)
|
||||
else:
|
||||
msg = _("Unsupported HTTP url scheme `%(scheme)s'") % {"scheme": scheme}
|
||||
raise LinkCheckerError(msg)
|
||||
if log.is_debug(LOG_CHECK):
|
||||
h.set_debuglevel(1)
|
||||
return h
|
||||
self.get_pooled_connection(scheme, host, port, create_connection)
|
||||
self.url_connection.connect()
|
||||
|
||||
def read_content (self):
|
||||
"""Get content of the URL target. The content data is cached after
|
||||
the first call to this method.
|
||||
|
||||
@return: URL content, decompressed and decoded
|
||||
@rtype: string
|
||||
"""
|
||||
assert self.method_get_allowed, 'unallowed content read'
|
||||
if self.method != "GET" or self.response is None:
|
||||
self.method = "GET"
|
||||
self._try_http_response()
|
||||
num = self.follow_redirections(set_result=False)
|
||||
if not (0 <= num <= self.max_redirects):
|
||||
raise LinkCheckerError(_("Redirection error"))
|
||||
# Re-read size info, since the GET request result could be different
|
||||
# than a former HEAD request.
|
||||
self.add_size_info()
|
||||
if self.size > self.MaxFilesizeBytes:
|
||||
raise LinkCheckerError(_("File size too large"))
|
||||
self.charset = headers.get_charset(self.headers)
|
||||
return self._read_content()
|
||||
|
||||
def _read_content (self):
|
||||
"""Read URL contents."""
|
||||
data = self.response.read(self.MaxFilesizeBytes+1)
|
||||
if len(data) > self.MaxFilesizeBytes:
|
||||
raise LinkCheckerError(_("File size too large"))
|
||||
dlsize = len(data)
|
||||
self.aggregate.add_download_data(self.cache_content_key, data)
|
||||
encoding = headers.get_content_encoding(self.headers)
|
||||
if encoding in SUPPORTED_ENCODINGS:
|
||||
try:
|
||||
if encoding == 'deflate':
|
||||
f = StringIO(zlib.decompress(data))
|
||||
else:
|
||||
f = gzip.GzipFile('', 'rb', 9, StringIO(data))
|
||||
except zlib.error as msg:
|
||||
log.debug(LOG_CHECK, "Error %s data of len %d", encoding, len(data))
|
||||
self.add_warning(_("Decompress error %(err)s") %
|
||||
{"err": str(msg)},
|
||||
tag=WARN_HTTP_DECOMPRESS_ERROR)
|
||||
f = StringIO(data)
|
||||
try:
|
||||
data = f.read()
|
||||
finally:
|
||||
f.close()
|
||||
return data, dlsize
|
||||
|
||||
def encoding_supported (self):
|
||||
"""Check if page encoding is supported."""
|
||||
encoding = headers.get_content_encoding(self.headers)
|
||||
if encoding and encoding not in SUPPORTED_ENCODINGS and \
|
||||
encoding != 'identity':
|
||||
self.add_warning(_("Unsupported content encoding `%(encoding)s'.") %
|
||||
{"encoding": encoding},
|
||||
tag=WARN_HTTP_UNSUPPORTED_ENCODING)
|
||||
return False
|
||||
return True
|
||||
|
||||
def can_get_content(self):
|
||||
"""Check if it's allowed to read content."""
|
||||
return self.method_get_allowed
|
||||
|
||||
def content_allows_robots (self):
|
||||
"""Check if it's allowed to read content before execution."""
|
||||
if not self.method_get_allowed:
|
||||
return False
|
||||
return super(HttpUrl, self).content_allows_robots()
|
||||
|
||||
def check_warningregex (self):
|
||||
"""Check if it's allowed to read content before execution."""
|
||||
if self.method_get_allowed:
|
||||
super(HttpUrl, self).check_warningregex()
|
||||
def read_content(self):
|
||||
"""Return data and data size for this URL.
|
||||
Can be overridden in subclasses."""
|
||||
maxbytes = self.aggregate.config["maxfilesizedownload"]
|
||||
buf = StringIO()
|
||||
for data in self.url_connection.iter_content(chunk_size=self.ReadChunkBytes):
|
||||
if buf.tell() + len(data) > maxbytes:
|
||||
raise LinkCheckerError(_("File size too large"))
|
||||
buf.write(data)
|
||||
return buf.getvalue()
|
||||
|
||||
def is_html (self):
|
||||
"""
|
||||
|
|
@ -748,22 +232,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
if not self.valid:
|
||||
return False
|
||||
mime = self.get_content_type()
|
||||
if self.ContentMimetypes.get(mime) != "html":
|
||||
return False
|
||||
if self.headers:
|
||||
return self.encoding_supported()
|
||||
return True
|
||||
return self.ContentMimetypes.get(mime) == "html"
|
||||
|
||||
def is_css (self):
|
||||
"""Return True iff content of this url is CSS stylesheet."""
|
||||
if not self.valid:
|
||||
return False
|
||||
mime = self.get_content_type()
|
||||
if self.ContentMimetypes.get(mime) != "css":
|
||||
return False
|
||||
if self.headers:
|
||||
return self.encoding_supported()
|
||||
return True
|
||||
return self.ContentMimetypes.get(mime) == "css"
|
||||
|
||||
def is_http (self):
|
||||
"""
|
||||
|
|
@ -781,30 +257,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
@return: True if content is parseable
|
||||
@rtype: bool
|
||||
"""
|
||||
if not (self.valid and self.headers):
|
||||
if not self.valid:
|
||||
return False
|
||||
ctype = self.get_content_type()
|
||||
if ctype not in self.ContentMimetypes:
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
|
||||
return False
|
||||
return self.encoding_supported()
|
||||
|
||||
def parse_url (self):
|
||||
"""
|
||||
Parse file contents for new links to check.
|
||||
"""
|
||||
ctype = self.get_content_type()
|
||||
if self.is_html():
|
||||
self.parse_html()
|
||||
elif self.is_css():
|
||||
self.parse_css()
|
||||
elif ctype == "application/x-shockwave-flash":
|
||||
self.parse_swf()
|
||||
elif ctype == "application/msword":
|
||||
self.parse_word()
|
||||
elif ctype == "text/vnd.wap.wml":
|
||||
self.parse_wml()
|
||||
self.add_num_url_info()
|
||||
return True
|
||||
|
||||
def get_robots_txt_url (self):
|
||||
"""
|
||||
|
|
@ -814,28 +273,3 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
|
|||
@rtype: string
|
||||
"""
|
||||
return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])
|
||||
|
||||
def close_response(self):
|
||||
"""Close the HTTP response object."""
|
||||
if self.response is None:
|
||||
return
|
||||
self.response.close()
|
||||
self.response = None
|
||||
|
||||
def close_connection (self):
|
||||
"""Release the connection from the connection pool. Persistent
|
||||
connections will not be closed.
|
||||
"""
|
||||
log.debug(LOG_CHECK, "Closing %s", self.url_connection)
|
||||
if self.url_connection is None:
|
||||
# no connection is open
|
||||
return
|
||||
# add to cached connections
|
||||
scheme, host, port = self.get_netloc()
|
||||
if self.persistent and self.url_connection.is_idle():
|
||||
expiration = time.time() + headers.http_keepalive(self.headers)
|
||||
else:
|
||||
self.close_response()
|
||||
expiration = None
|
||||
self.aggregate.connections.release(scheme, host, port, self.url_connection, expiration=expiration)
|
||||
self.url_connection = None
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2005-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2005-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -21,16 +21,13 @@ Handle for mailto: links.
|
|||
import re
|
||||
import urllib
|
||||
import urlparse
|
||||
import smtplib
|
||||
import socket
|
||||
from email._parseaddr import AddressList
|
||||
|
||||
from . import urlbase
|
||||
from .. import log, LOG_CHECK, strformat, url as urlutil
|
||||
from dns import resolver
|
||||
from ..network import iputil
|
||||
from .const import WARN_MAIL_NO_MX_HOST, \
|
||||
WARN_MAIL_UNVERIFIED_ADDRESS, WARN_MAIL_NO_CONNECTION
|
||||
from .const import WARN_MAIL_NO_MX_HOST
|
||||
|
||||
|
||||
def getaddresses (addr):
|
||||
|
|
@ -287,78 +284,9 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
# debug output
|
||||
log.debug(LOG_CHECK, "found %d MX mailhosts:", len(answers))
|
||||
for preference, host in mxdata:
|
||||
log.debug(LOG_CHECK,
|
||||
"MX host %r, preference %d", host, preference)
|
||||
# connect
|
||||
self.check_smtp_connect(mxdata, username, domain)
|
||||
|
||||
def check_smtp_connect (self, mxdata, username, domain):
|
||||
"""
|
||||
Connect to SMTP servers and check emails.
|
||||
|
||||
@param mxdata: list of (preference, host) tuples to check for
|
||||
@type mxdata: list
|
||||
@param username: the username to verify
|
||||
@type username: string
|
||||
"""
|
||||
smtpconnect = 0
|
||||
for preference, host in mxdata:
|
||||
try:
|
||||
log.debug(LOG_CHECK,
|
||||
"SMTP check for %r (preference %d)", host, preference)
|
||||
self.url_connection = smtplib.SMTP(timeout=self.aggregate.config["timeout"])
|
||||
if log.is_debug(LOG_CHECK):
|
||||
self.url_connection.set_debuglevel(1)
|
||||
self.url_connection.connect(host)
|
||||
log.debug(LOG_CHECK, "SMTP connected!")
|
||||
smtpconnect = 1
|
||||
self.url_connection.helo()
|
||||
mailaddress = "%s@%s" % (username, domain)
|
||||
status, info = self.url_connection.verify(mailaddress)
|
||||
log.debug(LOG_CHECK, "SMTP info %d %r", status, info)
|
||||
d = {
|
||||
'info': "%d %s" % (status, str(info)),
|
||||
'mail': mailaddress,
|
||||
}
|
||||
if status == 250:
|
||||
self.add_info(_("Verified address %(mail)s: %(info)s.") % d)
|
||||
# check for 25x status code which means that the address
|
||||
# could not be verified, but is sent anyway
|
||||
elif 250 < status < 260:
|
||||
self.add_info(_("Unverified but presumably valid"
|
||||
" address %(mail)s: %(info)s.") % d)
|
||||
else:
|
||||
self.add_warning(_("Unverified address: %(info)s.") % d,
|
||||
tag=WARN_MAIL_UNVERIFIED_ADDRESS)
|
||||
except smtplib.SMTPException as msg:
|
||||
self.add_warning(
|
||||
_("MX mail host %(host)s did not accept connections: "
|
||||
"%(error)s.") % {'host': host, 'error': str(msg)},
|
||||
tag=WARN_MAIL_NO_CONNECTION)
|
||||
if smtpconnect:
|
||||
break
|
||||
if not smtpconnect:
|
||||
self.set_result(_("Could not connect, but syntax is correct"),
|
||||
overwrite=True)
|
||||
else:
|
||||
self.set_result(_("Found MX mail host %(host)s") % {'host': host},
|
||||
overwrite=True)
|
||||
|
||||
def close_connection (self):
|
||||
"""
|
||||
Close a possibly opened SMTP connection.
|
||||
"""
|
||||
if self.url_connection is None:
|
||||
# no connection is open
|
||||
return
|
||||
connection = self.url_connection
|
||||
self.url_connection = None
|
||||
try:
|
||||
connection.quit()
|
||||
except (smtplib.SMTPException, socket.error):
|
||||
# ignore close errors
|
||||
# socket.error is raised for example on timeouts
|
||||
log.debug(LOG_CHECK, "MX host %r, preference %d", host, preference)
|
||||
pass
|
||||
self.set_result(_("Valid mail address syntax"))
|
||||
|
||||
def set_cache_keys (self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,40 +0,0 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Mixin class for URLs that pool connections.
|
||||
"""
|
||||
|
||||
|
||||
class PooledConnection (object):
|
||||
"""Support for connection pooling."""
|
||||
|
||||
def get_pooled_connection(self, scheme, host, port, create_connection):
|
||||
"""Get a connection from the connection pool."""
|
||||
get_connection = self.aggregate.connections.get
|
||||
while True:
|
||||
connection = get_connection(scheme, host, port, create_connection)
|
||||
if hasattr(connection, 'acquire'):
|
||||
# It's a connection lock object.
|
||||
# This little trick avoids polling: wait for another
|
||||
# connection to be released by acquiring the lock.
|
||||
connection.acquire()
|
||||
# The lock is immediately released since the calling
|
||||
# connections.get() acquires it again.
|
||||
connection.release()
|
||||
else:
|
||||
self.url_connection = connection
|
||||
break
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -20,7 +20,6 @@ Handle uncheckable URLs.
|
|||
|
||||
import re
|
||||
from . import urlbase
|
||||
from .const import WARN_IGNORE_URL
|
||||
|
||||
# from http://www.iana.org/assignments/uri-schemes.html
|
||||
ignored_schemes_permanent = r"""
|
||||
|
|
@ -124,7 +123,7 @@ ignored_schemes_other = r"""
|
|||
"""
|
||||
|
||||
|
||||
ignored_schemes = "^(%s%s%s%s):" % (
|
||||
ignored_schemes = "^(%s%s%s%s)$" % (
|
||||
ignored_schemes_permanent,
|
||||
ignored_schemes_provisional,
|
||||
ignored_schemes_historical,
|
||||
|
|
@ -132,7 +131,7 @@ ignored_schemes = "^(%s%s%s%s):" % (
|
|||
)
|
||||
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
|
||||
|
||||
is_unknown_url = ignored_schemes_re.search
|
||||
is_unknown_scheme = ignored_schemes_re.match
|
||||
|
||||
|
||||
class UnknownUrl (urlbase.UrlBase):
|
||||
|
|
@ -140,19 +139,16 @@ class UnknownUrl (urlbase.UrlBase):
|
|||
|
||||
def local_check (self):
|
||||
"""Only logs that this URL is unknown."""
|
||||
if self.extern[0] and self.extern[1]:
|
||||
self.add_info(_("Outside of domain filter, checked only syntax."))
|
||||
elif self.ignored():
|
||||
self.add_warning(_("%(scheme)s URL ignored.") %
|
||||
{"scheme": self.scheme.capitalize()},
|
||||
tag=WARN_IGNORE_URL)
|
||||
if self.ignored():
|
||||
self.add_info(_("%(scheme)s URL ignored.") %
|
||||
{"scheme": self.scheme.capitalize()})
|
||||
else:
|
||||
self.set_result(_("URL is unrecognized or has invalid syntax"),
|
||||
valid=False)
|
||||
|
||||
def ignored (self):
|
||||
"""Return True if this URL scheme is ignored."""
|
||||
return ignored_schemes_re.search(self.url)
|
||||
return is_unknown_scheme(self.scheme)
|
||||
|
||||
def can_get_content (self):
|
||||
"""Unknown URLs have no content.
|
||||
|
|
|
|||
|
|
@ -26,21 +26,19 @@ import time
|
|||
import errno
|
||||
import socket
|
||||
import select
|
||||
from cStringIO import StringIO
|
||||
|
||||
from . import absolute_url, get_url_from
|
||||
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, clamav, winutil, geoip,
|
||||
fileutil, get_link_pat)
|
||||
from .. import (log, LOG_CHECK, LOG_CACHE,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat, parser)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import linkparse
|
||||
from ..network import iputil
|
||||
from .const import (WARN_URL_EFFECTIVE_URL,
|
||||
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
|
||||
WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND,
|
||||
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
|
||||
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
|
||||
WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE,
|
||||
WARN_URL_WHITESPACE,
|
||||
WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH,
|
||||
WARN_SYNTAX_HTML, WARN_SYNTAX_CSS,
|
||||
ExcList, ExcSyntaxList, ExcNoCacheList)
|
||||
|
||||
# helper alias
|
||||
|
|
@ -71,17 +69,6 @@ def url_norm (url, encoding=None):
|
|||
raise LinkCheckerError(msg)
|
||||
|
||||
|
||||
def getXmlText (parent, tag):
|
||||
"""Return XML content of given tag in parent element."""
|
||||
elem = parent.getElementsByTagName(tag)[0]
|
||||
# Yes, the DOM standard is awful.
|
||||
rc = []
|
||||
for node in elem.childNodes:
|
||||
if node.nodeType == node.TEXT_NODE:
|
||||
rc.append(node.data)
|
||||
return ''.join(rc)
|
||||
|
||||
|
||||
class UrlBase (object):
|
||||
"""An URL with additional information like validity etc."""
|
||||
|
||||
|
|
@ -103,8 +90,8 @@ class UrlBase (object):
|
|||
"text/vnd.wap.wml": "wml",
|
||||
}
|
||||
|
||||
# Set maximum file size for downloaded files in bytes.
|
||||
MaxFilesizeBytes = 1024*1024*5
|
||||
# Read in 16kb chunks
|
||||
ReadChunkBytes = 1024*16
|
||||
|
||||
def __init__ (self, base_url, recursion_level, aggregate,
|
||||
parent_url=None, base_ref=None, line=-1, column=-1,
|
||||
|
|
@ -173,8 +160,6 @@ class UrlBase (object):
|
|||
self.urlparts = None
|
||||
# the scheme, host, port and anchor part of url
|
||||
self.scheme = self.host = self.port = self.anchor = None
|
||||
# list of parsed anchors
|
||||
self.anchors = []
|
||||
# the result message string and flag
|
||||
self.result = u""
|
||||
self.has_result = False
|
||||
|
|
@ -190,8 +175,6 @@ class UrlBase (object):
|
|||
self.modified = None
|
||||
# download time
|
||||
self.dltime = -1
|
||||
# download size
|
||||
self.dlsize = -1
|
||||
# check time
|
||||
self.checktime = 0
|
||||
# connection object
|
||||
|
|
@ -211,8 +194,6 @@ class UrlBase (object):
|
|||
self.do_check_content = True
|
||||
# MIME content type
|
||||
self.content_type = None
|
||||
# number of URLs in page content
|
||||
self.num_urls = 0
|
||||
|
||||
def set_result (self, msg, valid=True, overwrite=False):
|
||||
"""
|
||||
|
|
@ -229,6 +210,8 @@ class UrlBase (object):
|
|||
log.warn(LOG_CHECK, "Empty result for %s", self)
|
||||
self.result = msg
|
||||
self.valid = valid
|
||||
# free content data
|
||||
self.data = None
|
||||
|
||||
def get_title (self):
|
||||
"""Return title of page the URL refers to.
|
||||
|
|
@ -246,30 +229,6 @@ class UrlBase (object):
|
|||
self.title = title
|
||||
return self.title
|
||||
|
||||
def set_title_from_content (self):
|
||||
"""Set title of page the URL refers to.from page content."""
|
||||
if not self.valid:
|
||||
return
|
||||
try:
|
||||
handler = linkparse.TitleFinder()
|
||||
except tuple(ExcList):
|
||||
return
|
||||
parser = htmlsax.parser(handler)
|
||||
handler.parser = parser
|
||||
if self.charset:
|
||||
parser.encoding = self.charset
|
||||
# parse
|
||||
try:
|
||||
parser.feed(self.get_content())
|
||||
parser.flush()
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
# break cyclic dependencies
|
||||
handler.parser = None
|
||||
parser.handler = None
|
||||
if handler.title:
|
||||
self.title = handler.title
|
||||
|
||||
def is_parseable (self):
|
||||
"""
|
||||
Return True iff content of this url is parseable.
|
||||
|
|
@ -287,15 +246,15 @@ class UrlBase (object):
|
|||
return False
|
||||
|
||||
def is_http (self):
|
||||
"""
|
||||
Return True for http:// URLs.
|
||||
"""
|
||||
"""Return True for http:// URLs."""
|
||||
return False
|
||||
|
||||
def is_file (self):
|
||||
"""
|
||||
Return True for file:// URLs.
|
||||
"""
|
||||
"""Return True for file:// URLs."""
|
||||
return False
|
||||
|
||||
def is_directory(self):
|
||||
"""Return True if current URL represents a directory."""
|
||||
return False
|
||||
|
||||
def is_local(self):
|
||||
|
|
@ -318,45 +277,6 @@ class UrlBase (object):
|
|||
if s not in self.info:
|
||||
self.info.append(s)
|
||||
|
||||
def copy_from_cache (self, cache_data):
|
||||
"""
|
||||
Fill attributes from cache data.
|
||||
"""
|
||||
self.url = cache_data["url"]
|
||||
self.result = cache_data["result"]
|
||||
self.has_result = True
|
||||
anchor_changed = (self.anchor != cache_data["anchor"])
|
||||
for tag, msg in cache_data["warnings"]:
|
||||
# do not copy anchor warnings, since the current anchor
|
||||
# might have changed
|
||||
if anchor_changed and tag == WARN_URL_ANCHOR_NOT_FOUND:
|
||||
continue
|
||||
self.add_warning(msg, tag=tag)
|
||||
for info in cache_data["info"]:
|
||||
self.add_info(info)
|
||||
self.valid = cache_data["valid"]
|
||||
self.dltime = cache_data["dltime"]
|
||||
self.dlsize = cache_data["dlsize"]
|
||||
self.anchors = cache_data["anchors"]
|
||||
self.content_type = cache_data["content_type"]
|
||||
if anchor_changed and self.valid:
|
||||
# recheck anchor
|
||||
self.check_anchor()
|
||||
|
||||
def get_cache_data (self):
|
||||
"""Return all data values that should be put in the cache."""
|
||||
return {"url": self.url,
|
||||
"result": self.result,
|
||||
"warnings": self.warnings,
|
||||
"info": self.info,
|
||||
"valid": self.valid,
|
||||
"dltime": self.dltime,
|
||||
"dlsize": self.dlsize,
|
||||
"anchors": self.anchors,
|
||||
"anchor": self.anchor,
|
||||
"content_type": self.get_content_type(),
|
||||
}
|
||||
|
||||
def set_cache_keys (self):
|
||||
"""
|
||||
Set keys for URL checking and content recursion.
|
||||
|
|
@ -367,11 +287,7 @@ class UrlBase (object):
|
|||
assert isinstance(self.cache_content_key, unicode), self
|
||||
log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key)
|
||||
# construct cache key
|
||||
if self.aggregate.config["anchors"]:
|
||||
# add anchor to cache key
|
||||
self.cache_url_key = urlutil.urlunsplit(self.urlparts[:4]+[self.anchor or u""])
|
||||
else:
|
||||
self.cache_url_key = self.cache_content_key
|
||||
self.cache_url_key = self.cache_content_key
|
||||
assert isinstance(self.cache_url_key, unicode), self
|
||||
log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key)
|
||||
|
||||
|
|
@ -442,9 +358,9 @@ class UrlBase (object):
|
|||
self.url = urlutil.urlunsplit(urlparts)
|
||||
# split into (modifiable) list
|
||||
self.urlparts = strformat.url_unicode_split(self.url)
|
||||
self.build_url_parts()
|
||||
# and unsplit again
|
||||
self.url = urlutil.urlunsplit(self.urlparts)
|
||||
self.build_url_parts()
|
||||
|
||||
def build_url_parts (self):
|
||||
"""Set userinfo, host, port and anchor from self.urlparts.
|
||||
|
|
@ -452,22 +368,28 @@ class UrlBase (object):
|
|||
"""
|
||||
# check userinfo@host:port syntax
|
||||
self.userinfo, host = urllib.splituser(self.urlparts[1])
|
||||
# set host lowercase
|
||||
if self.userinfo:
|
||||
self.urlparts[1] = "%s@%s" % (self.userinfo, host.lower())
|
||||
else:
|
||||
self.urlparts[1] = host.lower()
|
||||
# safe anchor for later checking
|
||||
self.anchor = self.urlparts[4]
|
||||
port = urlutil.default_ports.get(self.scheme, 0)
|
||||
self.host, self.port = urlutil.splitport(host, port=port)
|
||||
if self.port is None:
|
||||
host, port = urlutil.splitport(host, port=port)
|
||||
if port is None:
|
||||
raise LinkCheckerError(_("URL host %(host)r has invalid port") %
|
||||
{"host": host})
|
||||
self.port = port
|
||||
# set host lowercase
|
||||
self.host = host.lower()
|
||||
if self.scheme in scheme_requires_host:
|
||||
if not self.host:
|
||||
raise LinkCheckerError(_("URL has empty hostname"))
|
||||
self.check_obfuscated_ip()
|
||||
if not self.port or self.port == urlutil.default_ports.get(self.scheme):
|
||||
host = self.host
|
||||
else:
|
||||
host = "%s:%d" % (self.host, self.port)
|
||||
if self.userinfo:
|
||||
self.urlparts[1] = "%s@%s" % (self.userinfo, host)
|
||||
else:
|
||||
self.urlparts[1] = host
|
||||
# safe anchor for later checking
|
||||
self.anchor = self.urlparts[4]
|
||||
|
||||
def check_obfuscated_ip (self):
|
||||
"""Warn if host of this URL is obfuscated IP address."""
|
||||
|
|
@ -476,9 +398,10 @@ class UrlBase (object):
|
|||
if iputil.is_obfuscated_ip(self.host):
|
||||
ips = iputil.resolve_host(self.host)
|
||||
if ips:
|
||||
self.host = ips[0]
|
||||
self.add_warning(
|
||||
_("URL %(url)s has obfuscated IP address %(ip)s") % \
|
||||
{"url": self.base_url, "ip": ips.pop()},
|
||||
{"url": self.base_url, "ip": ips[0]},
|
||||
tag=WARN_URL_OBFUSCATED_IP)
|
||||
|
||||
def check (self):
|
||||
|
|
@ -499,19 +422,6 @@ class UrlBase (object):
|
|||
# close/release possible open connection
|
||||
self.close_connection()
|
||||
|
||||
def add_country_info (self):
|
||||
"""Try to ask GeoIP database for country info."""
|
||||
if self.host:
|
||||
country = geoip.get_country(self.host)
|
||||
if country:
|
||||
self.add_info(_("URL is located in %(country)s.") %
|
||||
{"country": _(country)})
|
||||
|
||||
def add_size_info (self):
|
||||
"""Store size of URL content from meta info into self.size.
|
||||
Must be implemented in subclasses."""
|
||||
pass
|
||||
|
||||
def local_check (self):
|
||||
"""Local check function can be overridden in subclasses."""
|
||||
log.debug(LOG_CHECK, "Checking %s", self)
|
||||
|
|
@ -524,35 +434,28 @@ class UrlBase (object):
|
|||
try:
|
||||
self.check_connection()
|
||||
self.add_size_info()
|
||||
self.add_country_info()
|
||||
self.aggregate.plugin_manager.run_connection_plugins(self)
|
||||
except tuple(ExcList) as exc:
|
||||
value = self.handle_exception()
|
||||
# make nicer error msg for unknown hosts
|
||||
if isinstance(exc, socket.error) and exc.args[0] == -2:
|
||||
value = _('Hostname not found')
|
||||
# make nicer error msg for bad status line
|
||||
elif isinstance(exc, httplib.BadStatusLine):
|
||||
value = _('Bad HTTP response %(line)r') % {"line": str(value)}
|
||||
elif isinstance(exc, UnicodeError):
|
||||
# idna.encode(host) failed
|
||||
value = _('Bad hostname %(host)r: %(msg)s') % {'host': self.host, 'msg': str(value)}
|
||||
self.set_result(unicode_safe(value), valid=False)
|
||||
self.checktime = time.time() - check_start
|
||||
if self.do_check_content:
|
||||
# check content and recursion
|
||||
try:
|
||||
self.check_content()
|
||||
if self.valid and self.can_get_content():
|
||||
self.aggregate.plugin_manager.run_content_plugins(self)
|
||||
if self.allows_recursion():
|
||||
self.parse_url()
|
||||
# check content size
|
||||
self.check_size()
|
||||
parser.parse_url(self)
|
||||
except tuple(ExcList):
|
||||
value = self.handle_exception()
|
||||
# make nicer error msg for bad status line
|
||||
if isinstance(value, httplib.BadStatusLine):
|
||||
value = _('Bad HTTP response %(line)r') % {"line": str(value)}
|
||||
self.add_warning(_("could not get content: %(msg)s") %
|
||||
{"msg": str(value)}, tag=WARN_URL_ERROR_GETTING_CONTENT)
|
||||
self.checktime = time.time() - check_start
|
||||
|
||||
def close_connection (self):
|
||||
"""
|
||||
|
|
@ -595,6 +498,17 @@ class UrlBase (object):
|
|||
"""
|
||||
self.url_connection = urllib2.urlopen(self.url)
|
||||
|
||||
def add_size_info (self):
|
||||
"""Set size of URL content (if any)..
|
||||
Should be overridden in subclasses."""
|
||||
maxbytes = self.aggregate.config["maxfilesizedownload"]
|
||||
if self.size > maxbytes:
|
||||
self.add_warning(
|
||||
_("Content size %(size)s is larger than %(maxbytes)s.") %
|
||||
dict(size=strformat.strsize(self.size),
|
||||
maxbytes=strformat.strsize(maxbytes)),
|
||||
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
|
||||
|
||||
def allows_recursion (self):
|
||||
"""
|
||||
Return True iff we can recurse into the url's content.
|
||||
|
|
@ -617,6 +531,9 @@ class UrlBase (object):
|
|||
if self.extern[0]:
|
||||
log.debug(LOG_CHECK, "... no, extern.")
|
||||
return False
|
||||
if self.size > self.aggregate.config["maxfilesizeparse"]:
|
||||
log.debug(LOG_CHECK, "... no, maximum parse size.")
|
||||
return False
|
||||
if not self.content_allows_robots():
|
||||
log.debug(LOG_CHECK, "... no, robots.")
|
||||
return False
|
||||
|
|
@ -628,6 +545,7 @@ class UrlBase (object):
|
|||
Return False if the content of this URL forbids robots to
|
||||
search for recursive links.
|
||||
"""
|
||||
# XXX cleanup
|
||||
if not self.is_html():
|
||||
return True
|
||||
if not (self.is_http() or self.is_file()):
|
||||
|
|
@ -644,63 +562,12 @@ class UrlBase (object):
|
|||
parser.flush()
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
# break cyclic dependencies
|
||||
handler.parser = None
|
||||
parser.handler = None
|
||||
return handler.follow
|
||||
|
||||
def get_anchors (self):
|
||||
"""Store anchors for this URL. Precondition: this URL is
|
||||
an HTML resource."""
|
||||
log.debug(LOG_CHECK, "Getting HTML anchors %s", self)
|
||||
self.find_links(self.add_anchor, tags=linkparse.AnchorTags)
|
||||
|
||||
def find_links (self, callback, tags=None):
|
||||
"""Parse into content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
# construct parser object
|
||||
handler = linkparse.LinkFinder(callback, tags=tags)
|
||||
parser = htmlsax.parser(handler)
|
||||
if self.charset:
|
||||
parser.encoding = self.charset
|
||||
handler.parser = parser
|
||||
# parse
|
||||
try:
|
||||
parser.feed(self.get_content())
|
||||
parser.flush()
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
# break cyclic dependencies
|
||||
handler.parser = None
|
||||
parser.handler = None
|
||||
|
||||
def add_anchor (self, url, line, column, name, base):
|
||||
"""Add anchor URL."""
|
||||
self.anchors.append((url, line, column, name, base))
|
||||
|
||||
def check_anchor (self):
|
||||
"""If URL is valid, parseable and has an anchor, check it.
|
||||
A warning is logged and True is returned if the anchor is not found.
|
||||
"""
|
||||
if not (self.anchor and self.aggregate.config["anchors"] and
|
||||
self.valid and self.is_html()):
|
||||
return
|
||||
log.debug(LOG_CHECK, "checking anchor %r in %s", self.anchor, self.anchors)
|
||||
enc = lambda anchor: urlutil.url_quote_part(anchor, encoding=self.encoding)
|
||||
if any(x for x in self.anchors if enc(x[0]) == self.anchor):
|
||||
return
|
||||
if self.anchors:
|
||||
anchornames = sorted(set(u"`%s'" % x[0] for x in self.anchors))
|
||||
anchors = u", ".join(anchornames)
|
||||
else:
|
||||
anchors = u"-"
|
||||
args = {"name": self.anchor, "anchors": anchors}
|
||||
msg = u"%s %s" % (_("Anchor `%(name)s' not found.") % args,
|
||||
_("Available anchors: %(anchors)s.") % args)
|
||||
self.add_warning(msg, tag=WARN_URL_ANCHOR_NOT_FOUND)
|
||||
return True
|
||||
|
||||
def set_extern (self, url):
|
||||
"""
|
||||
Match URL against extern and intern link patterns. If no pattern
|
||||
|
|
@ -728,9 +595,15 @@ class UrlBase (object):
|
|||
log.debug(LOG_CHECK, "Intern URL %r", url)
|
||||
self.extern = (0, 0)
|
||||
return
|
||||
log.debug(LOG_CHECK, "Explicit extern URL %r", url)
|
||||
self.extern = (1, 0)
|
||||
return
|
||||
if self.aggregate.config['checkextern']:
|
||||
self.extern = (1, 0)
|
||||
else:
|
||||
self.extern = (1, 1)
|
||||
if self.extern[0] and self.extern[1]:
|
||||
self.add_info(_("The URL is outside of the domain "
|
||||
"filter, checked only syntax."))
|
||||
if not self.has_result:
|
||||
self.set_result(_("filtered"))
|
||||
|
||||
def get_content_type (self):
|
||||
"""Return content MIME type or empty string.
|
||||
|
|
@ -741,188 +614,35 @@ class UrlBase (object):
|
|||
|
||||
def can_get_content (self):
|
||||
"""Indicate wether url get_content() can be called."""
|
||||
return True
|
||||
return self.size <= self.aggregate.config["maxfilesizedownload"]
|
||||
|
||||
def get_content (self):
|
||||
"""Precondition: url_connection is an opened URL."""
|
||||
if self.data is None:
|
||||
log.debug(LOG_CHECK, "Get content of %r", self.url)
|
||||
t = time.time()
|
||||
self.data, self.dlsize = self.read_content()
|
||||
self.data = self.read_content()
|
||||
self.size = len(self.data)
|
||||
self.dltime = time.time() - t
|
||||
if self.size == 0:
|
||||
self.add_warning(_("Content size is zero."),
|
||||
tag=WARN_URL_CONTENT_SIZE_ZERO)
|
||||
return self.data
|
||||
|
||||
def read_content (self):
|
||||
"""Return data and data size for this URL.
|
||||
Can be overridden in subclasses."""
|
||||
if self.size > self.MaxFilesizeBytes:
|
||||
raise LinkCheckerError(_("File size too large"))
|
||||
data = self.url_connection.read(self.MaxFilesizeBytes+1)
|
||||
if len(data) > self.MaxFilesizeBytes:
|
||||
raise LinkCheckerError(_("File size too large"))
|
||||
if not self.is_local():
|
||||
self.aggregate.add_download_data(self.cache_content_key, data)
|
||||
return data, len(data)
|
||||
def read_content(self):
|
||||
"""Return data for this URL. Can be overridden in subclasses."""
|
||||
buf = StringIO()
|
||||
data = self.read_content_chunk()
|
||||
while data:
|
||||
if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]:
|
||||
raise LinkCheckerError(_("File size too large"))
|
||||
buf.write(data)
|
||||
data = self.read_content_chunk()
|
||||
return buf.getvalue()
|
||||
|
||||
def check_content (self):
|
||||
"""Check content data for warnings, syntax errors, viruses etc."""
|
||||
if not (self.valid and self.can_get_content()):
|
||||
return
|
||||
if self.is_html():
|
||||
self.set_title_from_content()
|
||||
if self.aggregate.config["anchors"]:
|
||||
self.get_anchors()
|
||||
self.check_anchor()
|
||||
self.check_warningregex()
|
||||
# is it an intern URL?
|
||||
if not self.extern[0]:
|
||||
# check HTML/CSS syntax
|
||||
if self.aggregate.config["checkhtml"] and self.is_html():
|
||||
self.check_html()
|
||||
if self.aggregate.config["checkcss"] and self.is_css():
|
||||
self.check_css()
|
||||
# check with clamav
|
||||
if self.aggregate.config["scanvirus"]:
|
||||
self.scan_virus()
|
||||
|
||||
def check_warningregex (self):
|
||||
"""Check if content matches a given regular expression."""
|
||||
config = self.aggregate.config
|
||||
warningregex = config["warningregex"]
|
||||
if not (warningregex and self.valid and self.is_parseable()):
|
||||
return
|
||||
log.debug(LOG_CHECK, "checking content for warning regex")
|
||||
try:
|
||||
content = self.get_content()
|
||||
curpos = 0
|
||||
curline = 1
|
||||
# add warnings for found matches, up to the maximum allowed number
|
||||
for num, match in enumerate(warningregex.finditer(content)):
|
||||
# calculate line number for match
|
||||
curline += content.count('\n', curpos, match.start())
|
||||
curpos = match.start()
|
||||
# add a warning message
|
||||
msg = _("Found %(match)r at line %(line)d in link contents.")
|
||||
self.add_warning(msg %
|
||||
{"match": match.group(), "line": curline},
|
||||
tag=WARN_URL_WARNREGEX_FOUND)
|
||||
# check for maximum number of warnings
|
||||
if num >= config["warningregex_max"]:
|
||||
break
|
||||
except tuple(ExcList):
|
||||
value = self.handle_exception()
|
||||
self.set_result(unicode_safe(value), valid=False)
|
||||
|
||||
def check_size (self):
|
||||
"""Check content size if it is zero or larger than a given
|
||||
maximum size.
|
||||
"""
|
||||
if self.dlsize == 0:
|
||||
self.add_warning(_("Content size is zero."),
|
||||
tag=WARN_URL_CONTENT_SIZE_ZERO)
|
||||
else:
|
||||
maxbytes = self.aggregate.config["warnsizebytes"]
|
||||
if maxbytes is not None and self.dlsize >= maxbytes:
|
||||
self.add_warning(
|
||||
_("Content size %(dlsize)s is larger than %(maxbytes)s.") %
|
||||
{"dlsize": strformat.strsize(self.dlsize),
|
||||
"maxbytes": strformat.strsize(maxbytes)},
|
||||
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
|
||||
if self.size != -1 and self.dlsize != -1 and self.dlsize != self.size:
|
||||
self.add_warning(_("Download size (%(dlsize)d Byte) "
|
||||
"does not equal content size (%(size)d Byte).") %
|
||||
{"dlsize": self.dlsize,
|
||||
"size": self.size},
|
||||
tag=WARN_URL_CONTENT_SIZE_UNEQUAL)
|
||||
|
||||
def check_w3_errors (self, xml, w3type):
|
||||
"""Add warnings for W3C HTML or CSS errors in xml format.
|
||||
w3type is either "W3C HTML" or "W3C CSS"."""
|
||||
from xml.dom.minidom import parseString
|
||||
dom = parseString(xml)
|
||||
for error in dom.getElementsByTagName('m:error'):
|
||||
warnmsg = _("%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s")
|
||||
attrs = {
|
||||
"w3type": w3type,
|
||||
"line": getXmlText(error, "m:line"),
|
||||
"column": getXmlText(error, "m:col"),
|
||||
"msg": getXmlText(error, "m:message"),
|
||||
}
|
||||
tag = WARN_SYNTAX_HTML if w3type == "W3C HTML" else WARN_SYNTAX_CSS
|
||||
self.add_warning(warnmsg % attrs, tag=tag)
|
||||
|
||||
def check_html (self):
|
||||
"""Check HTML syntax of this page (which is supposed to be HTML)
|
||||
with the online W3C HTML validator documented at
|
||||
http://validator.w3.org/docs/api.html
|
||||
"""
|
||||
self.aggregate.check_w3_time()
|
||||
try:
|
||||
body = {'fragment': self.get_content(), 'output': 'soap12'}
|
||||
data = urllib.urlencode(body)
|
||||
u = urllib2.urlopen('http://validator.w3.org/check', data)
|
||||
if u.headers.get('x-w3c-validator-status', 'Invalid') == 'Valid':
|
||||
self.add_info(u"W3C Validator: %s" % _("valid HTML syntax"))
|
||||
return
|
||||
self.check_w3_errors(u.read(), "W3C HTML")
|
||||
except Exception:
|
||||
# catch _all_ exceptions since we dont want third party module
|
||||
# errors to propagate into this library
|
||||
err = str(sys.exc_info()[1])
|
||||
log.warn(LOG_CHECK,
|
||||
_("HTML W3C validation caused error: %(msg)s ") %
|
||||
{"msg": err})
|
||||
|
||||
def check_css (self):
|
||||
"""Check CSS syntax of this page (which is supposed to be CSS)
|
||||
with the online W3C CSS validator documented at
|
||||
http://jigsaw.w3.org/css-validator/manual.html#expert
|
||||
"""
|
||||
self.aggregate.check_w3_time()
|
||||
try:
|
||||
host = 'jigsaw.w3.org'
|
||||
path = '/css-validator/validator'
|
||||
params = {
|
||||
'text': "div {}",
|
||||
'warning': '2',
|
||||
'output': 'soap12',
|
||||
}
|
||||
fields = params.items()
|
||||
content_type, body = httputil.encode_multipart_formdata(fields)
|
||||
h = httplib.HTTPConnection(host)
|
||||
h.putrequest('POST', path)
|
||||
h.putheader('Content-Type', content_type)
|
||||
h.putheader('Content-Length', str(len(body)))
|
||||
h.endheaders()
|
||||
h.send(body)
|
||||
r = h.getresponse(True)
|
||||
if r.getheader('X-W3C-Validator-Status', 'Invalid') == 'Valid':
|
||||
self.add_info(u"W3C Validator: %s" % _("valid CSS syntax"))
|
||||
return
|
||||
self.check_w3_errors(r.read(), "W3C HTML")
|
||||
except Exception:
|
||||
# catch _all_ exceptions since we dont want third party module
|
||||
# errors to propagate into this library
|
||||
err = str(sys.exc_info()[1])
|
||||
log.warn(LOG_CHECK,
|
||||
_("CSS W3C validation caused error: %(msg)s ") %
|
||||
{"msg": err})
|
||||
|
||||
def scan_virus (self):
|
||||
"""Scan content for viruses."""
|
||||
infected, errors = clamav.scan(self.get_content())
|
||||
for msg in infected:
|
||||
self.add_warning(u"Virus scan infection: %s" % msg)
|
||||
for msg in errors:
|
||||
self.add_warning(u"Virus scan error: %s" % msg)
|
||||
|
||||
def parse_url (self):
|
||||
"""
|
||||
Parse url content and search for recursive links.
|
||||
Default parse type is html.
|
||||
"""
|
||||
self.parse_html()
|
||||
self.add_num_url_info()
|
||||
def read_content_chunk(self):
|
||||
"""Read one chunk of content from this URL."""
|
||||
return self.url_connection.read(self.ReadChunkBytes)
|
||||
|
||||
def get_user_password (self):
|
||||
"""Get tuple (user, password) from configured authentication.
|
||||
|
|
@ -933,16 +653,8 @@ class UrlBase (object):
|
|||
return urllib.splitpasswd(self.userinfo)
|
||||
return self.aggregate.config.get_user_password(self.url)
|
||||
|
||||
def parse_html (self):
|
||||
"""Parse into HTML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
log.debug(LOG_CHECK, "Parsing HTML %s", self)
|
||||
self.find_links(self.add_url)
|
||||
|
||||
def add_url (self, url, line=0, column=0, name=u"", base=None):
|
||||
"""Queue URL data for checking."""
|
||||
self.num_urls += 1
|
||||
if base:
|
||||
base_ref = urlutil.url_norm(base)[0]
|
||||
else:
|
||||
|
|
@ -954,108 +666,6 @@ class UrlBase (object):
|
|||
# Only queue URLs which have a result or are not strict extern.
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
|
||||
def add_num_url_info(self):
|
||||
"""Add number of URLs parsed to info."""
|
||||
if self.num_urls > 0:
|
||||
attrs = {"num": self.num_urls}
|
||||
msg = _n("%(num)d URL parsed.", "%(num)d URLs parsed.", self.num_urls)
|
||||
self.add_info(msg % attrs)
|
||||
|
||||
def parse_opera (self):
|
||||
"""Parse an opera bookmark file."""
|
||||
log.debug(LOG_CHECK, "Parsing Opera bookmarks %s", self)
|
||||
from ..bookmarks.opera import parse_bookmark_data
|
||||
for url, name, lineno in parse_bookmark_data(self.get_content()):
|
||||
self.add_url(url, line=lineno, name=name)
|
||||
|
||||
def parse_chromium (self):
|
||||
"""Parse a Chromium or Google Chrome bookmark file."""
|
||||
log.debug(LOG_CHECK, "Parsing Chromium bookmarks %s", self)
|
||||
from ..bookmarks.chromium import parse_bookmark_data
|
||||
for url, name in parse_bookmark_data(self.get_content()):
|
||||
self.add_url(url, name=name)
|
||||
|
||||
def parse_safari (self):
|
||||
"""Parse a Safari bookmark file."""
|
||||
log.debug(LOG_CHECK, "Parsing Safari bookmarks %s", self)
|
||||
from ..bookmarks.safari import parse_bookmark_data
|
||||
for url, name in parse_bookmark_data(self.get_content()):
|
||||
self.add_url(url, name=name)
|
||||
|
||||
def parse_text (self):
|
||||
"""Parse a text file with one url per line; comment and blank
|
||||
lines are ignored."""
|
||||
log.debug(LOG_CHECK, "Parsing text %s", self)
|
||||
lineno = 0
|
||||
for line in self.get_content().splitlines():
|
||||
lineno += 1
|
||||
line = line.strip()
|
||||
if not line or line.startswith('#'):
|
||||
continue
|
||||
self.add_url(line, line=lineno)
|
||||
|
||||
def parse_css (self):
|
||||
"""
|
||||
Parse a CSS file for url() patterns.
|
||||
"""
|
||||
log.debug(LOG_CHECK, "Parsing CSS %s", self)
|
||||
lineno = 0
|
||||
linkfinder = linkparse.css_url_re.finditer
|
||||
strip_comments = linkparse.strip_c_comments
|
||||
for line in strip_comments(self.get_content()).splitlines():
|
||||
lineno += 1
|
||||
for mo in linkfinder(line):
|
||||
column = mo.start("url")
|
||||
url = strformat.unquote(mo.group("url").strip())
|
||||
self.add_url(url, line=lineno, column=column)
|
||||
|
||||
def parse_swf (self):
|
||||
"""Parse a SWF file for URLs."""
|
||||
linkfinder = linkparse.swf_url_re.finditer
|
||||
for mo in linkfinder(self.get_content()):
|
||||
url = mo.group()
|
||||
self.add_url(url)
|
||||
|
||||
def parse_word (self):
|
||||
"""Parse a word file for hyperlinks."""
|
||||
if not winutil.has_word():
|
||||
return
|
||||
filename = self.get_temp_filename()
|
||||
# open word file and parse hyperlinks
|
||||
try:
|
||||
app = winutil.get_word_app()
|
||||
try:
|
||||
doc = winutil.open_wordfile(app, filename)
|
||||
if doc is None:
|
||||
raise winutil.Error("could not open word file %r" % filename)
|
||||
try:
|
||||
for link in doc.Hyperlinks:
|
||||
self.add_url(link.Address, name=link.TextToDisplay)
|
||||
finally:
|
||||
winutil.close_wordfile(doc)
|
||||
finally:
|
||||
winutil.close_word_app(app)
|
||||
except winutil.Error, msg:
|
||||
log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
|
||||
|
||||
def parse_wml (self):
|
||||
"""Parse into WML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
log.debug(LOG_CHECK, "Parsing WML %s", self)
|
||||
self.find_links(self.add_url, tags=linkparse.WmlTags)
|
||||
|
||||
def get_temp_filename (self):
|
||||
"""Get temporary filename for content to parse."""
|
||||
# store content in temporary file
|
||||
fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc',
|
||||
prefix='lc_')
|
||||
try:
|
||||
fd.write(self.get_content())
|
||||
finally:
|
||||
fd.close()
|
||||
return filename
|
||||
|
||||
def serialized (self, sep=os.linesep):
|
||||
"""
|
||||
Return serialized url check data as unicode string.
|
||||
|
|
@ -1103,7 +713,7 @@ class UrlBase (object):
|
|||
if pat:
|
||||
log.debug(LOG_CHECK, "Add intern pattern %r", pat)
|
||||
self.aggregate.config['internlinks'].append(get_link_pat(pat))
|
||||
except UnicodeError, msg:
|
||||
except UnicodeError as msg:
|
||||
res = _("URL has unparsable domain name: %(domain)s") % \
|
||||
{"domain": msg}
|
||||
self.set_result(res, valid=False)
|
||||
|
|
@ -1151,7 +761,7 @@ class UrlBase (object):
|
|||
Number of seconds needed to check this link, default: zero.
|
||||
- url_data.dltime: int
|
||||
Number of seconds needed to download URL content, default: -1
|
||||
- url_data.dlsize: int
|
||||
- url_data.size: int
|
||||
Size of downloaded URL content, default: -1
|
||||
- url_data.info: list of unicode
|
||||
Additional information about this URL.
|
||||
|
|
@ -1181,7 +791,7 @@ class UrlBase (object):
|
|||
domain=(self.urlparts[1] if self.urlparts else u""),
|
||||
checktime=self.checktime,
|
||||
dltime=self.dltime,
|
||||
dlsize=self.dlsize,
|
||||
size=self.size,
|
||||
info=self.info,
|
||||
line=self.line,
|
||||
column=self.column,
|
||||
|
|
@ -1211,7 +821,7 @@ urlDataAttr = [
|
|||
'domain',
|
||||
'checktime',
|
||||
'dltime',
|
||||
'dlsize',
|
||||
'size',
|
||||
'info',
|
||||
'modified',
|
||||
'line',
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ Utility functions suitable for command line clients.
|
|||
from __future__ import print_function
|
||||
import sys
|
||||
import argparse
|
||||
from . import checker, fileutil, strformat
|
||||
from . import checker, fileutil, strformat, plugins
|
||||
from .director import console
|
||||
|
||||
|
||||
|
|
@ -42,6 +42,19 @@ def print_version(exit_code=0):
|
|||
sys.exit(exit_code)
|
||||
|
||||
|
||||
def print_plugins(folders, exit_code=0):
|
||||
"""Print available plugins and exit."""
|
||||
modules = plugins.get_plugin_modules(folders)
|
||||
pluginclasses = sorted(plugins.get_plugin_classes(modules), key=lambda x: x.__name__)
|
||||
|
||||
for pluginclass in pluginclasses:
|
||||
print(pluginclass.__name__)
|
||||
doc = strformat.wrap(pluginclass.__doc__, 80)
|
||||
print(strformat.indent(doc))
|
||||
print()
|
||||
sys.exit(exit_code)
|
||||
|
||||
|
||||
def print_usage (msg, exit_code=2):
|
||||
"""Print a program msg text to stderr and exit."""
|
||||
program = sys.argv[0]
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ import urlparse
|
|||
import shutil
|
||||
import socket
|
||||
import _LinkChecker_configdata as configdata
|
||||
from .. import (log, LOG_CHECK, LOG_ROOT, ansicolor, lognames, clamav,
|
||||
from .. import (log, LOG_CHECK, LOG_ROOT, ansicolor, lognames,
|
||||
get_config_dir, fileutil, configdict)
|
||||
from . import confparse
|
||||
from ..decorators import memoized
|
||||
|
|
@ -75,6 +75,9 @@ Modules = (
|
|||
def get_modules_info ():
|
||||
"""Return list of unicode strings with detected module info."""
|
||||
lines = []
|
||||
# requests
|
||||
import requests
|
||||
lines.append(u"Requests: %s" % requests.__version__)
|
||||
# PyQt
|
||||
try:
|
||||
from PyQt4 import QtCore
|
||||
|
|
@ -129,53 +132,48 @@ class Configuration (dict):
|
|||
Initialize the default options.
|
||||
"""
|
||||
super(Configuration, self).__init__()
|
||||
self['trace'] = False
|
||||
self["verbose"] = False
|
||||
self["complete"] = False
|
||||
self["warnings"] = True
|
||||
self["ignorewarnings"] = []
|
||||
self['quiet'] = False
|
||||
self["anchors"] = False
|
||||
self["externlinks"] = []
|
||||
self["internlinks"] = []
|
||||
# on ftp, password is set by Pythons ftplib
|
||||
## checking options
|
||||
self["allowedschemes"] = []
|
||||
self['cookiefile'] = None
|
||||
self["debugmemory"] = False
|
||||
self["localwebroot"] = None
|
||||
self["maxfilesizeparse"] = 1*1024*1024
|
||||
self["maxfilesizedownload"] = 5*1024*1024
|
||||
self["maxnumurls"] = None
|
||||
self["maxrunseconds"] = None
|
||||
self["maxrequestspersecond"] = 10
|
||||
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
|
||||
self["proxy"] = urllib.getproxies()
|
||||
self["sslverify"] = True
|
||||
self["threads"] = 100
|
||||
self["timeout"] = 60
|
||||
self["aborttimeout"] = 300
|
||||
self["recursionlevel"] = -1
|
||||
self["useragent"] = UserAgent
|
||||
## authentication
|
||||
self["authentication"] = []
|
||||
self["loginurl"] = None
|
||||
self["loginuserfield"] = "login"
|
||||
self["loginpasswordfield"] = "password"
|
||||
self["loginextrafields"] = {}
|
||||
self["proxy"] = urllib.getproxies()
|
||||
self["recursionlevel"] = -1
|
||||
self["wait"] = 0
|
||||
self['sendcookies'] = False
|
||||
self['storecookies'] = False
|
||||
self['cookiefile'] = None
|
||||
self["status"] = False
|
||||
self["status_wait_seconds"] = 5
|
||||
## filtering
|
||||
self["externlinks"] = []
|
||||
self["ignorewarnings"] = []
|
||||
self["internlinks"] = []
|
||||
self["checkextern"] = False
|
||||
## plugins
|
||||
self["pluginfolders"] = get_plugin_folders()
|
||||
self["enabledplugins"] = []
|
||||
## output
|
||||
self['trace'] = False
|
||||
self['quiet'] = False
|
||||
self["verbose"] = False
|
||||
self["warnings"] = True
|
||||
self["fileoutput"] = []
|
||||
self['output'] = 'text'
|
||||
self["status"] = False
|
||||
self["status_wait_seconds"] = 5
|
||||
self['logger'] = None
|
||||
self["warningregex"] = None
|
||||
self["warningregex_max"] = 5
|
||||
self["warnsizebytes"] = None
|
||||
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
|
||||
self["threads"] = 100
|
||||
# socket timeout in seconds
|
||||
self["timeout"] = 60
|
||||
self["checkhtml"] = False
|
||||
self["checkcss"] = False
|
||||
self["scanvirus"] = False
|
||||
self["clamavconf"] = clamav.canonical_clamav_conf()
|
||||
self["useragent"] = UserAgent
|
||||
self["debugmemory"] = False
|
||||
self["localwebroot"] = None
|
||||
self["sslverify"] = True
|
||||
self["warnsslcertdaysvalid"] = 14
|
||||
self["maxrunseconds"] = None
|
||||
self["maxnumurls"] = None
|
||||
self["maxconnectionshttp"] = 10
|
||||
self["maxconnectionshttps"] = 10
|
||||
self["maxconnectionsftp"] = 2
|
||||
self.loggers = {}
|
||||
from ..logger import LoggerClasses
|
||||
for c in LoggerClasses:
|
||||
|
|
@ -302,29 +300,15 @@ class Configuration (dict):
|
|||
|
||||
def sanitize (self):
|
||||
"Make sure the configuration is consistent."
|
||||
if self["anchors"]:
|
||||
self.sanitize_anchors()
|
||||
if self['logger'] is None:
|
||||
self.sanitize_logger()
|
||||
if self['scanvirus']:
|
||||
self.sanitize_scanvirus()
|
||||
if self['storecookies'] or self['cookiefile']:
|
||||
self.sanitize_cookies()
|
||||
if self['loginurl']:
|
||||
self.sanitize_loginurl()
|
||||
self.sanitize_proxies()
|
||||
self.sanitize_plugins()
|
||||
# set default socket timeout
|
||||
socket.setdefaulttimeout(self['timeout'])
|
||||
|
||||
def sanitize_anchors (self):
|
||||
"""Make anchor configuration consistent."""
|
||||
if not self["warnings"]:
|
||||
self["warnings"] = True
|
||||
from ..checker.const import Warnings
|
||||
self["ignorewarnings"] = Warnings.keys()
|
||||
if 'url-anchor-not-found' in self["ignorewarnings"]:
|
||||
self["ignorewarnings"].remove('url-anchor-not-found')
|
||||
|
||||
def sanitize_logger (self):
|
||||
"""Make logger configuration consistent."""
|
||||
if not self['output']:
|
||||
|
|
@ -332,24 +316,6 @@ class Configuration (dict):
|
|||
self['output'] = 'text'
|
||||
self['logger'] = self.logger_new(self['output'])
|
||||
|
||||
def sanitize_scanvirus (self):
|
||||
"""Ensure clamav is installed for virus checking."""
|
||||
try:
|
||||
clamav.init_clamav_conf(self['clamavconf'])
|
||||
except clamav.ClamavError:
|
||||
log.warn(LOG_CHECK,
|
||||
_("Clamav could not be initialized"))
|
||||
self['scanvirus'] = False
|
||||
|
||||
def sanitize_cookies (self):
|
||||
"""Make cookie configuration consistent."""
|
||||
if not self['sendcookies']:
|
||||
log.warn(LOG_CHECK, _("activating sendcookies."))
|
||||
self['sendcookies'] = True
|
||||
if not self['storecookies']:
|
||||
log.warn(LOG_CHECK, _("activating storecookies."))
|
||||
self['storecookies'] = True
|
||||
|
||||
def sanitize_loginurl (self):
|
||||
"""Make login configuration consistent."""
|
||||
url = self["loginurl"]
|
||||
|
|
@ -377,9 +343,6 @@ class Configuration (dict):
|
|||
log.warn(LOG_CHECK,
|
||||
_("disabling login URL %(url)s.") % {"url": url})
|
||||
self["loginurl"] = None
|
||||
elif not self['storecookies']:
|
||||
# login URL implies storing and sending cookies
|
||||
self['storecookies'] = self['sendcookies'] = True
|
||||
|
||||
def sanitize_proxies (self):
|
||||
"""Try to read additional proxy settings which urllib does not
|
||||
|
|
@ -395,6 +358,39 @@ class Configuration (dict):
|
|||
if ftp_proxy:
|
||||
self["proxy"]["ftp"] = ftp_proxy
|
||||
|
||||
def sanitize_plugins(self):
|
||||
"""Ensure each plugin is configurable."""
|
||||
for plugin in self["enabledplugins"]:
|
||||
if plugin not in self:
|
||||
self[plugin] = {}
|
||||
|
||||
|
||||
def get_plugin_folders():
|
||||
"""Get linkchecker plugin folders. Default is ~/.linkchecker/plugins/."""
|
||||
folders = []
|
||||
defaultfolder = normpath("~/.linkchecker/plugins")
|
||||
if not os.path.exists(defaultfolder) and not Portable:
|
||||
try:
|
||||
make_userdir(defaultfolder)
|
||||
except StandardError as errmsg:
|
||||
msg = _("could not create plugin directory %(dirname)r: %(errmsg)r")
|
||||
args = dict(dirname=defaultfolder, errmsg=errmsg)
|
||||
log.warn(LOG_CHECK, msg % args)
|
||||
if os.path.exists(defaultfolder):
|
||||
folders.append(defaultfolder)
|
||||
return folders
|
||||
|
||||
|
||||
def make_userdir(child):
|
||||
"""Create a child directory."""
|
||||
userdir = os.path.dirname(child)
|
||||
if not os.path.isdir(userdir):
|
||||
if os.name == 'nt':
|
||||
# Windows forbids filenames with leading dot unless
|
||||
# a trailing dot is added.
|
||||
userdir += "."
|
||||
os.mkdir(userdir, 0700)
|
||||
|
||||
|
||||
def get_user_config():
|
||||
"""Get the user configuration filename.
|
||||
|
|
@ -413,13 +409,7 @@ def get_user_config():
|
|||
not Portable:
|
||||
# copy the initial configuration to the user configuration
|
||||
try:
|
||||
userdir = os.path.dirname(userconf)
|
||||
if not os.path.isdir(userdir):
|
||||
if os.name == 'nt':
|
||||
# Windows forbids filenames with leading dot unless
|
||||
# a trailing dot is added.
|
||||
userdir += "."
|
||||
os.mkdir(userdir, 0700)
|
||||
make_userdir(userconf)
|
||||
shutil.copy(initialconf, userconf)
|
||||
except StandardError as errmsg:
|
||||
msg = _("could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r")
|
||||
|
|
@ -445,6 +435,7 @@ def get_gconf_http_proxy ():
|
|||
return "%s:%d" % (host, port)
|
||||
except StandardError as msg:
|
||||
log.debug(LOG_CHECK, "error getting HTTP proxy from gconf: %s", msg)
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -464,6 +455,7 @@ def get_gconf_ftp_proxy ():
|
|||
return "%s:%d" % (host, port)
|
||||
except StandardError as msg:
|
||||
log.debug(LOG_CHECK, "error getting FTP proxy from gconf: %s", msg)
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -478,6 +470,7 @@ def get_kde_http_proxy ():
|
|||
return data.get("http_proxy")
|
||||
except StandardError as msg:
|
||||
log.debug(LOG_CHECK, "error getting HTTP proxy from KDE: %s", msg)
|
||||
pass
|
||||
|
||||
|
||||
def get_kde_ftp_proxy ():
|
||||
|
|
@ -491,6 +484,7 @@ def get_kde_ftp_proxy ():
|
|||
return data.get("ftp_proxy")
|
||||
except StandardError as msg:
|
||||
log.debug(LOG_CHECK, "error getting FTP proxy from KDE: %s", msg)
|
||||
pass
|
||||
|
||||
# The following KDE functions are largely ported and ajusted from
|
||||
# Google Chromium:
|
||||
|
|
|
|||
|
|
@ -17,9 +17,8 @@
|
|||
"""Parse configuration files"""
|
||||
|
||||
import ConfigParser
|
||||
import re
|
||||
import os
|
||||
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil
|
||||
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins
|
||||
|
||||
|
||||
def read_multiline (value):
|
||||
|
|
@ -53,16 +52,17 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
|
|||
failed_files = set(files) - set(self.read_ok)
|
||||
log.warn(LOG_CHECK, "Could not read configuration files %s.", failed_files)
|
||||
# Read all the configuration parameters from the given files.
|
||||
self.read_output_config()
|
||||
self.read_checking_config()
|
||||
self.read_authentication_config()
|
||||
self.read_filtering_config()
|
||||
self.read_output_config()
|
||||
self.read_plugin_config()
|
||||
except Exception as msg:
|
||||
raise LinkCheckerError(
|
||||
_("Error parsing configuration: %s") % unicode(msg))
|
||||
|
||||
def read_string_option (self, section, option, allowempty=False):
|
||||
"""Read a sring option."""
|
||||
"""Read a string option."""
|
||||
if self.has_option(section, option):
|
||||
value = self.get(section, option)
|
||||
if not allowempty and not value:
|
||||
|
|
@ -106,11 +106,6 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
|
|||
if self.getboolean(section, "verbose"):
|
||||
self.config["verbose"] = True
|
||||
self.config["warnings"] = True
|
||||
if self.has_option(section, "complete"):
|
||||
if self.getboolean(section, "complete"):
|
||||
self.config["complete"] = True
|
||||
self.config["verbose"] = True
|
||||
self.config["warnings"] = True
|
||||
if self.has_option(section, "quiet"):
|
||||
if self.getboolean(section, "quiet"):
|
||||
self.config['output'] = 'none'
|
||||
|
|
@ -141,37 +136,24 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
|
|||
self.read_int_option(section, "threads", min=-1)
|
||||
self.config['threads'] = max(0, self.config['threads'])
|
||||
self.read_int_option(section, "timeout", min=1)
|
||||
self.read_boolean_option(section, "anchors")
|
||||
self.read_int_option(section, "aborttimeout", min=1)
|
||||
self.read_int_option(section, "recursionlevel", min=-1)
|
||||
if self.has_option(section, "warningregex"):
|
||||
val = self.get(section, "warningregex")
|
||||
if val:
|
||||
self.config["warningregex"] = re.compile(val)
|
||||
self.read_int_option(section, "warnsizebytes", min=1)
|
||||
self.read_string_option(section, "nntpserver")
|
||||
self.read_string_option(section, "useragent")
|
||||
self.read_int_option(section, "pause", key="wait", min=0)
|
||||
for name in ("http", "https", "ftp"):
|
||||
self.read_int_option(section, "maxconnections%s" % name, min=1)
|
||||
self.read_check_options(section)
|
||||
|
||||
def read_check_options (self, section):
|
||||
"""Read check* options."""
|
||||
self.read_boolean_option(section, "checkhtml")
|
||||
self.read_boolean_option(section, "checkcss")
|
||||
self.read_boolean_option(section, "scanvirus")
|
||||
self.read_boolean_option(section, "clamavconf")
|
||||
self.read_int_option(section, "maxrequestspersecond", min=1)
|
||||
self.read_int_option(section, "maxnumurls", min=0)
|
||||
self.read_int_option(section, "maxfilesizeparse", min=1)
|
||||
self.read_int_option(section, "maxfilesizedownload", min=1)
|
||||
if self.has_option(section, "allowedschemes"):
|
||||
self.config['allowedschemes'] = [x.strip().lower() for x in \
|
||||
self.get(section, 'allowedschemes').split(',')]
|
||||
self.read_boolean_option(section, "debugmemory")
|
||||
if self.has_option(section, "cookies"):
|
||||
self.config["sendcookies"] = self.config["storecookies"] = \
|
||||
self.getboolean(section, "cookies")
|
||||
self.read_string_option(section, "cookiefile")
|
||||
self.read_string_option(section, "localwebroot")
|
||||
try:
|
||||
self.read_boolean_option(section, "sslverify")
|
||||
except ValueError:
|
||||
self.read_string_option(section, "sslverify")
|
||||
self.read_int_option(section, "warnsslcertdaysvalid", min=1)
|
||||
self.read_int_option(section, "maxrunseconds", min=0)
|
||||
|
||||
def read_authentication_config (self):
|
||||
|
|
@ -198,7 +180,6 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
|
|||
raise LinkCheckerError(_("invalid login URL `%s'. Only " \
|
||||
"HTTP and HTTPS URLs are supported.") % val)
|
||||
self.config["loginurl"] = val
|
||||
self.config["storecookies"] = self.config["sendcookies"] = True
|
||||
self.read_string_option(section, "loginuserfield")
|
||||
self.read_string_option(section, "loginpasswordfield")
|
||||
# read login extra fields
|
||||
|
|
@ -231,7 +212,7 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
|
|||
"""
|
||||
section = "filtering"
|
||||
if self.has_option(section, "ignorewarnings"):
|
||||
self.config['ignorewarnings'] = [f.strip() for f in \
|
||||
self.config['ignorewarnings'] = [f.strip().lower() for f in \
|
||||
self.get(section, 'ignorewarnings').split(',')]
|
||||
if self.has_option(section, "ignore"):
|
||||
for line in read_multiline(self.get(section, "ignore")):
|
||||
|
|
@ -244,3 +225,14 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
|
|||
if self.has_option(section, "internlinks"):
|
||||
pat = get_link_pat(self.get(section, "internlinks"))
|
||||
self.config["internlinks"].append(pat)
|
||||
self.read_boolean_option(section, "checkextern")
|
||||
|
||||
def read_plugin_config(self):
|
||||
"""Read plugin-specific configuration values."""
|
||||
folders = self.config["pluginfolders"]
|
||||
modules = plugins.get_plugin_modules(folders)
|
||||
for pluginclass in plugins.get_plugin_classes(modules):
|
||||
section = pluginclass.__name__
|
||||
if self.has_section(section):
|
||||
self.config["enabledplugins"].append(section)
|
||||
self.config[section] = pluginclass.read_config(self)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2004-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2004-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -15,510 +15,13 @@
|
|||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Parsing and storing of cookies. See [1]RFC 2965 and [2]RFC 2109.
|
||||
The reason for this module is that neither the cookielib nor the Cookie
|
||||
modules included in the Python standard library provide a usable interface
|
||||
for programmable cookie handling.
|
||||
This module provides parsing of cookies for all formats specified by
|
||||
the above RFCs, plus smart methods handling data conversion and formatting.
|
||||
And a cookie storage class is provided.
|
||||
|
||||
[1] http://www.faqs.org/rfcs/rfc2965.html
|
||||
[2] http://www.faqs.org/rfcs/rfc2109.html
|
||||
Parsing of cookies.
|
||||
"""
|
||||
|
||||
import time
|
||||
import string
|
||||
import re
|
||||
import cookielib
|
||||
import httplib
|
||||
import requests
|
||||
from cStringIO import StringIO
|
||||
from . import strformat
|
||||
|
||||
|
||||
_nulljoin = ''.join
|
||||
_semispacejoin = '; '.join
|
||||
_spacejoin = ' '.join
|
||||
|
||||
class CookieError (StandardError):
|
||||
"""Thrown for invalid cookie syntax or conflicting/impossible values."""
|
||||
pass
|
||||
|
||||
_LegalChars = string.ascii_letters + string.digits + "!#$%&'*+-.^_`|~:"
|
||||
_Translator = {
|
||||
'\000' : '\\000', '\001' : '\\001', '\002' : '\\002',
|
||||
'\003' : '\\003', '\004' : '\\004', '\005' : '\\005',
|
||||
'\006' : '\\006', '\007' : '\\007', '\010' : '\\010',
|
||||
'\011' : '\\011', '\012' : '\\012', '\013' : '\\013',
|
||||
'\014' : '\\014', '\015' : '\\015', '\016' : '\\016',
|
||||
'\017' : '\\017', '\020' : '\\020', '\021' : '\\021',
|
||||
'\022' : '\\022', '\023' : '\\023', '\024' : '\\024',
|
||||
'\025' : '\\025', '\026' : '\\026', '\027' : '\\027',
|
||||
'\030' : '\\030', '\031' : '\\031', '\032' : '\\032',
|
||||
'\033' : '\\033', '\034' : '\\034', '\035' : '\\035',
|
||||
'\036' : '\\036', '\037' : '\\037',
|
||||
|
||||
# Because of the way browsers really handle cookies (as opposed
|
||||
# to what the RFC says) we also encode , and ;
|
||||
|
||||
',' : '\\054', ';' : '\\073',
|
||||
|
||||
'"' : '\\"', '\\' : '\\\\',
|
||||
|
||||
'\177' : '\\177', '\200' : '\\200', '\201' : '\\201',
|
||||
'\202' : '\\202', '\203' : '\\203', '\204' : '\\204',
|
||||
'\205' : '\\205', '\206' : '\\206', '\207' : '\\207',
|
||||
'\210' : '\\210', '\211' : '\\211', '\212' : '\\212',
|
||||
'\213' : '\\213', '\214' : '\\214', '\215' : '\\215',
|
||||
'\216' : '\\216', '\217' : '\\217', '\220' : '\\220',
|
||||
'\221' : '\\221', '\222' : '\\222', '\223' : '\\223',
|
||||
'\224' : '\\224', '\225' : '\\225', '\226' : '\\226',
|
||||
'\227' : '\\227', '\230' : '\\230', '\231' : '\\231',
|
||||
'\232' : '\\232', '\233' : '\\233', '\234' : '\\234',
|
||||
'\235' : '\\235', '\236' : '\\236', '\237' : '\\237',
|
||||
'\240' : '\\240', '\241' : '\\241', '\242' : '\\242',
|
||||
'\243' : '\\243', '\244' : '\\244', '\245' : '\\245',
|
||||
'\246' : '\\246', '\247' : '\\247', '\250' : '\\250',
|
||||
'\251' : '\\251', '\252' : '\\252', '\253' : '\\253',
|
||||
'\254' : '\\254', '\255' : '\\255', '\256' : '\\256',
|
||||
'\257' : '\\257', '\260' : '\\260', '\261' : '\\261',
|
||||
'\262' : '\\262', '\263' : '\\263', '\264' : '\\264',
|
||||
'\265' : '\\265', '\266' : '\\266', '\267' : '\\267',
|
||||
'\270' : '\\270', '\271' : '\\271', '\272' : '\\272',
|
||||
'\273' : '\\273', '\274' : '\\274', '\275' : '\\275',
|
||||
'\276' : '\\276', '\277' : '\\277', '\300' : '\\300',
|
||||
'\301' : '\\301', '\302' : '\\302', '\303' : '\\303',
|
||||
'\304' : '\\304', '\305' : '\\305', '\306' : '\\306',
|
||||
'\307' : '\\307', '\310' : '\\310', '\311' : '\\311',
|
||||
'\312' : '\\312', '\313' : '\\313', '\314' : '\\314',
|
||||
'\315' : '\\315', '\316' : '\\316', '\317' : '\\317',
|
||||
'\320' : '\\320', '\321' : '\\321', '\322' : '\\322',
|
||||
'\323' : '\\323', '\324' : '\\324', '\325' : '\\325',
|
||||
'\326' : '\\326', '\327' : '\\327', '\330' : '\\330',
|
||||
'\331' : '\\331', '\332' : '\\332', '\333' : '\\333',
|
||||
'\334' : '\\334', '\335' : '\\335', '\336' : '\\336',
|
||||
'\337' : '\\337', '\340' : '\\340', '\341' : '\\341',
|
||||
'\342' : '\\342', '\343' : '\\343', '\344' : '\\344',
|
||||
'\345' : '\\345', '\346' : '\\346', '\347' : '\\347',
|
||||
'\350' : '\\350', '\351' : '\\351', '\352' : '\\352',
|
||||
'\353' : '\\353', '\354' : '\\354', '\355' : '\\355',
|
||||
'\356' : '\\356', '\357' : '\\357', '\360' : '\\360',
|
||||
'\361' : '\\361', '\362' : '\\362', '\363' : '\\363',
|
||||
'\364' : '\\364', '\365' : '\\365', '\366' : '\\366',
|
||||
'\367' : '\\367', '\370' : '\\370', '\371' : '\\371',
|
||||
'\372' : '\\372', '\373' : '\\373', '\374' : '\\374',
|
||||
'\375' : '\\375', '\376' : '\\376', '\377' : '\\377'
|
||||
}
|
||||
|
||||
def quote(str, LegalChars=_LegalChars):
|
||||
r"""Quote a string for use in a cookie header.
|
||||
|
||||
If the string does not need to be double-quoted, then just return the
|
||||
string. Otherwise, surround the string in doublequotes and quote
|
||||
(with a \) special characters.
|
||||
"""
|
||||
if all(c in LegalChars for c in str):
|
||||
return str
|
||||
else:
|
||||
return '"' + _nulljoin(_Translator.get(s, s) for s in str) + '"'
|
||||
|
||||
|
||||
_OctalPatt = re.compile(r"\\[0-3][0-7][0-7]")
|
||||
_QuotePatt = re.compile(r"[\\].")
|
||||
|
||||
def unquote(str):
|
||||
"""Remove string quoting."""
|
||||
# If there aren't any doublequotes,
|
||||
# then there can't be any special characters. See RFC 2109.
|
||||
if len(str) < 2:
|
||||
return str
|
||||
if str[0] != '"' or str[-1] != '"':
|
||||
return str
|
||||
|
||||
# We have to assume that we must decode this string.
|
||||
# Down to work.
|
||||
|
||||
# Remove the "s
|
||||
str = str[1:-1]
|
||||
|
||||
# Check for special sequences. Examples:
|
||||
# \012 --> \n
|
||||
# \" --> "
|
||||
#
|
||||
i = 0
|
||||
n = len(str)
|
||||
res = []
|
||||
while 0 <= i < n:
|
||||
o_match = _OctalPatt.search(str, i)
|
||||
q_match = _QuotePatt.search(str, i)
|
||||
if not o_match and not q_match: # Neither matched
|
||||
res.append(str[i:])
|
||||
break
|
||||
# else:
|
||||
j = k = -1
|
||||
if o_match:
|
||||
j = o_match.start(0)
|
||||
if q_match:
|
||||
k = q_match.start(0)
|
||||
if q_match and (not o_match or k < j): # QuotePatt matched
|
||||
res.append(str[i:k])
|
||||
res.append(str[k+1])
|
||||
i = k + 2
|
||||
else: # OctalPatt matched
|
||||
res.append(str[i:j])
|
||||
res.append(chr(int(str[j+1:j+4], 8)))
|
||||
i = j + 4
|
||||
return _nulljoin(res)
|
||||
|
||||
|
||||
has_embedded_dot = re.compile(r"[a-zA-Z0-9]\.[a-zA-Z]").search
|
||||
|
||||
|
||||
# Pattern for finding cookie snatched from Pythons Cookie.py
|
||||
# Modification: allow whitespace in values.
|
||||
_LegalCharsPatt = r"[\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=]"
|
||||
_CookiePattern = re.compile(r"""
|
||||
(?x) # This is a verbose pattern
|
||||
(?P<key> # Start of group 'key'
|
||||
""" + _LegalCharsPatt + r"""+? # Any word of at least one letter
|
||||
) # End of group 'key'
|
||||
( # Optional group: there may not be a value.
|
||||
\s*=\s* # Equal Sign
|
||||
(?P<val> # Start of group 'val'
|
||||
"(?:[^\\"]|\\.)*" # Any doublequoted string
|
||||
| # or
|
||||
\w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr
|
||||
| # or
|
||||
""" + _LegalCharsPatt + r"""* # Any word or empty string
|
||||
) # End of group 'val'
|
||||
)? # End of optional value group
|
||||
\s* # Any number of spaces.
|
||||
(\s+|;|$) # Ending either at space, semicolon, or EOS.
|
||||
""")
|
||||
|
||||
class HttpCookie (object):
|
||||
"""A cookie consists of one name-value pair with attributes.
|
||||
Each attribute consists of a predefined name (see attribute_names)
|
||||
and a value (which is optional for some attributes)."""
|
||||
|
||||
# A mapping from the lowercase variant on the left to the
|
||||
# appropriate traditional formatting on the right.
|
||||
attribute_names = {
|
||||
# Old Netscape attribute
|
||||
"expires": "expires",
|
||||
# Defined by RFC 2109
|
||||
"path": "Path",
|
||||
"comment": "Comment",
|
||||
"domain": "Domain",
|
||||
"max-age": "Max-Age",
|
||||
"secure": "secure",
|
||||
"version": "Version",
|
||||
# Additional attributes defined by RFC 2965
|
||||
"commenturl": "CommentURL",
|
||||
"discard": "Discard",
|
||||
"port": "Port",
|
||||
# httponly to protect against XSS attacks
|
||||
"httponly": "httponly",
|
||||
}
|
||||
|
||||
def __init__ (self, name, value, attributes=None):
|
||||
"""Store name, value and attributes. Also calculates expiration
|
||||
if given in attributes."""
|
||||
self.name = name
|
||||
self.value = value
|
||||
if attributes is None:
|
||||
self.attributes = {}
|
||||
else:
|
||||
self.attributes = attributes
|
||||
self.calculate_expiration()
|
||||
|
||||
def calculate_expiration (self):
|
||||
"""If "max-age" or "expires" attributes are given, calculate
|
||||
the time when this cookie expires.
|
||||
Stores the time value in self.expires, or None if this cookie
|
||||
does not expire.
|
||||
"""
|
||||
# default: do not expire
|
||||
self.expire = None
|
||||
if "max-age" in self.attributes:
|
||||
now = time.time()
|
||||
try:
|
||||
maxage = int(self.attributes["max-age"])
|
||||
if maxage == 0:
|
||||
# Expire immediately: subtract 1 to be sure since
|
||||
# some clocks have only full second precision.
|
||||
self.expire = now - 1
|
||||
else:
|
||||
self.expire = now + maxage
|
||||
except (ValueError, OverflowError):
|
||||
# note: even self.now + maxage can overflow
|
||||
pass
|
||||
elif "expires" in self.attributes:
|
||||
expiration_date = self.attributes["expires"]
|
||||
try:
|
||||
self.expire = cookielib.http2time(expiration_date)
|
||||
except ValueError:
|
||||
# see http://bugs.python.org/issue16181
|
||||
raise CookieError("Invalid expiration date in %r" % expiration_date)
|
||||
|
||||
def is_expired (self, now=None):
|
||||
"""Return True if this cookie is expired, else False."""
|
||||
if self.expire is None:
|
||||
# Does not expire.
|
||||
return False
|
||||
if now is None:
|
||||
now = time.time()
|
||||
return now > self.expire
|
||||
|
||||
def __repr__ (self):
|
||||
"""Return cookie name, value and attributes as string."""
|
||||
attrs = "; ".join("%s=%r"%(k, v) for k, v in self.attributes.items())
|
||||
return "<%s %s=%r; %s>" % (self.__class__.__name__,
|
||||
self.name, self.value, attrs)
|
||||
|
||||
def is_valid_for (self, scheme, host, port, path):
|
||||
"""Check validity of this cookie against the desired scheme,
|
||||
host and path."""
|
||||
if self.check_expired() and \
|
||||
self.check_domain(host) and \
|
||||
self.check_port(port) and \
|
||||
self.check_path(path) and \
|
||||
self.check_secure(scheme):
|
||||
return True
|
||||
return False
|
||||
|
||||
def check_expired (self):
|
||||
"""Return False if cookie is expired, else True."""
|
||||
return not self.is_expired()
|
||||
|
||||
def check_domain (self, domain):
|
||||
"""Return True if given domain matches this cookie, else False."""
|
||||
if "domain" not in self.attributes:
|
||||
return False
|
||||
cdomain = self.attributes["domain"]
|
||||
if domain == cdomain:
|
||||
# equality matches
|
||||
return True
|
||||
if "." not in domain and domain == cdomain[1:]:
|
||||
# "localhost" and ".localhost" match
|
||||
return True
|
||||
if not domain.endswith(cdomain):
|
||||
# any suffix matches
|
||||
return False
|
||||
if "." in domain[:-(len(cdomain)+1)]:
|
||||
# prefix must be dot-free
|
||||
return False
|
||||
return True
|
||||
|
||||
def check_port (self, port):
|
||||
"""Return True if given port matches this cookie, else False.
|
||||
For now, this returns always True."""
|
||||
return True
|
||||
|
||||
def check_path (self, path):
|
||||
"""Return True if given path matches this cookie, else False."""
|
||||
if "path" not in self.attributes:
|
||||
return False
|
||||
return path.startswith(self.attributes["path"])
|
||||
|
||||
def check_secure (self, scheme):
|
||||
"""Return True if given Scheme is allowed for this cookie, else
|
||||
False."""
|
||||
if "secure" in self.attributes:
|
||||
return scheme == "https"
|
||||
return True
|
||||
|
||||
def set_attribute (self, key, value):
|
||||
"""Helper method to set attribute values. Called when parsing
|
||||
cookie data.
|
||||
The attribute key and value are checked, and CookieError is
|
||||
raised in these cases."""
|
||||
if self.attributes is None:
|
||||
raise CookieError("no NAME=VALUE before attributes found")
|
||||
key = key.lower()
|
||||
if key not in self.attribute_names:
|
||||
raise CookieError("invalid attribute %r" % key)
|
||||
if value:
|
||||
value = unquote(value)
|
||||
else:
|
||||
value = ""
|
||||
if key == "domain":
|
||||
value = value.lower()
|
||||
if not value.startswith(".") and not has_embedded_dot(value):
|
||||
if "." in value:
|
||||
raise CookieError("invalid dot in domain %r" % value)
|
||||
# supply a leading dot
|
||||
value = "."+value
|
||||
if key == "max-age":
|
||||
try:
|
||||
if int(value) < 0:
|
||||
raise ValueError("Negative Max-Age")
|
||||
except (OverflowError, ValueError):
|
||||
raise CookieError("invalid Max-Age number: %r" % value)
|
||||
if key == "port":
|
||||
ports = value.split(",")
|
||||
for port in ports:
|
||||
try:
|
||||
if not (0 <= int(port) <= 65535):
|
||||
raise ValueError("Invalid port number")
|
||||
except (OverflowError, ValueError):
|
||||
raise CookieError("invalid port number: %r" % port)
|
||||
self.attributes[key] = value
|
||||
|
||||
def parse (self, text, patt=_CookiePattern):
|
||||
"""Parse cookie data."""
|
||||
text = strformat.ascii_safe(text.rstrip('\r\n'))
|
||||
# reset values
|
||||
self.name = None
|
||||
self.value = None
|
||||
self.attributes = None
|
||||
# Our starting point
|
||||
i = 0
|
||||
# Length of string
|
||||
n = len(text)
|
||||
|
||||
while 0 <= i < n:
|
||||
# Start looking for a key-value pair.
|
||||
match = patt.search(text, i)
|
||||
if not match:
|
||||
# No more key-value pairs.
|
||||
break
|
||||
key, value = match.group("key"), match.group("val")
|
||||
if value is None:
|
||||
value = ""
|
||||
i = match.end()
|
||||
# Parse the key, value in case it's metainfo.
|
||||
if self.name is None:
|
||||
# Set name and value.
|
||||
self.name = key
|
||||
self.value = unquote(value)
|
||||
self.attributes = {}
|
||||
else:
|
||||
if key.startswith("$"):
|
||||
key = key[1:]
|
||||
self.set_attribute(key, value)
|
||||
if self.name is None:
|
||||
raise CookieError("missing cookie name in %r" % text)
|
||||
self.calculate_expiration()
|
||||
|
||||
def set_default_attributes (self, scheme, host, path):
|
||||
"""Set domain and path attributes for given scheme, host and
|
||||
path."""
|
||||
scheme = strformat.ascii_safe(scheme)
|
||||
host = strformat.ascii_safe(host)
|
||||
path = strformat.ascii_safe(path)
|
||||
if "domain" not in self.attributes:
|
||||
self.attributes["domain"] = host.lower()
|
||||
if "path" not in self.attributes:
|
||||
i = path.rfind("/")
|
||||
if i == -1:
|
||||
path = "/"
|
||||
else:
|
||||
path = path[:i]
|
||||
if not path:
|
||||
path = "/"
|
||||
self.attributes["path"] = path
|
||||
if not self.check_domain(host):
|
||||
cdomain = self.attributes["domain"]
|
||||
raise CookieError("domain %r not for cookie %r" % (cdomain, host))
|
||||
if not self.check_path(path):
|
||||
cpath = self.attributes["path"]
|
||||
raise CookieError("domain %r not for cookie %r" % (cpath, path))
|
||||
if not self.check_secure(scheme):
|
||||
raise CookieError("no secure scheme %r" % scheme)
|
||||
|
||||
def quote (self, key, value):
|
||||
"""Quote value for given key."""
|
||||
return quote(value)
|
||||
|
||||
def server_header_value (self):
|
||||
"""Return HTTP header value to send to server."""
|
||||
parts = ["%s=%s" % (self.name, quote(self.value))]
|
||||
parts.extend(["%s=%s"% (self.attribute_names[k], self.quote(k, v)) \
|
||||
for k, v in self.attributes.items()])
|
||||
return "; ".join(parts)
|
||||
|
||||
def client_header_value (self):
|
||||
"""Return HTTP header value to send to client."""
|
||||
parts = []
|
||||
if "version" in self.attributes:
|
||||
parts.append("$Version=%s" % quote(self.attributes["version"]))
|
||||
parts.append("%s=%s" % (self.name, quote(self.value)))
|
||||
parts.extend(["$%s=%s"% (self.attribute_names[k], self.quote(k, v)) \
|
||||
for k, v in self.attributes.items() if k != "version"])
|
||||
return "; ".join(parts)
|
||||
|
||||
class NetscapeCookie (HttpCookie):
|
||||
"""Parses RFC 2109 (Netscape) cookies."""
|
||||
|
||||
def __init__ (self, text, scheme, host, path):
|
||||
"""Parse given cookie data."""
|
||||
self.parse(text)
|
||||
self.set_default_attributes(scheme, host, path)
|
||||
|
||||
def server_header_name (self):
|
||||
"""Return "Set-Cookie" as server header name."""
|
||||
return "Set-Cookie"
|
||||
|
||||
def __eq__ (self, other):
|
||||
"""Compare equality of cookie."""
|
||||
return (isinstance(other, NetscapeCookie) and
|
||||
self.name.lower() == other.name.lower() and
|
||||
self.attributes['domain'] == other.attributes['domain'] and
|
||||
self.attributes['path'] == other.attributes['path'])
|
||||
|
||||
def __hash__ (self):
|
||||
"""Cookie hash value"""
|
||||
data = (
|
||||
self.name.lower(),
|
||||
self.attributes['domain'],
|
||||
self.attributes['path'],
|
||||
)
|
||||
return hash(data)
|
||||
|
||||
|
||||
|
||||
class Rfc2965Cookie (HttpCookie):
|
||||
"""Parses RFC 2965 cookies."""
|
||||
|
||||
def __init__ (self, text, scheme, host, path):
|
||||
"""Parse given cookie data."""
|
||||
self.parse(text)
|
||||
self.set_default_attributes(scheme, host, path)
|
||||
|
||||
def check_port (self, port):
|
||||
"""Return True if given port matches this cookie, else False."""
|
||||
if "port" not in self.attributes:
|
||||
return True
|
||||
cport = self.attributes["port"]
|
||||
return port in [int(x) for x in cport.split(",")]
|
||||
|
||||
def server_header_name (self):
|
||||
"""Return "Set-Cookie2" as server header name."""
|
||||
return "Set-Cookie2"
|
||||
|
||||
def quote (self, key, value):
|
||||
"""Quote value for given key."""
|
||||
if key == "port":
|
||||
return quote(value, LegalChars="")
|
||||
return quote(value)
|
||||
|
||||
def __eq__ (self, other):
|
||||
"""Compare equality of cookie."""
|
||||
return (isinstance(other, Rfc2965Cookie) and
|
||||
self.name.lower() == other.name.lower() and
|
||||
self.attributes['domain'].lower() ==
|
||||
other.attributes['domain'].lower() and
|
||||
self.attributes['path'] == other.attributes['path'])
|
||||
|
||||
def __hash__ (self):
|
||||
"""Cookie hash value"""
|
||||
data = (
|
||||
self.name.lower(),
|
||||
self.attributes['domain'].lower(),
|
||||
self.attributes['path'],
|
||||
)
|
||||
return hash(data)
|
||||
|
||||
|
||||
def from_file (filename):
|
||||
|
|
@ -545,92 +48,21 @@ def from_file (filename):
|
|||
def from_headers (strheader):
|
||||
"""Parse cookie data from a string in HTTP header (RFC 2616) format.
|
||||
|
||||
@return: tuple (headers, scheme, host, path)
|
||||
@return: list of cookies
|
||||
@raises: ValueError for incomplete or invalid data
|
||||
"""
|
||||
res = []
|
||||
fp = StringIO(strheader)
|
||||
headers = httplib.HTTPMessage(fp, seekable=True)
|
||||
if "Host" not in headers:
|
||||
raise ValueError("Required header 'Host:' missing")
|
||||
host = headers["Host"]
|
||||
scheme = headers.get("Scheme", "http")
|
||||
path= headers.get("Path", "/")
|
||||
return (headers, scheme, host, path)
|
||||
|
||||
|
||||
## Taken and adpated from the _mechanize package included in Twill.
|
||||
|
||||
def cookie_str(cookie):
|
||||
"""Return string representation of Cookie."""
|
||||
h = [(cookie.name, unquote(cookie.value)),
|
||||
("path", cookie.path),
|
||||
("domain", cookie.domain)]
|
||||
if cookie.port is not None: h.append(("port", cookie.port))
|
||||
#if cookie.path_specified: h.append(("path_spec", None))
|
||||
#if cookie.port_specified: h.append(("port_spec", None))
|
||||
#if cookie.domain_initial_dot: h.append(("domain_dot", None))
|
||||
if cookie.secure: h.append(("secure", None))
|
||||
if cookie.httponly: h.append(("httponly", None))
|
||||
if cookie.expires: h.append(("expires",
|
||||
time2isoz(float(cookie.expires))))
|
||||
if cookie.discard: h.append(("discard", None))
|
||||
if cookie.comment: h.append(("comment", cookie.comment))
|
||||
if cookie.comment_url: h.append(("commenturl", cookie.comment_url))
|
||||
#if cookie.rfc2109: h.append(("rfc2109", None))
|
||||
|
||||
keys = cookie.nonstandard_attr_keys()
|
||||
keys.sort()
|
||||
for k in keys:
|
||||
h.append((k, str(cookie.get_nonstandard_attr(k))))
|
||||
|
||||
h.append(("version", str(cookie.version)))
|
||||
|
||||
return join_header_words([h])
|
||||
|
||||
|
||||
def time2isoz(t=None):
|
||||
"""Return a string representing time in seconds since epoch, t.
|
||||
|
||||
If the function is called without an argument, it will use the current
|
||||
time.
|
||||
|
||||
The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
|
||||
representing Universal Time (UTC, aka GMT). An example of this format is:
|
||||
|
||||
1994-11-24 08:49:37Z
|
||||
|
||||
"""
|
||||
if t is None: t = time.time()
|
||||
year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
|
||||
return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
|
||||
year, mon, mday, hour, min, sec)
|
||||
|
||||
|
||||
join_escape_re = re.compile(r"([\"\\])")
|
||||
def join_header_words(lists):
|
||||
"""Do the inverse of the conversion done by split_header_words.
|
||||
|
||||
Takes a list of lists of (key, value) pairs and produces a single header
|
||||
value. Attribute values are quoted if needed.
|
||||
|
||||
>>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
|
||||
'text/plain; charset="iso-8859/1"'
|
||||
>>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
|
||||
'text/plain, charset="iso-8859/1"'
|
||||
|
||||
"""
|
||||
headers = []
|
||||
for pairs in lists:
|
||||
attr = []
|
||||
for k, v in pairs:
|
||||
if v is not None:
|
||||
if not re.search(r"^\w+$", v):
|
||||
v = join_escape_re.sub(r"\\\1", v) # escape " and \
|
||||
v = '"%s"' % v
|
||||
if k is None: # Netscape cookies may have no name
|
||||
k = v
|
||||
else:
|
||||
k = "%s=%s" % (k, v)
|
||||
attr.append(k)
|
||||
if attr: headers.append("; ".join(attr))
|
||||
return ", ".join(headers)
|
||||
for header in headers.getallmatchingheaders("Set-Cookie"):
|
||||
headervalue = header.split(':', 1)[1]
|
||||
for pairs in cookielib.split_header_words([headervalue]):
|
||||
for name, value in pairs:
|
||||
cookie = requests.cookies.create_cookie(name, value,
|
||||
domain=host, path=path)
|
||||
res.append(cookie)
|
||||
return res
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2005-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2005-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -19,13 +19,11 @@ Management of checking a queue of links with several threads.
|
|||
"""
|
||||
import os
|
||||
import thread
|
||||
import urlparse
|
||||
from cStringIO import StringIO
|
||||
from .. import log, LOG_CHECK, LinkCheckerInterrupt, cookies, dummy, \
|
||||
fileutil, strformat
|
||||
from ..cache import urlqueue, robots_txt, cookie, connection
|
||||
import time
|
||||
from .. import log, LOG_CHECK, LinkCheckerInterrupt, dummy, \
|
||||
fileutil, strformat, plugins
|
||||
from ..cache import urlqueue, robots_txt
|
||||
from . import aggregator, console
|
||||
from ..httplib2 import HTTPMessage
|
||||
|
||||
|
||||
def visit_loginurl (aggregate):
|
||||
|
|
@ -53,7 +51,7 @@ def visit_loginurl (aggregate):
|
|||
log.warn(LOG_CHECK, _("Error posting form at login URL %(url)s.") % \
|
||||
{"url": url})
|
||||
return
|
||||
store_cookies(tc.get_browser().cj, aggregate.cookies, url)
|
||||
#XXX store_cookies(tc.get_browser().cj, aggregate.cookies, url)
|
||||
resulturl = tc.get_browser().get_url()
|
||||
log.debug(LOG_CHECK, u"URL after POST is %s" % resulturl)
|
||||
# add result URL to check list
|
||||
|
|
@ -107,18 +105,6 @@ def search_formname (fieldnames, tc):
|
|||
return None
|
||||
|
||||
|
||||
def store_cookies (cookiejar, cookiecache, url):
|
||||
"""Store cookies in cookiejar into the cookiecache."""
|
||||
cookielst = []
|
||||
for c in cookiejar:
|
||||
cookielst.append("Set-Cookie2: %s" % cookies.cookie_str(c))
|
||||
log.debug(LOG_CHECK, "Store cookies %s", cookielst)
|
||||
headers = HTTPMessage(StringIO("\r\n".join(cookielst)))
|
||||
urlparts = urlparse.urlsplit(url)
|
||||
scheme, host, path = urlparts[0:3]
|
||||
cookiecache.add(headers, scheme, host, path)
|
||||
|
||||
|
||||
def check_urls (aggregate):
|
||||
"""Main check function; checks all configured URLs until interrupted
|
||||
with Ctrl-C.
|
||||
|
|
@ -194,14 +180,17 @@ def abort (aggregate):
|
|||
break
|
||||
except KeyboardInterrupt:
|
||||
log.warn(LOG_CHECK, _("user abort; force shutdown"))
|
||||
aggregate.logger.end_log_output()
|
||||
abort_now()
|
||||
|
||||
|
||||
def abort_now ():
|
||||
"""Force exit of current process without cleanup."""
|
||||
if os.name == 'posix':
|
||||
# Unix systems can use sigkill
|
||||
# Unix systems can use signals
|
||||
import signal
|
||||
os.kill(os.getpid(), signal.SIGTERM)
|
||||
time.sleep(1)
|
||||
os.kill(os.getpid(), signal.SIGKILL)
|
||||
elif os.name == 'nt':
|
||||
# NT has os.abort()
|
||||
|
|
@ -214,8 +203,6 @@ def abort_now ():
|
|||
def get_aggregate (config):
|
||||
"""Get an aggregator instance with given configuration."""
|
||||
_urlqueue = urlqueue.UrlQueue(max_allowed_puts=config["maxnumurls"])
|
||||
connections = connection.ConnectionPool(config.get_connectionlimits(), wait=config["wait"])
|
||||
cookies = cookie.CookieJar()
|
||||
_robots_txt = robots_txt.RobotsTxt()
|
||||
return aggregator.Aggregate(config, _urlqueue, connections,
|
||||
cookies, _robots_txt)
|
||||
plugin_manager = plugins.PluginManager(config)
|
||||
return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager)
|
||||
|
|
|
|||
|
|
@ -17,54 +17,93 @@
|
|||
"""
|
||||
Aggregate needed object instances for checker threads.
|
||||
"""
|
||||
import time
|
||||
import threading
|
||||
from .. import log, LOG_CHECK, strformat
|
||||
import thread
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from .. import log, LOG_CHECK, strformat, cookies
|
||||
from ..decorators import synchronized
|
||||
from ..cache import urlqueue
|
||||
from . import logger, status, checker, cleanup
|
||||
from . import logger, status, checker, interrupt
|
||||
|
||||
|
||||
_w3_time_lock = threading.Lock()
|
||||
_threads_lock = threading.RLock()
|
||||
_download_lock = threading.Lock()
|
||||
_hosts_lock = threading.RLock()
|
||||
|
||||
def new_request_session(config):
|
||||
"""Create a new request session."""
|
||||
session = requests.Session()
|
||||
# XXX proxies
|
||||
if config["cookiefile"]:
|
||||
for cookie in cookies.from_file(config["cookiefile"]):
|
||||
session.cookies = requests.cookies.merge_cookies(session.cookies, cookie)
|
||||
return session
|
||||
|
||||
|
||||
class Aggregate (object):
|
||||
"""Store thread-safe data collections for checker threads."""
|
||||
|
||||
def __init__ (self, config, urlqueue, connections, cookies, robots_txt):
|
||||
def __init__ (self, config, urlqueue, robots_txt, plugin_manager):
|
||||
"""Store given link checking objects."""
|
||||
self.config = config
|
||||
self.urlqueue = urlqueue
|
||||
self.connections = connections
|
||||
self.cookies = cookies
|
||||
self.robots_txt = robots_txt
|
||||
self.logger = logger.Logger(config)
|
||||
self.threads = []
|
||||
self.last_w3_call = 0
|
||||
self.downloaded_bytes = 0
|
||||
self.request_sessions = {}
|
||||
self.robots_txt = robots_txt
|
||||
self.plugin_manager = plugin_manager
|
||||
self.times = {}
|
||||
requests_per_second = config["maxrequestspersecond"]
|
||||
self.wait_time_min = 1.0 / requests_per_second
|
||||
self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
def start_threads (self):
|
||||
"""Spawn threads for URL checking and status printing."""
|
||||
if self.config["status"]:
|
||||
t = status.Status(self.urlqueue, self.config.status_logger,
|
||||
self.config["status_wait_seconds"],
|
||||
self.config["maxrunseconds"])
|
||||
self.config["status_wait_seconds"])
|
||||
t.start()
|
||||
self.threads.append(t)
|
||||
if self.config["maxrunseconds"]:
|
||||
t = interrupt.Interrupt(self.config["maxrunseconds"])
|
||||
t.start()
|
||||
self.threads.append(t)
|
||||
t = cleanup.Cleanup(self.connections)
|
||||
t.start()
|
||||
self.threads.append(t)
|
||||
num = self.config["threads"]
|
||||
if num > 0:
|
||||
for dummy in range(num):
|
||||
t = checker.Checker(self.urlqueue, self.logger)
|
||||
t.start()
|
||||
t = checker.Checker(self.urlqueue, self.logger, self.add_request_session)
|
||||
self.threads.append(t)
|
||||
t.start()
|
||||
else:
|
||||
self.request_sessions[thread.get_ident()] = new_request_session(self.config)
|
||||
checker.check_url(self.urlqueue, self.logger)
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
def add_request_session(self):
|
||||
"""Add a request session for current thread."""
|
||||
session = new_request_session(self.config)
|
||||
self.request_sessions[thread.get_ident()] = session
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
def get_request_session(self):
|
||||
"""Get the request session for current thread."""
|
||||
return self.request_sessions[thread.get_ident()]
|
||||
|
||||
@synchronized(_hosts_lock)
|
||||
def wait_for_host(self, host):
|
||||
"""Throttle requests to one host."""
|
||||
t = time.time()
|
||||
if host in self.times:
|
||||
due_time = self.times[host]
|
||||
if due_time > t:
|
||||
wait = due_time - t
|
||||
time.sleep(wait)
|
||||
t = time.time()
|
||||
wait_time = random.uniform(self.wait_time_min, self.wait_time_max)
|
||||
self.times[host] = t + wait_time
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
def print_active_threads (self):
|
||||
"""Log all currently active threads."""
|
||||
|
|
@ -77,8 +116,8 @@ class Aggregate (object):
|
|||
first = False
|
||||
log.info(LOG_CHECK, name[12:])
|
||||
args = dict(
|
||||
num=len(self.threads),
|
||||
timeout=strformat.strduration_long(self.config["timeout"]),
|
||||
num=len([x for x in self.threads if x.getName().startswith("CheckThread-")]),
|
||||
timeout=strformat.strduration_long(self.config["aborttimeout"]),
|
||||
)
|
||||
log.info(LOG_CHECK, _("%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop.") % args)
|
||||
|
||||
|
|
@ -98,7 +137,7 @@ class Aggregate (object):
|
|||
"""Print still-active URLs and empty the URL queue."""
|
||||
self.print_active_threads()
|
||||
self.cancel()
|
||||
timeout = self.config["timeout"]
|
||||
timeout = self.config["aborttimeout"]
|
||||
try:
|
||||
self.urlqueue.join(timeout=timeout)
|
||||
except urlqueue.Timeout:
|
||||
|
|
@ -118,36 +157,9 @@ class Aggregate (object):
|
|||
self.cancel()
|
||||
for t in self.threads:
|
||||
t.stop()
|
||||
self.connections.clear()
|
||||
self.gather_statistics()
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
def is_finished (self):
|
||||
"""Determine if checking is finished."""
|
||||
self.remove_stopped_threads()
|
||||
return self.urlqueue.empty() and not self.threads
|
||||
|
||||
@synchronized(_w3_time_lock)
|
||||
def check_w3_time (self):
|
||||
"""Make sure the W3C validators are at most called once a second."""
|
||||
if time.time() - self.last_w3_call < 1:
|
||||
time.sleep(1)
|
||||
self.last_w3_call = time.time()
|
||||
|
||||
@synchronized(_download_lock)
|
||||
def add_download_data(self, url, data):
|
||||
"""Add given downloaded data.
|
||||
@param url: URL which data belongs to
|
||||
@ptype url: unicode
|
||||
@param data: downloaded data
|
||||
@ptype data: string
|
||||
"""
|
||||
self.downloaded_bytes += len(data)
|
||||
|
||||
def gather_statistics(self):
|
||||
"""Gather download and cache statistics and send them to the
|
||||
logger.
|
||||
"""
|
||||
robots_txt_stats = self.robots_txt.hits, self.robots_txt.misses
|
||||
download_stats = self.downloaded_bytes
|
||||
self.logger.add_statistics(robots_txt_stats, download_stats)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2011 Bastian Kleineidam
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -36,14 +36,17 @@ def check_url (urlqueue, logger):
|
|||
class Checker (task.LoggedCheckedTask):
|
||||
"""URL check thread."""
|
||||
|
||||
def __init__ (self, urlqueue, logger):
|
||||
def __init__ (self, urlqueue, logger, add_request_session):
|
||||
"""Store URL queue and logger."""
|
||||
super(Checker, self).__init__(logger)
|
||||
self.urlqueue = urlqueue
|
||||
self.origname = self.getName()
|
||||
self.add_request_session = add_request_session
|
||||
|
||||
def run_checked (self):
|
||||
"""Check URLs in the queue."""
|
||||
# construct per-thread HTTP/S requests session
|
||||
self.add_request_session()
|
||||
while not self.stopped(0):
|
||||
self.check_url()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,40 +0,0 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2007-2011 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""Cleanup task."""
|
||||
import time
|
||||
from . import task, console
|
||||
|
||||
|
||||
class Cleanup (task.CheckedTask):
|
||||
"""Cleanup task performing periodic cleanup of cached connections."""
|
||||
|
||||
def __init__ (self, connections):
|
||||
"""Store urlqueue object."""
|
||||
super(Cleanup, self).__init__()
|
||||
self.connections = connections
|
||||
|
||||
def run_checked (self):
|
||||
"""Print periodic status messages."""
|
||||
self.start_time = time.time()
|
||||
self.setName("Cleanup")
|
||||
# clean every 15 seconds
|
||||
while not self.stopped(15):
|
||||
self.connections.remove_expired()
|
||||
|
||||
def internal_error (self):
|
||||
"""Print internal error to console."""
|
||||
console.internal_error()
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2013 Bastian Kleineidam
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
46
linkcheck/director/interrupt.py
Normal file
46
linkcheck/director/interrupt.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""Status message handling"""
|
||||
import time
|
||||
from . import task
|
||||
from .. import log, LOG_CHECK, strformat
|
||||
|
||||
|
||||
class Interrupt (task.CheckedTask):
|
||||
"""Thread that raises KeyboardInterrupt after a specified duration.
|
||||
This gives us a portable SIGALRM implementation.
|
||||
The duration is checked every 5 seconds.
|
||||
"""
|
||||
WaitSeconds = 5
|
||||
|
||||
def __init__ (self, duration):
|
||||
"""Initialize the task.
|
||||
@param duration: raise KeyboardInterrupt after given number of seconds
|
||||
@ptype duration: int
|
||||
"""
|
||||
super(Interrupt, self).__init__()
|
||||
self.duration = duration
|
||||
|
||||
def run_checked (self):
|
||||
"""Wait and raise KeyboardInterrupt after."""
|
||||
self.start_time = time.time()
|
||||
self.setName("Interrupt")
|
||||
while not self.stopped(self.WaitSeconds):
|
||||
duration = time.time() - self.start_time
|
||||
if duration > self.duration:
|
||||
log.warn(LOG_CHECK, "Interrupt after %s" % strformat.strduration_long(duration))
|
||||
raise KeyboardInterrupt()
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -29,7 +29,6 @@ class Logger (object):
|
|||
self.loggers = [config['logger']]
|
||||
self.loggers.extend(config['fileoutput'])
|
||||
self.verbose = config["verbose"]
|
||||
self.complete = config["complete"]
|
||||
self.warnings = config["warnings"]
|
||||
|
||||
def start_log_output (self):
|
||||
|
|
@ -46,15 +45,8 @@ class Logger (object):
|
|||
for logger in self.loggers:
|
||||
logger.end_output()
|
||||
|
||||
def add_statistics(self, robots_txt_stats, download_stats):
|
||||
"""Add statistics to logger."""
|
||||
for logger in self.loggers:
|
||||
logger.add_statistics(robots_txt_stats, download_stats)
|
||||
|
||||
def do_print (self, url_data):
|
||||
"""Determine if URL entry should be logged or not."""
|
||||
if self.complete:
|
||||
return True
|
||||
if self.verbose:
|
||||
return True
|
||||
if self.warnings and url_data.warnings:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -22,7 +22,7 @@ from . import task
|
|||
class Status (task.LoggedCheckedTask):
|
||||
"""Thread that gathers and logs the status periodically."""
|
||||
|
||||
def __init__ (self, urlqueue, logger, wait_seconds, max_duration):
|
||||
def __init__ (self, urlqueue, logger, wait_seconds):
|
||||
"""Initialize the status logger task.
|
||||
@param urlqueue: the URL queue
|
||||
@ptype urlqueue: Urlqueue
|
||||
|
|
@ -30,33 +30,27 @@ class Status (task.LoggedCheckedTask):
|
|||
@ptype logger: console.StatusLogger
|
||||
@param wait_seconds: interval in seconds to report status
|
||||
@ptype wait_seconds: int
|
||||
@param max_duration: abort checking after given number of seconds
|
||||
@ptype max_duration: int or None
|
||||
"""
|
||||
super(Status, self).__init__(logger)
|
||||
self.urlqueue = urlqueue
|
||||
self.wait_seconds = wait_seconds
|
||||
assert self.wait_seconds >= 1
|
||||
self.first_wait = True
|
||||
self.max_duration = max_duration
|
||||
|
||||
def run_checked (self):
|
||||
"""Print periodic status messages."""
|
||||
self.start_time = time.time()
|
||||
self.setName("Status")
|
||||
if not self.first_wait:
|
||||
wait_seconds = self.wait_seconds
|
||||
else:
|
||||
# the first status should be after a second
|
||||
self.first_wait = False
|
||||
wait_seconds = 1
|
||||
# the first status should be after a second
|
||||
wait_seconds = 1
|
||||
first_wait = True
|
||||
while not self.stopped(wait_seconds):
|
||||
self.log_status()
|
||||
if first_wait:
|
||||
wait_seconds = self.wait_seconds
|
||||
first_wait = False
|
||||
|
||||
def log_status (self):
|
||||
"""Log a status message."""
|
||||
duration = time.time() - self.start_time
|
||||
if self.max_duration is not None and duration > self.max_duration:
|
||||
raise KeyboardInterrupt()
|
||||
checked, in_progress, queue = self.urlqueue.status()
|
||||
self.logger.log_status(checked, in_progress, queue, duration)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2011 Bastian Kleineidam
|
||||
# Copyright (C) 2006-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
import thread
|
||||
from ..decorators import notimplemented
|
||||
from .. import log, LOG_CHECK, threader
|
||||
from .. import threader
|
||||
from . import console
|
||||
|
||||
|
||||
|
|
@ -28,7 +28,6 @@ class CheckedTask (threader.StoppableThread):
|
|||
try:
|
||||
self.run_checked()
|
||||
except KeyboardInterrupt:
|
||||
log.warn(LOG_CHECK, "interrupt did not reach the main thread")
|
||||
thread.interrupt_main()
|
||||
except Exception:
|
||||
self.internal_error()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2005-2011 Bastian Kleineidam
|
||||
# Copyright (C) 2005-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -275,6 +275,12 @@ def is_accessable_by_others(filename):
|
|||
return mode & (stat.S_IRWXG | stat.S_IRWXO)
|
||||
|
||||
|
||||
def is_writable_by_others(filename):
|
||||
"""Check if file or directory is world writable."""
|
||||
mode = os.stat(filename)[stat.ST_MODE]
|
||||
return mode & stat.S_IWOTH
|
||||
|
||||
|
||||
@memoized
|
||||
def is_writable(filename):
|
||||
"""Check if
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2009-2010 Bastian Kleineidam
|
||||
# Copyright (C) 2009-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ from PyQt4 import QtCore, QtGui
|
|||
from .linkchecker_ui_main import Ui_MainWindow
|
||||
from .properties import set_properties, clear_properties
|
||||
from .statistics import set_statistics, clear_statistics
|
||||
from .debug import LinkCheckerDebug, LinkCheckerDebugMemory
|
||||
from .debug import LinkCheckerDebug
|
||||
from .logger import SignalLogger, GuiLogHandler, StatusLogger
|
||||
from .help import HelpWindow
|
||||
from .options import LinkCheckerOptions
|
||||
|
|
@ -37,7 +37,7 @@ from .settings import Settings
|
|||
from .recentdocs import RecentDocumentModel
|
||||
from .projects import openproject, saveproject, loadproject, ProjectExt
|
||||
from .. import configuration, checker, director, get_link_pat, \
|
||||
strformat, fileutil, LinkCheckerError, memoryutil
|
||||
strformat, fileutil, LinkCheckerError
|
||||
from ..containers import enum
|
||||
from .. import url as urlutil
|
||||
from ..checker import httpheaders
|
||||
|
|
@ -99,7 +99,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
|
|||
# init subdialogs
|
||||
self.options = LinkCheckerOptions(parent=self)
|
||||
self.debug = LinkCheckerDebug(parent=self)
|
||||
self.debugmemory = LinkCheckerDebugMemory(parent=self)
|
||||
self.checker = CheckerThread(parent=self)
|
||||
self.contextmenu = ContextMenu(parent=self)
|
||||
self.editor = EditorWindow(parent=self)
|
||||
|
|
@ -175,8 +174,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
|
|||
def set_idle ():
|
||||
"""Set application status to idle."""
|
||||
self.status = Status.idle
|
||||
if self.config["debugmemory"]:
|
||||
self.dump_memory()
|
||||
self.set_statusmsg(_("Check finished."))
|
||||
self.controlButton.clicked.disconnect(self.checker.cancel)
|
||||
self.checker.finished.connect(set_idle)
|
||||
|
|
@ -250,7 +247,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
|
|||
self.config["threads"] = 1
|
||||
else:
|
||||
self.config.reset_loglevel()
|
||||
self.config["debugmemory"] = data["debugmemory"]
|
||||
if data["warninglines"]:
|
||||
lines = data["warninglines"].splitlines()
|
||||
ro = re.compile(warninglines2regex(lines))
|
||||
|
|
@ -313,7 +309,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
|
|||
elif status == Status.checking:
|
||||
self.treeView.setSortingEnabled(False)
|
||||
self.debug.reset()
|
||||
self.debugmemory.reset()
|
||||
self.set_statusmsg(u"Checking site...")
|
||||
# disable commands
|
||||
self.menubar.setEnabled(False)
|
||||
|
|
@ -423,7 +418,7 @@ Version 2 or later.
|
|||
def cancel (self):
|
||||
"""Note that checking is canceled."""
|
||||
self.controlButton.setEnabled(False)
|
||||
duration = strformat.strduration_long(self.config["timeout"])
|
||||
duration = strformat.strduration_long(self.config["aborttimeout"])
|
||||
self.set_statusmsg(_(u"Closing active URLs with timeout %s...") % duration)
|
||||
|
||||
@QtCore.pyqtSlot()
|
||||
|
|
@ -436,16 +431,6 @@ Version 2 or later.
|
|||
else:
|
||||
raise ValueError("Invalid application status %r" % self.status)
|
||||
|
||||
def dump_memory (self):
|
||||
"""Dump memory to temporary file and inform user with a modal
|
||||
dialog where the file is."""
|
||||
self.set_statusmsg(_(u"Dumping memory statistics..."))
|
||||
filename = memoryutil.write_memory_dump()
|
||||
title = _(u"LinkChecker memory dump written")
|
||||
message = _(u"The memory dump has been written to `%(filename)s'.")
|
||||
attrs = dict(filename=filename)
|
||||
QtGui.QMessageBox.information(self, title, message % attrs)
|
||||
|
||||
def get_url (self):
|
||||
"""Return URL to check from the urlinput widget."""
|
||||
url = strformat.stripurl(unicode(self.urlinput.text()))
|
||||
|
|
@ -524,9 +509,10 @@ Version 2 or later.
|
|||
"""View URL source in editor window."""
|
||||
self.editor.setWindowTitle(u"View %s" % url)
|
||||
self.editor.setUrl(url)
|
||||
info, data = urlutil.get_content(url, proxy=self.config["proxy"])
|
||||
if (info, data) == (None, None):
|
||||
self.editor.setText(u"An error occurred retreiving URL `%s'." % url)
|
||||
data, info = urlutil.get_content(url, proxy=self.config["proxy"])
|
||||
if data is None:
|
||||
msg = u"An error occurred retreiving URL `%s': %s." % (url, info)
|
||||
self.editor.setText(msg)
|
||||
else:
|
||||
content_type = httpheaders.get_content_type(info)
|
||||
if not content_type:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2008-2011 Bastian Kleineidam
|
||||
# Copyright (C) 2008-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2009-2011 Bastian Kleineidam
|
||||
# Copyright (C) 2009-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2009-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2009-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -41,23 +41,3 @@ class LinkCheckerDebug (QtGui.QDialog, Ui_DebugDialog):
|
|||
def getText (self):
|
||||
"""Get debug info as string."""
|
||||
return self.textEdit.toPlainText()
|
||||
|
||||
|
||||
class LinkCheckerDebugMemory (QtGui.QDialog, Ui_DebugDialog):
|
||||
"""Show memory debugging output."""
|
||||
|
||||
def __init__ (self, parent=None):
|
||||
"""Setup the debug memory dialog."""
|
||||
super(LinkCheckerDebugMemory, self).__init__(parent)
|
||||
self.setupUi(self)
|
||||
font = QtGui.QFont("Consolas", 11)
|
||||
font.setFixedPitch(True)
|
||||
self.textEdit.document().setDefaultFont(font)
|
||||
|
||||
def reset (self):
|
||||
"""Clear memory info."""
|
||||
self.textEdit.clear()
|
||||
|
||||
def setText (self, text):
|
||||
"""Set memory debug info."""
|
||||
return self.textEdit.setPlainText(text)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2010-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2009-2011 Bastian Kleineidam
|
||||
# Copyright (C) 2009-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2010-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
# Form implementation generated from reading ui file 'ui/debug.ui'
|
||||
#
|
||||
# Created: Mon Dec 12 19:00:37 2011
|
||||
# by: PyQt4 UI code generator 4.8.6
|
||||
# Created: Fri Feb 28 21:24:59 2014
|
||||
# by: PyQt4 UI code generator 4.9.3
|
||||
#
|
||||
# WARNING! All changes made in this file will be lost!
|
||||
|
||||
|
|
@ -19,7 +19,6 @@ class Ui_DebugDialog(object):
|
|||
DebugDialog.setObjectName(_fromUtf8("DebugDialog"))
|
||||
DebugDialog.setWindowModality(QtCore.Qt.ApplicationModal)
|
||||
DebugDialog.resize(564, 547)
|
||||
DebugDialog.setWindowTitle(_("LinkChecker debug log"))
|
||||
self.verticalLayout = QtGui.QVBoxLayout(DebugDialog)
|
||||
self.verticalLayout.setObjectName(_fromUtf8("verticalLayout"))
|
||||
self.frame = QtGui.QFrame(DebugDialog)
|
||||
|
|
@ -40,5 +39,5 @@ class Ui_DebugDialog(object):
|
|||
QtCore.QMetaObject.connectSlotsByName(DebugDialog)
|
||||
|
||||
def retranslateUi(self, DebugDialog):
|
||||
pass
|
||||
DebugDialog.setWindowTitle(_("LinkChecker debug log"))
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
# Form implementation generated from reading ui file 'ui/main.ui'
|
||||
#
|
||||
# Created: Tue Nov 6 21:47:39 2012
|
||||
# Created: Fri Feb 28 21:24:58 2014
|
||||
# by: PyQt4 UI code generator 4.9.3
|
||||
#
|
||||
# WARNING! All changes made in this file will be lost!
|
||||
|
|
@ -679,29 +679,6 @@ class Ui_MainWindow(object):
|
|||
self.stats_url_maxlen.setOpenExternalLinks(True)
|
||||
self.stats_url_maxlen.setObjectName(_fromUtf8("stats_url_maxlen"))
|
||||
self.gridLayout_3.addWidget(self.stats_url_maxlen, 1, 1, 1, 1)
|
||||
self.label_14 = QtGui.QLabel(self.groupBox_2)
|
||||
sizePolicy = QtGui.QSizePolicy(QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Preferred)
|
||||
sizePolicy.setHorizontalStretch(0)
|
||||
sizePolicy.setVerticalStretch(0)
|
||||
sizePolicy.setHeightForWidth(self.label_14.sizePolicy().hasHeightForWidth())
|
||||
self.label_14.setSizePolicy(sizePolicy)
|
||||
self.label_14.setAlignment(QtCore.Qt.AlignRight|QtCore.Qt.AlignTrailing|QtCore.Qt.AlignVCenter)
|
||||
self.label_14.setObjectName(_fromUtf8("label_14"))
|
||||
self.gridLayout_3.addWidget(self.label_14, 1, 2, 1, 1)
|
||||
self.stats_domains = QtGui.QLabel(self.groupBox_2)
|
||||
sizePolicy = QtGui.QSizePolicy(QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Preferred)
|
||||
sizePolicy.setHorizontalStretch(0)
|
||||
sizePolicy.setVerticalStretch(0)
|
||||
sizePolicy.setHeightForWidth(self.stats_domains.sizePolicy().hasHeightForWidth())
|
||||
self.stats_domains.setSizePolicy(sizePolicy)
|
||||
self.stats_domains.setMinimumSize(QtCore.QSize(30, 0))
|
||||
self.stats_domains.setFrameShape(QtGui.QFrame.StyledPanel)
|
||||
self.stats_domains.setFrameShadow(QtGui.QFrame.Sunken)
|
||||
self.stats_domains.setText(_fromUtf8(""))
|
||||
self.stats_domains.setTextFormat(QtCore.Qt.RichText)
|
||||
self.stats_domains.setOpenExternalLinks(True)
|
||||
self.stats_domains.setObjectName(_fromUtf8("stats_domains"))
|
||||
self.gridLayout_3.addWidget(self.stats_domains, 1, 3, 1, 1)
|
||||
self.verticalLayout_2.addWidget(self.groupBox_2)
|
||||
self.horizontalLayout.addWidget(self.statistics)
|
||||
self.verticalLayout.addLayout(self.horizontalLayout)
|
||||
|
|
@ -831,7 +808,6 @@ class Ui_MainWindow(object):
|
|||
self.label_18.setText(_("Min. length"))
|
||||
self.label_20.setText(_("Avg. length"))
|
||||
self.label_19.setText(_("Max. length"))
|
||||
self.label_14.setText(_("Domains"))
|
||||
self.menuEdit.setTitle(_("&Edit"))
|
||||
self.menuFile.setTitle(_("&File"))
|
||||
self.menuHelp.setTitle(_("&Help"))
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
# Form implementation generated from reading ui file 'ui/options.ui'
|
||||
#
|
||||
# Created: Sun Jun 10 11:51:42 2012
|
||||
# by: PyQt4 UI code generator 4.9.1
|
||||
# Created: Fri Feb 28 21:24:59 2014
|
||||
# by: PyQt4 UI code generator 4.9.3
|
||||
#
|
||||
# WARNING! All changes made in this file will be lost!
|
||||
|
||||
|
|
@ -28,6 +28,7 @@ class Ui_Options(object):
|
|||
self.widget = QtGui.QWidget(self.groupBox_2)
|
||||
self.widget.setObjectName(_fromUtf8("widget"))
|
||||
self.formLayout = QtGui.QFormLayout(self.widget)
|
||||
self.formLayout.setFieldGrowthPolicy(QtGui.QFormLayout.ExpandingFieldsGrow)
|
||||
self.formLayout.setMargin(0)
|
||||
self.formLayout.setObjectName(_fromUtf8("formLayout"))
|
||||
self.label = QtGui.QLabel(self.widget)
|
||||
|
|
@ -60,14 +61,6 @@ class Ui_Options(object):
|
|||
self.debug.setText(_fromUtf8(""))
|
||||
self.debug.setObjectName(_fromUtf8("debug"))
|
||||
self.formLayout.setWidget(2, QtGui.QFormLayout.FieldRole, self.debug)
|
||||
self.label_7 = QtGui.QLabel(self.widget)
|
||||
self.label_7.setToolTip(_fromUtf8(""))
|
||||
self.label_7.setObjectName(_fromUtf8("label_7"))
|
||||
self.formLayout.setWidget(3, QtGui.QFormLayout.LabelRole, self.label_7)
|
||||
self.debugmemory = QtGui.QCheckBox(self.widget)
|
||||
self.debugmemory.setText(_fromUtf8(""))
|
||||
self.debugmemory.setObjectName(_fromUtf8("debugmemory"))
|
||||
self.formLayout.setWidget(3, QtGui.QFormLayout.FieldRole, self.debugmemory)
|
||||
self.verticalLayout.addWidget(self.widget)
|
||||
spacerItem = QtGui.QSpacerItem(20, 10, QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Expanding)
|
||||
self.verticalLayout.addItem(spacerItem)
|
||||
|
|
@ -143,7 +136,6 @@ class Ui_Options(object):
|
|||
self.label_2.setText(_("Verbose output"))
|
||||
self.verbose.setToolTip(_("Log all checked URLs once. Default is to log only errors and warnings."))
|
||||
self.label_4.setText(_("Debug"))
|
||||
self.label_7.setText(_("Debug memory usage"))
|
||||
self.label_5.setText(_("Warn when one of these strings are found (one per line):"))
|
||||
self.label_6.setText(_("Ignore URLs matching one of these patterns (one per line):"))
|
||||
self.groupBox.setTitle(_("Configuration file"))
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2009-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2009-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -19,7 +19,7 @@ import os
|
|||
from PyQt4 import QtGui
|
||||
from .linkchecker_ui_options import Ui_Options
|
||||
from .editor import EditorWindow
|
||||
from ..fileutil import is_writable, has_module
|
||||
from ..fileutil import is_writable
|
||||
from .. import configuration
|
||||
|
||||
|
||||
|
|
@ -46,11 +46,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
|
|||
self.recursionlevel.setValue(-1)
|
||||
self.verbose.setChecked(False)
|
||||
self.debug.setChecked(False)
|
||||
self.debugmemory.setChecked(False)
|
||||
if not has_module("meliae"):
|
||||
self.debugmemory.setEnabled(False)
|
||||
from ..memoryutil import MemoryDebugMsg
|
||||
self.debugmemory.setToolTip(MemoryDebugMsg)
|
||||
self.warninglines.setPlainText(u"")
|
||||
self.ignorelines.setPlainText(u"")
|
||||
|
||||
|
|
@ -69,7 +64,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
|
|||
"""Return option data as dictionary."""
|
||||
return dict(
|
||||
debug=self.debug.isChecked(),
|
||||
debugmemory=self.debugmemory.isChecked(),
|
||||
verbose=self.verbose.isChecked(),
|
||||
recursionlevel=self.recursionlevel.value(),
|
||||
warninglines=unicode(self.warninglines.toPlainText()),
|
||||
|
|
@ -80,8 +74,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
|
|||
"""Set GUI options from given data."""
|
||||
if data.get("debug") is not None:
|
||||
self.debug.setChecked(data["debug"])
|
||||
if data.get("debugmemory") is not None:
|
||||
self.debugmemory.setChecked(data["debugmemory"])
|
||||
if data.get("verbose") is not None:
|
||||
self.verbose.setChecked(data["verbose"])
|
||||
if data.get("recursionlevel") is not None:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2012 Bastian Kleineidam
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -57,9 +57,6 @@ class ProjectParser (confparse.LCConfigParser):
|
|||
return
|
||||
data = {}
|
||||
option = "debug"
|
||||
if self.has_option(section, option):
|
||||
data[option] = self.getboolean(section, option)
|
||||
option = "debugmemory"
|
||||
if self.has_option(section, option):
|
||||
data[option] = self.getboolean(section, option)
|
||||
option = "verbose"
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2010-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -38,8 +38,8 @@ def set_properties (widget, data):
|
|||
widget.prop_dltime.setText(_("%.3f seconds") % data.dltime)
|
||||
else:
|
||||
widget.prop_dltime.setText(u"")
|
||||
if data.dlsize >= 0:
|
||||
widget.prop_size.setText(strformat.strsize(data.dlsize))
|
||||
if data.size >= 0:
|
||||
widget.prop_size.setText(strformat.strsize(data.size))
|
||||
else:
|
||||
widget.prop_size.setText(u"")
|
||||
if data.modified:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -85,10 +85,10 @@ class Settings (object):
|
|||
|
||||
def read_options (self):
|
||||
"""Return stored GUI options."""
|
||||
data = dict(debug=None, debugmemory=None, verbose=None,
|
||||
data = dict(debug=None, verbose=None,
|
||||
recursionlevel=None, warninglines=None, ignorelines=None)
|
||||
self.settings.beginGroup('output')
|
||||
for key in ("debug", "debugmemory", "verbose"):
|
||||
for key in ("debug", "verbose"):
|
||||
if self.settings.contains(key):
|
||||
data[key] = self.settings.value(key).toBool()
|
||||
self.settings.endGroup()
|
||||
|
|
@ -116,7 +116,7 @@ class Settings (object):
|
|||
def save_options (self, data):
|
||||
"""Save GUI options."""
|
||||
self.settings.beginGroup('output')
|
||||
for key in ("debug", "debugmemory", "verbose"):
|
||||
for key in ("debug", "verbose"):
|
||||
self.settings.setValue(key, QtCore.QVariant(data[key]))
|
||||
self.settings.endGroup()
|
||||
self.settings.beginGroup('checking')
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010-2011 Bastian Kleineidam
|
||||
# Copyright (C) 2010-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -19,7 +19,6 @@ from ..logger import ContentTypes
|
|||
|
||||
def set_statistics (widget, statistics):
|
||||
"""Set statistic information in given widget."""
|
||||
widget.stats_domains.setText(u"%d" % len(statistics.domains))
|
||||
widget.stats_url_minlen.setText(u"%d" % statistics.min_url_length)
|
||||
widget.stats_url_maxlen.setText(u"%d" % statistics.max_url_length)
|
||||
widget.stats_url_avglen.setText(u"%d" % statistics.avg_url_length)
|
||||
|
|
@ -38,7 +37,6 @@ def set_statistics (widget, statistics):
|
|||
|
||||
def clear_statistics (widget):
|
||||
"""Reset statistic information in given widget."""
|
||||
widget.stats_domains.setText(u"")
|
||||
widget.stats_url_minlen.setText(u"")
|
||||
widget.stats_url_maxlen.setText(u"")
|
||||
widget.stats_url_avglen.setText(u"")
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1402,53 +1402,6 @@
|
|||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="2">
|
||||
<widget class="QLabel" name="label_14">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Domains</string>
|
||||
</property>
|
||||
<property name="alignment">
|
||||
<set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="1" column="3">
|
||||
<widget class="QLabel" name="stats_domains">
|
||||
<property name="sizePolicy">
|
||||
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
|
||||
<horstretch>0</horstretch>
|
||||
<verstretch>0</verstretch>
|
||||
</sizepolicy>
|
||||
</property>
|
||||
<property name="minimumSize">
|
||||
<size>
|
||||
<width>30</width>
|
||||
<height>0</height>
|
||||
</size>
|
||||
</property>
|
||||
<property name="frameShape">
|
||||
<enum>QFrame::StyledPanel</enum>
|
||||
</property>
|
||||
<property name="frameShadow">
|
||||
<enum>QFrame::Sunken</enum>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string/>
|
||||
</property>
|
||||
<property name="textFormat">
|
||||
<enum>Qt::RichText</enum>
|
||||
</property>
|
||||
<property name="openExternalLinks">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
|
|
|
|||
|
|
@ -29,6 +29,9 @@
|
|||
<item>
|
||||
<widget class="QWidget" name="widget" native="true">
|
||||
<layout class="QFormLayout" name="formLayout">
|
||||
<property name="fieldGrowthPolicy">
|
||||
<enum>QFormLayout::ExpandingFieldsGrow</enum>
|
||||
</property>
|
||||
<item row="0" column="0">
|
||||
<widget class="QLabel" name="label">
|
||||
<property name="toolTip">
|
||||
|
|
@ -104,23 +107,6 @@
|
|||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="0">
|
||||
<widget class="QLabel" name="label_7">
|
||||
<property name="toolTip">
|
||||
<string extracomment="When checking finishes, write a memory dump to a temporary file. The memory dump is written both when checking finishes normally and when checking gets canceled."/>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Debug memory usage</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item row="3" column="1">
|
||||
<widget class="QCheckBox" name="debugmemory">
|
||||
<property name="text">
|
||||
<string/>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010-2011 Bastian Kleineidam
|
||||
# Copyright (C) 2010-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2011 Bastian Kleineidam
|
||||
# Copyright (C) 2011-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2008-2009 Bastian Kleineidam
|
||||
# Copyright (C) 2008-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2001-2010 Bastian Kleineidam
|
||||
# Copyright (C) 2001-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -201,9 +201,7 @@ class LinkFinder (TagFinder):
|
|||
def start_element (self, tag, attrs):
|
||||
"""Search for links and store found URLs in a list."""
|
||||
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
||||
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d",
|
||||
self.parser.lineno(), self.parser.column(),
|
||||
self.parser.last_lineno(), self.parser.last_column())
|
||||
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
|
||||
if tag == "base" and not self.base_ref:
|
||||
self.base_ref = unquote(attrs.get_true("href", u''))
|
||||
tagattrs = self.tags.get(tag, [])
|
||||
|
|
@ -282,7 +280,6 @@ class LinkFinder (TagFinder):
|
|||
return
|
||||
for u in urls:
|
||||
assert isinstance(u, unicode) or u is None, repr(u)
|
||||
log.debug(LOG_CHECK,
|
||||
u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
|
||||
log.debug(LOG_CHECK, u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
|
||||
self.callback(u, self.parser.last_lineno(),
|
||||
self.parser.last_column(), name, base)
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,86 +1,9 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Various HTTP utils with a free license
|
||||
from cStringIO import StringIO
|
||||
from . import gzip2 as gzip
|
||||
from . import httplib2 as httplib
|
||||
from . import log, LOG_CHECK, fileutil
|
||||
import re
|
||||
import zlib
|
||||
import urllib
|
||||
import urllib2
|
||||
from . import fileutil
|
||||
import base64
|
||||
|
||||
|
||||
###########################################################################
|
||||
# urlutils.py - Simplified urllib handling
|
||||
#
|
||||
# Written by Chris Lawrence <lawrencc@debian.org>
|
||||
# (C) 1999-2002 Chris Lawrence
|
||||
#
|
||||
# This program is freely distributable per the following license:
|
||||
#
|
||||
## Permission to use, copy, modify, and distribute this software and its
|
||||
## documentation for any purpose and without fee is hereby granted,
|
||||
## provided that the above copyright notice appears in all copies and that
|
||||
## both that copyright notice and this permission notice appear in
|
||||
## supporting documentation.
|
||||
##
|
||||
## I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
|
||||
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL I
|
||||
## BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
|
||||
## DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
|
||||
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
|
||||
## SOFTWARE.
|
||||
|
||||
def decode (page):
|
||||
"""Gunzip or deflate a compressed page."""
|
||||
log.debug(LOG_CHECK, "page info %d %s", page.code, str(page.info()))
|
||||
encoding = page.info().get("Content-Encoding")
|
||||
if encoding in ('gzip', 'x-gzip', 'deflate'):
|
||||
# cannot seek in socket descriptors, so must get content now
|
||||
content = page.read()
|
||||
try:
|
||||
if encoding == 'deflate':
|
||||
fp = StringIO(zlib.decompress(content))
|
||||
else:
|
||||
fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
|
||||
except zlib.error as msg:
|
||||
log.debug(LOG_CHECK, "uncompressing had error "
|
||||
"%s, assuming non-compressed content", str(msg))
|
||||
fp = StringIO(content)
|
||||
# remove content-encoding header
|
||||
headers = httplib.HTTPMessage(StringIO(""))
|
||||
ceheader = re.compile(r"(?i)content-encoding:")
|
||||
for h in page.info().keys():
|
||||
if not ceheader.match(h):
|
||||
headers[h] = page.info()[h]
|
||||
newpage = urllib.addinfourl(fp, headers, page.geturl())
|
||||
newpage.code = page.code
|
||||
newpage.msg = page.msg
|
||||
return newpage
|
||||
return page
|
||||
|
||||
|
||||
class HttpWithGzipHandler (urllib2.HTTPHandler):
|
||||
"""Support gzip encoding."""
|
||||
def http_open (self, req):
|
||||
"""Send request and decode answer."""
|
||||
return decode(urllib2.HTTPHandler.http_open(self, req))
|
||||
|
||||
|
||||
if hasattr(httplib, 'HTTPS'):
|
||||
class HttpsWithGzipHandler (urllib2.HTTPSHandler):
|
||||
"""Support gzip encoding."""
|
||||
|
||||
def https_open (self, req):
|
||||
"""Send request and decode answer."""
|
||||
return decode(urllib2.HTTPSHandler.https_open(self, req))
|
||||
|
||||
# end of urlutils.py routines
|
||||
###########################################################################
|
||||
|
||||
|
||||
def encode_multipart_formdata(fields, files=None):
|
||||
"""
|
||||
From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2012 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -172,7 +172,7 @@ def get_configuration(form, out):
|
|||
config["logger"] = config.logger_new('html', fd=out, encoding=HTML_ENCODING)
|
||||
config["threads"] = 2
|
||||
if "anchors" in form:
|
||||
config["anchors"] = True
|
||||
config["enabledplugins"].append("AnchorCheck")
|
||||
if "errors" not in form:
|
||||
config["verbose"] = True
|
||||
# avoid checking of local files or other nasty stuff
|
||||
|
|
@ -246,15 +246,16 @@ def format_error (why):
|
|||
@return: HTML page content
|
||||
@rtype: unicode
|
||||
"""
|
||||
return _("""<html><head>
|
||||
return _("""<!DOCTYPE HTML>
|
||||
<html><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<title>LinkChecker Online Error</title></head>
|
||||
<body text=#192c83 bgcolor=#fff7e5 link=#191c83 vlink=#191c83 alink=#191c83>
|
||||
<blockquote>
|
||||
<b>Error: %s</b><br>
|
||||
<b>Error: %s</b><br/>
|
||||
The LinkChecker Online script has encountered an error. Please ensure
|
||||
that your provided URL link begins with <code>http://</code> and
|
||||
contains only these characters: <code>A-Za-z0-9./_~-</code><br><br>
|
||||
contains only these characters: <code>A-Za-z0-9./_~-</code><br/><br/>
|
||||
Errors are logged.
|
||||
</blockquote>
|
||||
</body>
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue