Introduce check plugins, use Python requests for http/s connections, and some code cleanups and improvements.

This commit is contained in:
Bastian Kleineidam 2014-03-01 00:12:34 +01:00
parent adc17fbe77
commit 7b34be590b
194 changed files with 4817 additions and 8903 deletions

1
.gitignore vendored
View file

@ -36,3 +36,4 @@ Changelog.linkchecker*
/todo
/alexa*.log
/testresults.txt
/linkchecker.prof

View file

@ -18,11 +18,11 @@ DEBORIGFILE:=$(DEBUILDDIR)/$(LAPPNAME)_$(VERSION).orig.tar.xz
DEBPACKAGEDIR:=$(DEBUILDDIR)/$(APPNAME)-$(VERSION)
FILESCHECK_URL:=http://localhost/~calvin/
SRCDIR:=${HOME}/src
PY_FILES_DIRS:=linkcheck tests *.py linkchecker linkchecker-nagios linkchecker-gui cgi-bin config doc
PY_FILES_DIRS:=linkcheck tests *.py linkchecker linkchecker-nagios linkchecker-gui cgi-bin config doc/examples
MYPY_FILES_DIRS:=linkcheck/HtmlParser linkcheck/checker \
linkcheck/cache linkcheck/configuration linkcheck/director \
linkcheck/htmlutil linkcheck/logger linkcheck/network \
linkcheck/bookmarks \
linkcheck/bookmarks linkcheck/plugins linkcheck/parser \
linkcheck/gui/__init__.py \
linkcheck/gui/checker.py \
linkcheck/gui/contextmenu.py \
@ -192,7 +192,7 @@ filescheck: localbuild
done
update-copyright:
update-copyright --holder="Bastian Kleineidam"
update-copyright --holder="Bastian Kleineidam" $(PY_FILES_DIRS)
releasecheck: check update-certificates
@if egrep -i "xx\.|xxxx|\.xx" doc/changelog.txt > /dev/null; then \

View file

@ -17,7 +17,7 @@ create table linksdb (
name varchar(256),
checktime int,
dltime int,
dlsize int,
size int,
cached int,
level int not null,
modified varchar(256)

View file

@ -131,32 +131,18 @@
#threads=100
# connection timeout in seconds
#timeout=60
# check anchors?
#anchors=0
# Time to wait for checks to finish after the user aborts the first time
# (with Ctrl-C or the abort button).
#aborttimeout=300
# The recursion level determines how many times links inside pages are followed.
#recursionlevel=1
# supply a regular expression for which warnings are printed if found
# in any HTML files.
#warningregex=(Oracle DB Error|Page Not Found|badsite\.example\.com)
# Basic NNTP server. Overrides NNTP_SERVER environment variable.
# warn if size info exceeds given maximum of bytes
#warnsizebytes=2000
#nntpserver=
# check HTML or CSS syntax with the W3C online validator
#checkhtml=1
#checkcss=1
# scan URL content for viruses with ClamAV
#scanvirus=1
# ClamAV config file
#clamavconf=/etc/clamav/clamd.conf
# Send and store cookies
#cookies=1
# parse a cookiefile for initial cookie data
#cookiefile=/path/to/cookies.txt
# User-Agent header string to send to HTTP web servers
# Note that robots.txt are always checked with the original User-Agent.
#useragent=Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
# Pause the given number of seconds between two subsequent connection
# requests to the same host.
#pause=0
# When checking finishes, write a memory dump to a temporary file.
# The memory dump is written both when checking finishes normally
# and when checking gets canceled.
@ -175,22 +161,16 @@
# Check SSL certificates. Set to an absolute pathname for a custom
# CA cert bundle to use. Set to zero to disable SSL certificate verification.
#sslverify=1
# Check that SSL certificates are at least the given number of days valid.
# The number must not be negative.
# If the number of days is zero a warning is printed only for certificates
# that are already expired.
# The default number of days is 14.
#sslcertwarndays=14
# Stop checking new URLs after the given number of seconds. Same as if the
# user hits Ctrl-C after X seconds.
#maxrunseconds=600
# Maximum number of URLs to check. New URLs will not be queued after the
# given number of URLs is checked.
#maxnumurls=153
# Maximum number of connections to one single host for different connection types.
#maxconnectionshttp=10
#maxconnectionshttps=10
#maxconnectionsftp=2
# Maximum number of requests per second to one host.
#maxrequestspersecond=10
# Allowed URL schemes as a comma-separated list.
#allowedschemes=http,https
##################### filtering options ##########################
[filtering]
@ -211,11 +191,12 @@
# recognized warnings). Add a comma-separated list of warnings here
# that prevent a valid URL from being logged. Note that the warning
# will be logged in invalid URLs.
#ignorewarnings=url-unicode-domain,anchor-not-found
#ignorewarnings=url-unicode-domain
# Regular expression to add more URLs recognized as internal links.
# Default is that URLs given on the command line are internal.
#internlinks=^http://www\.example\.net/
# Check external links
#checkextern=1
##################### password authentication ##########################
@ -247,3 +228,30 @@
#loginextrafields=
# name1:value1
# name 2:value 2
############################ Plugins ###################################
#
# uncomment sections to enable plugins
# Check HTML anchors
#[AnchorCheck]
# Add country info to URLs
#[LocationInfo]
# Run W3C syntax checks
#[CssSyntaxCheck]
#[HtmlSyntaxCheck]
# Search for regular expression in page contents
#[RegexCheck]
#warningregex=Oracle Error
# Search for viruses in page contents
#[VirusCheck]
#clamavconf=/etc/clamav/clamd.conf
# Check that SSL certificates are at least the given number of days valid.
#[SslCertificateCheck]
#sslcertwarndays=14

View file

@ -1,3 +1,34 @@
8.7 "" (released xx.xx.2014)
Features:
- checking: Support connection and content check plugins.
- checking: Move lots of custom checks like Antivirus and syntax
checks into plugins (see upgrading.txt for more info).
- checking: Add options to limit the number of requests per second,
allowed URL schemes and maximum file or download size.
Changes:
- checking: Use the Python requests module for HTTP and HTTPS requests.
- logging: Removed download, domains and robots.txt statistics.
- logging: HTML output is now in HTML5.
- checking: Removed 301 warning since 301 redirects are used
a lot without updating the old URL links.
- checking: Disallowed access by robots.txt is an info now, not
a warning. Otherwise it produces a lot of warnings which
is counter-productive.
- checking: Do not check SMTP connections for mailto: URLs anymore.
It resulted in lots of false warnings since spam prevention
usually disallows direct SMTP connections from unrecognized
client IPs.
- checking: Only internal URLs are checked as default. To check
external urls use --check-extern.
Fixes:
- logging: Status was printed every second regardless of the
configured wait time.
- checking: Several speed and memory usage improvements.
8.6 "About Time" (released 8.1.2014)
Changes:

1994
doc/de.po

File diff suppressed because it is too large Load diff

View file

@ -41,16 +41,15 @@ Antivirusprüfung
.IP \(bu
ein Kommandozeilenprogramm, GUI und web interface
.SH BEISPIELE
Der häufigste Gebrauchsfall prüft die angegebene Domäne rekursiv,
inklusive aller einzelnen nach außen zeigenden Verknüpfungen:
\fBlinkchecker http://www.example.net/\fP
The most common use checks the given domain recursively:
\fBlinkchecker http://www.example.com/\fP
.br
Beachten Sie dass dies die komplette Domäne überprüft, welche aus mehreren
tausend URLs bestehen kann. Benutzen Sie die Option \fB\-r\fP, um die
Rekursionstiefe zu beschränken.
.br
Prüfe keine \fBmailto:\fP URLs. Alle anderen Verknüpfungen werden wie üblich geprüft:
\fBlinkchecker \-\-ignore\-url=^mailto: mysite.example.org\fP
Don't check URLs with \fB/secret\fP in its name. All other links are checked as usual:
\fBlinkchecker \-\-ignore\-url=/secret mysite.example.com\fP
.br
Überprüfung einer lokalen HTML Datei unter Unix:
\fBlinkchecker ../bla.html\fP
@ -61,8 +60,8 @@ Prüfe keine \fBmailto:\fP URLs. Alle anderen Verknüpfungen werden wie üblich
Sie können den \fBhttp://\fP URL Anteil weglassen wenn die Domäne mit \fBwww.\fP beginnt:
\fBlinkchecker www.example.com\fP
.br
Sie können den \fBftp://\fP URL Anteil weglassen wenn die Domäne mit \fBftp.\fP beginnt:
\fBlinkchecker \-r0 ftp.example.org\fP
You can skip the \fBftp://\fP url part if the domain starts with \fBftp.\fP:
\fBlinkchecker \-r0 ftp.example.com\fP
.br
Erzeuge einen Sitemap Graphen und konvertiere ihn mit dem graphviz dot Programm:
\fBlinkchecker \-odot \-v www.example.com | dot \-Tps > sitemap.ps\fP
@ -88,19 +87,12 @@ positive Nummer an.
.TP
\fB\-V\fP, \fB\-\-version\fP
Gebe die Version aus und beende das Programm.
.TP
\fB\-\-list\-plugins\fP
Print available check plugins and exit.
.
.SS Ausgabeoptionen
.TP
\fB\-\-check\-css\fP
Prüfe Syntax von CSS URLs mit dem W3C Online Validator.
.TP
\fB\-\-check\-html\fP
Prüfe Syntax von HTML URLs mit dem W3C Online Validator.
.TP
\fB\-\-complete\fP
Gebe alle geprüften URLs aus. Standard ist es, doppelte URLs nur einmal
auszugeben.
.TP
\fB\-D\fP\fINAME\fP, \fB\-\-debug=\fP\fINAME\fP
Gebe Testmeldungen aus für den angegebenen Logger. Verfügbare Logger sind
\fBcmdline\fP, \fBchecking\fP,\fBcache\fP, \fBgui\fP, \fBdns\fP und \fBall\fP. Die Angabe
@ -144,12 +136,6 @@ lokalen Spracheinstellung. Gültige Enkodierungen sind aufgelistet unter
Keine Ausgabe, ein Alias für \fB\-o none\fP. Dies ist nur in Verbindung mit
\fB\-F\fP nützlich.
.TP
\fB\-\-scan\-virus\fP
Prüfe Inhalt von URLs auf Viren mit ClamAV.
.TP
\fB\-\-trace\fP
Trace\-Information ausgeben.
.TP
\fB\-v\fP, \fB\-\-verbose\fP
Gebe alle geprüften URLs aus. Standard ist es, nur fehlerhafte URLs und
Warnungen auszugeben.
@ -168,27 +154,15 @@ werden können, zum Beispiel "(Diese Seite ist umgezogen|Oracle
Applikationsfehler)".
.br
Siehe Abschnitt \fBREGULAR EXPRESSIONS\fP für weitere Infos.
.TP
\fB\-\-warning\-size\-bytes=\fP\fINUMMER\fP
Gebe eine Warnung aus, wenn die Inhaltsgröße bekannt ist und die angegebene
Anzahl von Bytes übersteigt.
.
.SS "Optionen zum Prüfen"
.TP
\fB\-a\fP, \fB\-\-anchors\fP
Prüfe HTTP Ankerverweise. Standard ist, Ankerverweise nicht zu prüfen. Diese
Option aktiviert die Ausgabe der Warnung \fBurl\-anchor\-not\-found\fP.
.TP
\fB\-C\fP, \fB\-\-cookies\fP
Akzeptiere und sende HTTP Cookies nach der RFC 2109. Lediglich Cookies, die
zum ursprünglichen Server zurückgesendet werden, werden akzeptiert.
Gesendete und akzeptierte Cookies werden als zusätzliche Loginformation
aufgeführt.
.TP
\fB\-\-cookiefile=\fP\fIDATEINAME\fP
Lese eine Datei mit Cookie\-Daten. Das Cookie Datenformat wird weiter unten
erklärt.
.TP
\fB\-\-check\-extern\fP
Check external URLs.
.TP
\fB\-\-ignore\-url=\fP\fIREGEX\fP
URLs welche dem angegebenen regulären Ausdruck entsprechen werden ignoriert
und nicht geprüft.
@ -215,11 +189,6 @@ Liest ein Passwort von der Kommandozeile und verwende es für HTTP und FTP
Autorisierung. Für FTP ist das Standardpasswort \fBanonymous@\fP. Für HTTP gibt
es kein Standardpasswort. Siehe auch \fB\-u\fP.
.TP
\fB\-P\fP\fINUMMER\fP, \fB\-\-pause=\fP\fINUMMER\fP
Pausiere die angegebene Anzahl von Sekunden zwischen zwei aufeinander
folgenden Verbindungen zum demselben Rechner. Standard ist keine Pause
zwischen Verbindungen.
.TP
\fB\-r\fP\fINUMMER\fP, \fB\-\-recursion\-level=\fP\fINUMMER\fP
Prüfe rekursiv alle URLs bis zu der angegebenen Tiefe. Eine negative Tiefe
bewirkt unendliche Rekursion. Standard Tiefe ist unendlich.
@ -301,17 +270,13 @@ Eine Cookie\-Datei enthält Standard HTTP\-Header (RFC 2616) mit den folgenden
möglichen Namen:
.
.TP
\fBScheme\fP (optional)
Setzt das Schema für das die Cookies gültig sind; Standardschema ist
\fBhttp\fP.
.TP
\fBHost\fP (erforderlich)
Setzt die Domäne für die die Cookies gültig sind.
.TP
\fBPath\fP (optional)
Gibt den Pfad für den die Cookies gültig sind; Standardpfad ist \fB/\fP.
.TP
\fBSet\-cookie\fP (optional)
\fBSet\-cookie\fP (required)
Setzt den Cookie Name/Wert. Kann mehrmals angegeben werden.
.PP
Mehrere Einträge sind durch eine Leerzeile zu trennen.
@ -325,7 +290,6 @@ Das untige Beispiel sendet zwei Cookies zu allen URLs die mit
Set\-cookie: ID="smee"
Set\-cookie: spam="egg"
Scheme: https
Host: example.org
Set\-cookie: baggage="elitist"; comment="hologram"
@ -362,12 +326,10 @@ beschrieben.
.
.TP
HTTP Verknüpfungen (\fBhttp:\fP, \fBhttps:\fP)
Nach Verbinden zu dem gegebenen HTTP\-Server wird der eingegebene Pfad oder
Query angefordert. Alle Umleitungen werden verfolgt, und falls ein
Benutzer/Passwort angegeben wurde werden diese falls notwendig als
Authorisierung benutzt. Permanent umgezogene Webseiten werden als Warnung
ausgegeben. Alle finalen HTTP Statuscodes, die nicht dem Muster 2xx
entsprechen, werden als Fehler ausgegeben.
After connecting to the given HTTP server the given path or query is
requested. All redirections are followed, and if user/password is given it
will be used as authorization when necessary. All final HTTP status codes
other than 2xx are errors.
.
Der Inhalt von HTML\-Seiten wird rekursiv geprüft.
.TP
@ -418,6 +380,19 @@ Nicht unterstützte Links (\*(lqjavascript:\*(lq, etc.)
Die komplette Liste von erkannten, aber nicht unterstützten Links ist in der
Quelldatei \fBlinkcheck/checker/unknownurl.py\fP. Die bekanntesten davon dürften JavaScript\-Links sein.
.SH PLUGINS
There are two plugin types: connection and content plugins.
.
Connection plugins are run after a successful connection to the URL host.
.
Content plugins are run if the URL type has content (mailto: URLs have no
content for example) and if the check is not forbidden (ie. by HTTP
robots.txt).
.
See \fBlinkchecker \-\-list\-plugins\fP for a list of plugins and their
documentation. All plugins are enabled via the \fBlinkcheckerrc\fP(5)
configuration file.
.SH Rekursion
Bevor eine URL rekursiv geprüft wird, hat diese mehrere Bedingungen zu
erfüllen. Diese werden in folgender Reihenfolge geprüft:

View file

@ -14,52 +14,14 @@ in einem INI\-Format geschrieben.
Die Standarddatei ist \fB~/.linkchecker/linkcheckerrc\fP unter Unix\-,
\fB%HOMEPATH%\e.linkchecker\elinkcheckerrc\fP unter Windows\-Systemen.
.SH EIGENSCHAFTEN
.SS [checking]
.TP
\fBanchors=\fP[\fB0\fP|\fB1\fP]
Prüfe HTTP Ankerverweise. Standard ist, Ankerverweise nicht zu prüfen. Diese
Option aktiviert die Ausgabe der Warnung \fBurl\-anchor\-not\-found\fP.
.br
Kommandozeilenoption: \fB\-\-anchors\fP
.TP
\fBcheckcss=\fP[\fB0\fP|\fB1\fP]
Prüfe Syntax von CSS URLs mit dem W3C Online Validator.
.br
Kommandozeilenoption: \fB\-\-check\-css\fP
.TP
\fBcheckhtml=\fP[\fB0\fP|\fB1\fP]
Prüfe Syntax von HTML URLs mit dem W3C Online Validator.
.br
Kommandozeilenoption: \fB\-\-check\-html\fP
.TP
\fBclamavconf=\fP\fIDateiname\fP
Dateiname von \fBclamd.conf\fP Konfigurationsdatei.
.br
Kommandozeilenoption: keine
.TP
\fBcookiefile=\fP\fIDateiname\fP
Lese eine Datei mit Cookie\-Daten. Das Cookie Datenformat wird in
linkchecker(1) erklärt.
.br
Kommandozeilenoption: \fB\-\-cookiefile\fP
.TP
\fBcookies=\fP[\fB0\fP|\fB1\fP]
Akzeptiere und sende HTTP cookies.
.br
Kommandozeilenoption: \fB\-\-cookies\fP
.TP
\fBdebugmemory=\fP[\fB0\fP|\fB1\fP]
Schreibe einen Speicherabzug in eine temporäre Datei wenn die Prüfung
endet. Der Speicherabzug wird sowohl beim normalen Beenden der Prüfung als
auch wenn die Prüfung abgebrochen wird geschrieben.
.br
Der Speicherabzug funktioniert nur falls das Paket python\-meliae installiert
ist. Andernfalls wird eine Warnung angezeigt mit dem Hinweis dieses Paket zu
installieren.
.br
Kommandozeilenoption: keine
.TP
\fBlocalwebroot=\fP\fISTRING\fP
Beim Prüfen von absoluten URLs in lokalen Dateien wird das angegebene
Wurzelverzeichnis als Basis\-URL benutzt.
@ -78,23 +40,12 @@ korrekte Syntax des Links geprüft.
.br
Kommandozeilenoption: \fB\-\-nntp\-server\fP
.TP
\fBpause=\fP\fINUMBER\fP
Pausiere die angegebene Anzahl von Sekunden zwischen zwei aufeinander
folgenden Verbindungen zum demselben Rechner.
.br
Kommandozeilenoption: \fB\-\-pause\fP
.TP
\fBrecursionlevel=\fP\fINUMBER\fP
Prüfe rekursiv alle URLs bis zu der angegebenen Tiefe. Eine negative Tiefe
bewirkt unendliche Rekursion. Standard Tiefe ist unendlich.
.br
Kommandozeilenoption: \fB\-\-recursion\-level\fP
.TP
\fBscanvirus=\fP[\fB0\fP|\fB1\fP]
Prüfe Inhalt von URLs auf Viren mit ClamAV.
.br
Kommandozeilenoption: \fB\-\-scan\-virus\fP
.TP
\fBthreads=\fP\fINUMBER\fP
Generiere nicht mehr als die angegebene Anzahl von Threads. Standard Anzahl
von Threads ist 100. Um Threads zu deaktivieren, geben Sie eine nicht
@ -108,6 +59,12 @@ Setze den Timeout für TCP\-Verbindungen in Sekunden. Der Standard Timeout ist
.br
Kommandozeilenoption: \fB\-\-timeout\fP
.TP
\fBaborttimeout=\fP\fINUMBER\fP
Time to wait for checks to finish after the user aborts the first time (with
Ctrl\-C or the abort button). The default abort timeout is 300 seconds.
.br
Kommandozeilenoption: keine
.TP
\fBuseragent=\fP\fISTRING\fP
Gibt den User\-Agent an, der zu HTTP\-Servern geschickt wird,
z.B. "Mozilla/4.0". Der Standard ist "LinkChecker/X.Y", wobei X.Y die
@ -115,23 +72,6 @@ aktuelle Version von LinkChecker ist.
.br
Kommandozeilenoption: \fB\-\-user\-agent\fP
.TP
\fBwarningregex=\fP=\fIREGEX\fP
Definieren Sie einen regulären Ausdruck der eine Warnung ausgibt falls er
auf den Inhalt einer geprüften URL zutrifft. Dies gilt nur für gültige
Seiten deren Inhalt wir bekommen können.
.br
Benutzen Sie dies, um nach Seiten zu suchen, welche bestimmte Fehler
enthalten, zum Beispiel "Diese Seite wurde entfernt" oder "Oracle
Applikationsfehler".
.br
Kommandozeilenoption: \fB\-\-warning\-regex\fP
.TP
\fBwarnsizebytes=\fP\fINUMBER\fP
Gebe eine Warnung aus, wenn die Inhaltsgröße bekannt ist und die angegebene
Anzahl von Bytes übersteigt.
.br
Kommandozeilenoption: \fB\-\-warning\-size\-bytes\fP
.TP
\fBsslverify=\fP[\fB0\fP|\fB1\fP|\fIdateiname\fP]
Falls der Wert Null ist werden SSL Zertifikate nicht überprüft. Falls er auf
Eins gesetzt wird (der Standard) werden SSL Zertifikate mit der gelieferten
@ -140,15 +80,6 @@ zur Prüfung verwendet.
.br
Kommandozeilenoption: keine
.TP
\fBwarnsslcertdaysvalid=\fP\fINUMBER\fP
Prüfe ob SSL\-Zertifikate mindestens die angegebene Anzahl an Tagen gültig
sind. Die Anzahl darf nicht negativ sein. Falls die Anzahl Null ist wird
eine Warnung nur für Zertifikate ausgegeben, die schon abgelaufen sind.
.br
Die Standardanzahl an Tagen ist 14.
.br
Kommandozeilenoption: keine
.TP
\fBmaxrunseconds=\fP\fINUMBER\fP
Hört nach der angegebenen Anzahl von Sekunden auf, neue URLs zu prüfen. Dies
ist dasselbe als wenn der Benutzer nach der gegebenen Anzahl von Sekunden
@ -167,26 +98,11 @@ Standard ist alle URLs anzunehmen und zu prüfen.
.br
Kommandozeilenoption: keine
.TP
\fBmaxconnectionshttp=\fP\fINUMBER\fP
Maximale Anzahl an HTTP\-Verbindungen.
.br
Der Standard ist 10.
.br
Kommandozeilenoption: keine
\fBmaxrequestspersecond=\fP\fINUMBER\fP
Limit the maximum number of requests per second to one host.
.TP
\fBmaxconnectionshttps=\fP\fINUMBER\fP
Maximale Anzahl an HTTPS\-Verbindungen.
.br
Der Standard ist 10.
.br
Kommandozeilenoption: keine
.TP
\fBmaxconnectionsftp=\fP\fINUMBER\fP
Maximale Anzahl an FTP\-Verbindungen.
.br
Der Standard ist 2.
.br
Kommandozeilenoption: keine
\fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP...]
Allowed URL schemes as comma\-separated list.
.SS [filtering]
.TP
\fBignore=\fP\fIREGEX\fP (MULTILINE)
@ -212,6 +128,11 @@ Prüfe URLs die auf den regulären Ausdruck zutreffen, aber führe keine
Rekursion durch.
.br
Kommandozeilenoption: \fB\-\-no\-follow\-url\fP
.TP
\fBcheckextern=\fP[\fB0\fP|\fB1\fP]
Check external links. Default is to check internal links only.
.br
Command line option: \fB\-\-check\-extern\fP
.SS [authentication]
.TP
\fBentry=\fP\fIREGEX\fP \fIBENUTZER\fP [\fIPASSWORT\fP] (MULTILINE)
@ -232,9 +153,8 @@ wird Authentifizierung für http[s] und ftp Verknüpfungen benutzt.
Kommandozeilenoption: \fB\-u\fP, \fB\-p\fP
.TP
\fBloginurl=\fP\fIURL\fP
Eine Anmelde\-URL, die vor der Prüfung besucht wird. Benötigt einen Eintrag
zur Authentifizierung und impliziert die Benutzung von Cookies, weil die
meisten Anmeldungen heutzutage Cookies benutzen.
A login URL to be visited before checking. Also needs authentication data
set for it.
.TP
\fBloginuserfield=\fP\fINAME\fP
Der Name für das Benutzer CGI\-Feld. Der Standardname ist \fBlogin\fP.
@ -247,12 +167,6 @@ Optional zusätzliche CGI Namen/Werte\-Paare. Die Default\-Werte werden
automatisch übermittelt.
.SS [output]
.TP
\fBcomplete=\fP[\fB0\fP|\fB1\fP]
Falls gesetzt, gebe alle geprüften URLs aus, sogar Duplikate. Standard ist
es, URLs nur einmal auszugeben.
.br
Kommandozeilenoption: \fB\-\-complete\fP
.TP
\fBdebug=\fP\fISTRING\fP[\fB,\fP\fISTRING\fP...]
Gebe Testmeldungen aus für den angegebenen Logger. Verfügbare Logger sind
\fBcmdline\fP, \fBchecking\fP,\fBcache\fP, \fBgui\fP, \fBdns\fP, \fBthread\fP und \fBall\fP. Die
@ -528,6 +442,52 @@ ignoriert, müssen aber eingerückt sein.
[filtering]
ignorewarnings=http\-moved\-permanent
.SH PLUGINS
All plugins have a separate section. If the section appears in the
configuration file the plugin is enabled. Some plugins read extra options
in their section.
.SS [AnchorCheck]
Checks validity of HTML anchors.
.SS [LocationInfo]
Adds the country and if possible city name of the URL host as info. Needs
GeoIP or pygeoip and a local country or city lookup DB installed.
.SS [RegexCheck]
Define a regular expression which prints a warning if it matches any content
of the checked link. This applies only to valid pages, so we can get their
content.
Use this to check for pages that contain some form of error message, for
example 'This page has moved' or 'Oracle Application error'.
Man beachte, dass mehrere Werte in dem regulären Ausdruck kombiniert
werden können, zum Beispiel "(Diese Seite ist umgezogen|Oracle
Applikationsfehler)".
.SS [SslCertificateCheck]
Check SSL certificate expiration date. Only internal https: links will be
checked. A domain will only be checked once to avoid duplicate warnings.
.TP
\fBsslcertwarndays=\fP\fINUMBER\fP
Configures the expiration warning time in days.
.SS [HtmlSyntaxCheck]
Check the syntax of HTML pages with the online W3C HTML validator. See
http://validator.w3.org/docs/api.html.
.SS [CssSyntaxCheck]
Check the syntax of CSS pages with the online W3C CSS validator. See
http://jigsaw.w3.org/css\-validator/manual.html#expert.
.SS [VirusCheck]
Checks the page content for virus infections with clamav. A local clamav
daemon must be installed.
.TP
\fBclamavconf=\fP\fIDateiname\fP
Dateiname von \fBclamd.conf\fP Konfigurationsdatei.
.
.SH WARNUNGEN
Die folgenden Warnungen werden vom Konfigurationseintrag 'ignorewarnings'
@ -543,57 +503,21 @@ Der file: Pfad ist nicht derselbe wie der Systempfad.
\fBftp\-missing\-slash\fP
Der ftp: URL fehlt ein abschließender Schrägstrich.
.TP
\fBhttp\-auth\-unknonwn\fP
Nicht unterstützte HTTP Authentifizierungsmethode.
.TP
\fBhttp\-cookie\-store\-error\fP
Ein Fehler trat auf während des Speicherns eines Cookies.
.TP
\fBhttp\-decompress\-error\fP
Ein Fehler trat beim Dekomprimieren des URL Inhalts auf.
.TP
\fBhttp\-empty\-content\fP
Die URL besitzt keinen Inhalt.
.TP
\fBhttp\-moved\-permanent\fP
Die URL wurde dauerhaft verschoben.
.TP
\fBhttp\-robots\-denied\fP
Die http: URL\-Überprüfung wurde verweigert.
.TP
\fBhttp\-unsupported\-encoding\fP
Der URL\-Inhalt ist in einer unbekannten Kodierung verfasst.
.TP
\fBhttp\-wrong\-redirect\fP
Die URL wurde zu einem anderen URL\-Typ umgeleitet.
.TP
\fBhttps\-certificate\-error\fP
Das SSL\-Zertifikat ist ungültig oder abgelaufen.
.TP
\fBignore\-url\fP
Die URL wurde ignoriert.
.TP
\fBmail\-no\-connection\fP
Es konnte keine Verbindung zu einem MX\-Rechner hergestellt werden.
.TP
\fBmail\-no\-mx\-host\fP
Der MX Mail\-Rechner konnte nicht gefunden werden.
.TP
\fBmail\-unverified\-address\fP
Die mailto: Addresse konnte nicht überprüft werden.
.TP
\fBnntp\-no\-newsgroup\fP
Die NNTP Nachrichtengruppe konnte nicht gefunden werden.
.TP
\fBnntp\-no\-server\fP
Es wurde kein NNTP Server gefunden.
.TP
\fBurl\-anchor\-not\-found\fP
URL Anker wurde nicht gefunden.
.TP
\fBurl\-content\-size\-unequal\fP
Der URL Inhaltsgrößenangabe und die Download\-Größe sind unterschiedlich.
.TP
\fBurl\-content\-size\-zero\fP
Der URL Inhaltsgrößenangabe ist Null.
.TP
@ -609,9 +533,6 @@ Konnte den Inhalt der URL nicht bekommen.
\fBurl\-obfuscated\-ip\fP
Die IP\-Adresse ist verschleiert.
.TP
\fBurl\-warnregex\-found\fP
Der reguläre Ausdruck für Warnungen wurde in den URL Inhalten gefunden.
.TP
\fBurl\-whitespace\fP
Die URL %(url)s enthält Leerzeichen am Anfang oder Ende.

View file

@ -33,15 +33,14 @@ Antivirus check
.IP \(bu
a command line, GUI and web interface
.SH EXAMPLES
The most common use checks the given domain recursively, plus any
URL pointing outside of the domain:
\fBlinkchecker http://www.example.net/\fP
The most common use checks the given domain recursively:
\fBlinkchecker http://www.example.com/\fP
.br
Beware that this checks the whole site which can have thousands of URLs.
Use the \fB\-r\fP option to restrict the recursion depth.
.br
Don't check \fBmailto:\fP URLs. All other links are checked as usual:
\fBlinkchecker \-\-ignore\-url=^mailto: mysite.example.org\fP
Don't check URLs with \fB/secret\fP in its name. All other links are checked as usual:
\fBlinkchecker \-\-ignore\-url=/secret mysite.example.com\fP
.br
Checking a local HTML file on Unix:
\fBlinkchecker ../bla.html\fP
@ -53,7 +52,7 @@ You can skip the \fBhttp://\fP url part if the domain starts with \fBwww.\fP:
\fBlinkchecker www.example.com\fP
.br
You can skip the \fBftp://\fP url part if the domain starts with \fBftp.\fP:
\fBlinkchecker \-r0 ftp.example.org\fP
\fBlinkchecker \-r0 ftp.example.com\fP
.br
Generate a sitemap graph and convert it with the graphviz dot utility:
\fBlinkchecker \-odot \-v www.example.com | dot \-Tps > sitemap.ps\fP
@ -77,18 +76,12 @@ of threads is 100. To disable threading specify a non-positive number.
.TP
\fB\-V\fP, \fB\-\-version\fP
Print version and exit.
.TP
\fB\-\-list\-plugins\fP
Print available check plugins and exit.
.
.SS Output options
.TP
\fB\-\-check\-css\fP
Check syntax of CSS URLs with the W3C online validator.
.TP
\fB\-\-check\-html\fP
Check syntax of HTML URLs with the W3C online validator.
.TP
\fB\-\-complete\fP
Log all URLs, including duplicates. Default is to log duplicate URLs only once.
.TP
\fB\-D\fP\fISTRING\fP, \fB\-\-debug=\fP\fISTRING\fP
Print debugging output for the given logger.
Available loggers are \fBcmdline\fP, \fBchecking\fP,
@ -139,12 +132,6 @@ that of your locale. Valid encodings are listed at
Quiet operation, an alias for \fB\-o none\fP.
This is only useful with \fB\-F\fP.
.TP
\fB\-\-scan\-virus\fP
Scan content of URLs for viruses with ClamAV.
.TP
\fB\-\-trace\fP
Print tracing information.
.TP
\fB\-v\fP, \fB\-\-verbose\fP
Log all checked URLs. Default is to log only errors and warnings.
.TP
@ -160,27 +147,15 @@ Note that multiple values can be combined in the regular expression,
for example "(This page has moved|Oracle Application error)".
.br
See section \fBREGULAR EXPRESSIONS\fP for more info.
.TP
\fB\-\-warning\-size\-bytes=\fP\fINUMBER\fP
Print a warning if content size info is available and exceeds the given
number of \fIbytes\fP.
.
.SS Checking options
.TP
\fB\-a\fP, \fB\-\-anchors\fP
Check HTTP anchor references. Default is not to check anchors.
This option enables logging of the warning \fBurl\-anchor\-not\-found\fP.
.TP
\fB\-C\fP, \fB\-\-cookies\fP
Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information.
.TP
\fB\-\-cookiefile=\fP\fIFILENAME\fP
Read a file with initial cookie data. The cookie data
format is explained below.
.TP
\fB\-\-check\-extern\fP
Check external URLs.
.TP
\fB\-\-ignore\-url=\fP\fIREGEX\fP
URLs matching the given regular expression will be ignored and not checked.
.br
@ -206,10 +181,6 @@ Read a password from console and use it for HTTP and FTP authorization.
For FTP the default password is \fBanonymous@\fP. For HTTP there is
no default password. See also \fB\-u\fP.
.TP
\fB\-P\fP\fINUMBER\fP, \fB\-\-pause=\fP\fINUMBER\fP
Pause the given number of seconds between two subsequent connection
requests to the same host. Default is no pause between requests.
.TP
\fB\-r\fP\fINUMBER\fP, \fB\-\-recursion\-level=\fP\fINUMBER\fP
Check recursively all links up to given depth.
A negative depth will enable infinite recursion.
@ -291,16 +262,13 @@ A cookie file contains standard HTTP header (RFC 2616) data with the
following possible names:
.
.TP
\fBScheme\fP (optional)
Sets the scheme the cookies are valid for; default scheme is \fBhttp\fP.
.TP
\fBHost\fP (required)
Sets the domain the cookies are valid for.
.TP
\fBPath\fP (optional)
Gives the path the cookies are valid for; default path is \fB/\fP.
.TP
\fBSet-cookie\fP (optional)
\fBSet-cookie\fP (required)
Set cookie name/value. Can be given more than once.
.PP
Multiple entries are separated by a blank line.
@ -314,7 +282,6 @@ with \fBhttps://example.org/\fP:
Set-cookie: ID="smee"
Set-cookie: spam="egg"
Scheme: https
Host: example.org
Set-cookie: baggage="elitist"; comment="hologram"
@ -353,7 +320,6 @@ After connecting to the given HTTP server the given path
or query is requested. All redirections are followed, and
if user/password is given it will be used as authorization
when necessary.
Permanently moved pages issue a warning.
All final HTTP status codes other than 2xx are errors.
.
HTML page contents are checked for recursion.
@ -412,6 +378,20 @@ Unsupported links (``javascript:``, etc.)
in the \fBlinkcheck/checker/unknownurl.py\fP source file.
The most prominent of them should be JavaScript links.
.SH PLUGINS
There are two plugin types: connection and content plugins.
.
Connection plugins are run after a successful connection to the
URL host.
.
Content plugins are run if the URL type has content
(mailto: URLs have no content for example) and if the check is not
forbidden (ie. by HTTP robots.txt).
.
See \fBlinkchecker \-\-list\-plugins\fP for a list of plugins and
their documentation. All plugins are enabled via the \fBlinkcheckerrc\fP(5)
configuration file.
.SH RECURSION
Before descending recursively into a URL, it has to fulfill several
conditions. They are checked in this order:

View file

@ -9,51 +9,14 @@ The file is written in an INI-style format.
The default file location is \fB~/.linkchecker/linkcheckerrc\fP on Unix,
\fB%HOMEPATH%\\.linkchecker\\linkcheckerrc\fP on Windows systems.
.SH SETTINGS
.SS \fB[checking]\fP
.TP
\fBanchors=\fP[\fB0\fP|\fB1\fP]
Check HTTP anchor references. Default is not to check anchors.
This option enables logging of the warning \fBurl\-anchor\-not\-found\fP.
.br
Command line option: \fB\-\-anchors\fP
.TP
\fBcheckcss=\fP[\fB0\fP|\fB1\fP]
Check syntax of CSS URLs with the W3C online validator.
.br
Command line option: \fB\-\-check\-css\fP
.TP
\fBcheckhtml=\fP[\fB0\fP|\fB1\fP]
Check syntax of HTML URLs with the W3C online validator.
.br
Command line option: \fB\-\-check\-html\fP
.TP
\fBclamavconf=\fP\fIfilename\fP
Filename of \fBclamd.conf\fP config file.
.br
Command line option: none
.TP
\fBcookiefile=\fP\fIfilename\fP
Read a file with initial cookie data. The cookie data
format is explained in linkchecker(1).
.br
Command line option: \fB\-\-cookiefile\fP
.TP
\fBcookies=\fP[\fB0\fP|\fB1\fP]
Accept and send HTTP cookies.
.br
Command line option: \fB\-\-cookies\fP
.TP
\fBdebugmemory=\fP[\fB0\fP|\fB1\fP]
When checking finishes, write a memory dump to a temporary file.
The memory dump is written both when checking finishes normally
and when checking gets canceled.
.br
The memory dump only works if the python-meliae package is installed.
Otherwise a warning is printed to install it.
.br
Command line option: none
.TP
\fBlocalwebroot=\fP\fISTRING\fP
When checking absolute URLs inside local files, the given root directory
is used as base URL.
@ -71,12 +34,6 @@ only the syntax of the link is checked.
.br
Command line option: \fB\-\-nntp\-server\fP
.TP
\fBpause=\fP\fINUMBER\fP
Pause the given number of seconds between two subsequent connection
requests to the same host.
.br
Command line option: \fB\-\-pause\fP
.TP
\fBrecursionlevel=\fP\fINUMBER\fP
Check recursively all links up to given depth.
A negative depth will enable infinite recursion.
@ -84,11 +41,6 @@ Default depth is infinite.
.br
Command line option: \fB\-\-recursion\-level\fP
.TP
\fBscanvirus=\fP[\fB0\fP|\fB1\fP]
Scan content of URLs for viruses with ClamAV.
.br
Command line option: \fB\-\-scan\-virus\fP
.TP
\fBthreads=\fP\fINUMBER\fP
Generate no more than the given number of threads. Default number
of threads is 100. To disable threading specify a non-positive number.
@ -101,6 +53,13 @@ is 60 seconds.
.br
Command line option: \fB\-\-timeout\fP
.TP
\fBaborttimeout=\fP\fINUMBER\fP
Time to wait for checks to finish after the user aborts the first time
(with Ctrl-C or the abort button).
The default abort timeout is 300 seconds.
.br
Command line option: \fB\-\-timeout\fP
.TP
\fBuseragent=\fP\fISTRING\fP
Specify the User-Agent string to send to the HTTP server, for example
"Mozilla/4.0". The default is "LinkChecker/X.Y" where X.Y is the current
@ -108,22 +67,6 @@ version of LinkChecker.
.br
Command line option: \fB\-\-user\-agent\fP
.TP
\fBwarningregex=\fP=\fIREGEX\fP
Define a regular expression which prints a warning if it matches any
content of the checked link.
This applies only to valid pages, so we can get their content.
.br
Use this to check for pages that contain some form of error, for example
"This page has moved" or "Oracle Application Server error".
.br
Command line option: \fB\-\-warning\-regex\fP
.TP
\fBwarnsizebytes=\fP\fINUMBER\fP
Print a warning if content size info is available and exceeds the given
number of \fIbytes\fP.
.br
Command line option: \fB\-\-warning\-size\-bytes\fP
.TP
\fBsslverify=\fP[\fB0\fP|\fB1\fP|\fIfilename\fP]
If set to zero disables SSL certificate checking.
If set to one (the default) enables SSL certificate checking with
@ -132,16 +75,6 @@ will be used as the certificate file.
.br
Command line option: none
.TP
\fBwarnsslcertdaysvalid=\fP\fINUMBER\fP
Check that SSL certificates are at least the given number of days valid.
The number must not be negative.
If the number of days is zero a warning is printed only for certificates
that are already expired.
.br
The default number of days is 14.
.br
Command line option: none
.TP
\fBmaxrunseconds=\fP\fINUMBER\fP
Stop checking new URLs after the given number of seconds. Same as if the
user stops (by hitting Ctrl-C or clicking the abort button in the GUI)
@ -159,26 +92,11 @@ The default is to queue and check all URLs.
.br
Command line option: none
.TP
\fBmaxconnectionshttp=\fP\fINUMBER\fP
Maximum number of connections to HTTP servers.
.br
The default is 10.
.br
Command line option: none
\fBmaxrequestspersecond=\fP\fINUMBER\fP
Limit the maximum number of requests per second to one host.
.TP
\fBmaxconnectionshttps=\fP\fINUMBER\fP
Maximum number of connections to HTTPS servers.
.br
The default is 10.
.br
Command line option: none
.TP
\fBmaxconnectionsftp=\fP\fINUMBER\fP
Maximum number of connections to FTP servers.
.br
The default is 2.
.br
Command line option: none
\fBallowedschemes=\fP\fINAME\fP[\fB,\fP\fINAME\fP...]
Allowed URL schemes as comma-separated list.
.SS \fB[filtering]\fP
.TP
\fBignore=\fP\fIREGEX\fP (MULTILINE)
@ -203,6 +121,11 @@ Check but do not recurse into URLs matching the given regular
expressions.
.br
Command line option: \fB\-\-no\-follow\-url\fP
.TP
\fBcheckextern=\fP[\fB0\fP|\fB1\fP]
Check external links. Default is to check internal links only.
.br
Command line option: \fB\-\-checkextern\fP
.SS \fB[authentication]\fP
.TP
\fBentry=\fP\fIREGEX\fP \fIUSER\fP [\fIPASS\fP] (MULTILINE)
@ -224,8 +147,7 @@ Command line option: \fB\-u\fP, \fB\-p\fP
.TP
\fBloginurl=\fP\fIURL\fP
A login URL to be visited before checking. Also needs authentication
data set for it, and implies using cookies because most logins use
cookies nowadays.
data set for it.
.TP
\fBloginuserfield=\fP\fISTRING\fP
The name of the user CGI field. Default name is \fBlogin\fP.
@ -238,12 +160,6 @@ Optionally any additional CGI name/value pairs. Note that the default
values are submitted automatically.
.SS \fB[output]\fP
.TP
\fBcomplete=\fP[\fB0\fP|\fB1\fP]
If set log all checked URLs, even duplicates. Default is to log
duplicate URLs only once.
.br
Command line option: \fB\-\-complete\fP
.TP
\fBdebug=\fP\fISTRING\fP[\fB,\fP\fISTRING\fP...]
Print debugging output for the given loggers.
Available loggers are \fBcmdline\fP, \fBchecking\fP,
@ -524,6 +440,53 @@ though they must still be indented.
[filtering]
ignorewarnings=http-moved-permanent
.SH PLUGINS
All plugins have a separate section. If the section
appears in the configuration file the plugin is enabled.
Some plugins read extra options in their section.
.SS \fB[AnchorCheck]\fP
Checks validity of HTML anchors.
.SS \fB[LocationInfo]\fP
Adds the country and if possible city name of the URL host as info.
Needs GeoIP or pygeoip and a local country or city lookup DB installed.
.SS \fB[RegexCheck]\fP
Define a regular expression which prints a warning if it matches
any content of the checked link. This applies only to valid pages,
so we can get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application error'.
Note that multiple values can be combined in the regular expression,
for example "(This page has moved|Oracle Application error)".
.SS \fB[SslCertificateCheck]\fP
Check SSL certificate expiration date. Only internal https: links
will be checked. A domain will only be checked once to avoid duplicate
warnings.
.TP
\fBsslcertwarndays=\fP\fINUMBER\fP
Configures the expiration warning time in days.
.SS \fB[HtmlSyntaxCheck]\fP
Check the syntax of HTML pages with the online W3C HTML validator.
See http://validator.w3.org/docs/api.html.
.SS \fB[CssSyntaxCheck]\fP
Check the syntax of CSS stylesheets with the online W3C CSS validator.
See http://jigsaw.w3.org/css-validator/manual.html#expert.
.SS \fB[VirusCheck]\fP
Checks the page content for virus infections with clamav.
A local clamav daemon must be installed.
.TP
\fBclamavconf=\fP\fIfilename\fP
Filename of \fBclamd.conf\fP config file.
.
.SH WARNINGS
The following warnings are recognized in the 'ignorewarnings' config
@ -539,57 +502,21 @@ The file: path is not the same as the system specific path.
\fBftp-missing-slash\fP
The ftp: URL is missing a trailing slash.
.TP
\fBhttp-auth-unknonwn\fP
Unsupported HTTP authentication method.
.TP
\fBhttp-cookie-store-error\fP
An error occurred while storing a cookie.
.TP
\fBhttp-decompress-error\fP
An error occurred while decompressing the URL content.
.TP
\fBhttp-empty-content\fP
The URL had no content.
.TP
\fBhttp-moved-permanent\fP
The URL has moved permanently.
.TP
\fBhttp-robots-denied\fP
The http: URL checking has been denied.
.TP
\fBhttp-unsupported-encoding\fP
The URL content is encoded with an unknown encoding.
.TP
\fBhttp-wrong-redirect\fP
The URL has been redirected to an URL of a different type.
.TP
\fBhttps-certificate-error\fP
The SSL certificate is invalid or expired.
.TP
\fBignore-url\fP
The URL has been ignored.
.TP
\fBmail-no-connection\fP
No connection to a MX host could be established.
.TP
\fBmail-no-mx-host\fP
The mail MX host could not be found.
.TP
\fBmail-unverified-address\fP
The mailto: address could not be verified.
.TP
\fBnntp-no-newsgroup\fP
The NNTP newsgroup could not be found.
.TP
\fBnntp-no-server\fP
No NNTP server was found.
.TP
\fBurl-anchor-not-found\fP
URL anchor was not found.
.TP
\fBurl-content-size-unequal\fP
The URL content size and download size are unequal.
.TP
\fBurl-content-size-zero\fP
The URL content size is zero.
.TP
@ -605,9 +532,6 @@ Could not get the content of the URL.
\fBurl-obfuscated-ip\fP
The IP is obfuscated.
.TP
\fBurl-warnregex-found\fP
The warning regular expression was found in the URL contents.
.TP
\fBurl-whitespace\fP
The URL contains leading or trailing whitespace.

View file

@ -50,7 +50,9 @@ First, install the required software.
On Debian or Ubuntu systems, install the package qt4-dev-tools.
On Redhat systems, install the package qt-devel.
4. *Optional, for bash-completion:*
4. Python requests module from https://pypi.python.org/pypi/requests
5. *Optional, for bash-completion:*
argcomplete Python module from https://pypi.python.org/pypi/argcomplete
6. *Optional, for displaying country codes:*

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,43 @@
Upgrading
=========
Migrating from 8.x to 9.0
-------------------------
The Python requests module is now required.
Several checks have been moved to plugins (see below).
Plugins have to be enabled in the configuration file.
The following commandline and configuration options have been deprecated
and do not have any effect:
--anchors, anchors: moved to plugin AnchorCheck
--check-css, checkcss: moved to plugin CssSyntaxCheck
--check-html, checkhtml: moved to plugin HtmlSyntaxCheck
--complete: feature removed
--cookies, sendcookies, storecookies: cookies are sent/stored per default
--pause, wait: replaced with numrequestspersecond
--scan-virus, scanvirus: moved to plugin VirusCheck
--warning-regex: moved to plugin RegexCheck
--warning-size-bytes, warnsizebytes: feature removed
warnsslcertdaysvalid: moved to plugin SslCertificateCheck
The "html" logger generates HTML5 documents now.
The following warnings have been removed:
- http-auth-unauthorized: removed
- http-auth-unknonwn: removed
- http-decompress-error: removed
- http-robots-denied: downgraded to info
- http-moved-permanent: downgraded to info
- http-unsupported-encoding: removed
- https-certificate-error: is an error now
- mail-unverified-address: removed
- mail-no-connection: removed
- syntax-css: moved to plugin
- syntax-html: moved to plugin
- url-anchor-not-found: moved to plugin
- url-content-size-unequal: removed
- url-warnregex-found: moved to plugin
Migrating from 8.4 to 8.5
--------------------------
Custom output loggers have been changed.

View file

@ -21,8 +21,9 @@ Features
- honors robots.txt exclusion protocol
- Cookie support
- HTML5 support
- HTML and CSS syntax check
- Antivirus check
- [Plugin support](plugins.html)
allowing custom page checks. Currently available are
HTML and CSS syntax checks, Antivirus checks, and more.
- Different interfaces: command line, GUI and web interface
- ... and a lot more check options documented in the
[manual page](man1/linkchecker.1.html).

View file

@ -0,0 +1,11 @@
title: Plugin support
---
Plugin documentation
=====================
Standard plugins
=================
Custom plugins
===============

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -2612,7 +2612,7 @@ static yyconst flex_int32_t yy_rule_linenum[131] =
#define YY_MORE_ADJ 0
#define YY_RESTORE_YY_MORE_OFFSET
#line 1 "htmllex.l"
/* Copyright (C) 2000-2012 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -2951,6 +2951,10 @@ int yyget_lineno (yyscan_t yyscanner );
void yyset_lineno (int line_number ,yyscan_t yyscanner );
int yyget_column (yyscan_t yyscanner );
void yyset_column (int column_no ,yyscan_t yyscanner );
/* %if-bison-bridge */
YYSTYPE * yyget_lval (yyscan_t yyscanner );
@ -3132,7 +3136,7 @@ YY_DECL
/*********************** EOF ************************/
#line 3135 "htmllex.c"
#line 3139 "htmllex.c"
yylval = yylval_param;
@ -4683,7 +4687,7 @@ YY_RULE_SETUP
#line 1091 "htmllex.l"
ECHO;
YY_BREAK
#line 4686 "htmllex.c"
#line 4690 "htmllex.c"
case YY_END_OF_BUFFER:
{

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2000-2012 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2009 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -68,7 +68,7 @@
/* Line 268 of yacc.c */
#line 1 "htmlparse.y"
/* Copyright (C) 2000-2011 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
%{
/* Copyright (C) 2000-2011 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -1,4 +1,4 @@
/* Copyright (C) 2000-2010 Bastian Kleineidam
/* Copyright (C) 2000-2014 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View file

@ -68,12 +68,14 @@ LOG_CHECK = "linkcheck.check"
LOG_CACHE = "linkcheck.cache"
LOG_GUI = "linkcheck.gui"
LOG_THREAD = "linkcheck.thread"
LOG_PLUGIN = "linkcheck.plugin"
lognames = {
"cmdline": LOG_CMDLINE,
"checking": LOG_CHECK,
"cache": LOG_CACHE,
"gui": LOG_GUI,
"thread": LOG_THREAD,
"plugin": LOG_PLUGIN,
"all": LOG_ROOT,
}
lognamelist = ", ".join(repr(name) for name in lognames)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011-2012 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011-2012 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011-2012 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011-2012 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2009 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,223 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Store and retrieve open connections.
"""
import time
from .. import log, LOG_CACHE
from ..decorators import synchronized
from ..lock import get_lock, get_semaphore
from ..containers import enum
_lock = get_lock("connection")
_wait_lock = get_lock("connwait")
ConnectionTypes = ("ftp", "http", "https")
ConnectionState = enum("available", "busy")
def get_connection_id (connection):
    """Compute a key that uniquely identifies the given connection object.

    The object identity is unique for the lifetime of the connection,
    which is sufficient since the pool only tracks open connections.
    """
    return id(connection)
def is_expired (curtime, conn_data):
    """Tell whether a pooled connection is expired or about to expire.

    conn_data is the [connection, state, expiration time] triple stored
    in the pool; a connection counts as expired when its expiration time
    lies within the next five seconds of curtime.
    """
    expiration = conn_data[2]
    return expiration <= curtime + 5.0
class ConnectionPool (object):
    """Thread-safe cache, storing a set of connections for URL retrieval."""

    def __init__ (self, limits, wait=0):
        """
        Initialize an empty connection dictionary which will have the form:
        {(type, host, port) -> (lock, {id -> [connection, state, expiration time]})}

        Connection can be any open connection object (HTTP, FTP, ...).
        State is of type ConnectionState (either 'available' or 'busy').
        Expiration time is the point of time in seconds when this
        connection will be timed out.
        The type is the connection type, one of 'ftp', 'http' or 'https'.
        The host is the hostname as string, port the port number as an integer.
        For each type, the maximum number of connections to one single host
        is defined in limits.

        @param limits: {connection type -> max connections to one host}
        @ptype limits: dict
        @param wait: default number of seconds to pause between two
            subsequent requests to the same host; must not be negative
        @ptype wait: int or float
        """
        # open connections
        self.connections = {}
        # {host -> due time}
        self.times = {}
        # {host -> wait}
        self.host_waits = {}
        if wait < 0:
            raise ValueError("negative wait value %d" % wait)
        self.wait = wait
        # {connection type -> max number of connections to one host}
        self.limits = limits

    @synchronized(_wait_lock)
    def host_wait (self, host, wait):
        """Set a host specific time to wait between requests."""
        if wait < 0:
            raise ValueError("negative wait value %d" % wait)
        self.host_waits[host] = wait

    @synchronized(_wait_lock)
    def wait_for_host (self, host):
        """Honor wait time for given host, sleeping until the host's
        due time has passed."""
        t = time.time()
        if host in self.times:
            due_time = self.times[host]
            if due_time > t:
                wait = due_time - t
                log.debug(LOG_CACHE,
                  "waiting for %.01f seconds on connection to %s", wait, host)
                time.sleep(wait)
                t = time.time()
        # schedule the earliest time for the next request to this host
        self.times[host] = t + self.host_waits.get(host, self.wait)

    def _add (self, type, host, port, create_connection):
        """Add connection to the pool with given parameters.
        @param type: the connection scheme (eg. http)
        @ptype type: string
        @param host: the hostname
        @ptype host: string
        @param port: the port number
        @ptype port: int
        @param create_connection: function to create a new connection object
        @ptype create_connection: callable
        @return: newly created connection
        @rtype: HTTP(S)Connection or FTPConnection
        """
        self.wait_for_host(host)
        connection = create_connection(type, host, port)
        cid = get_connection_id(connection)
        # a new connection has no expiration time until it is released
        expiration = None
        # use ConnectionState.busy instead of the previous 'busy' string
        # literal, for consistency with get() and release()
        conn_data = [connection, ConnectionState.busy, expiration]
        key = (type, host, port)
        if key in self.connections:
            # the caller already holds a slot of the semaphore for this key
            lock, entries = self.connections[key]
            entries[cid] = conn_data
        else:
            lock = get_semaphore("%s:%d" % (host, port), self.limits[type])
            lock.acquire()
            log.debug(LOG_CACHE, "Acquired lock for %s://%s:%d" % key)
            entries = {cid: conn_data}
            self.connections[key] = (lock, entries)
        return connection

    @synchronized(_lock)
    def get (self, type, host, port, create_connection):
        """Get open connection if available or create a new one.
        @param type: connection type
        @ptype type: ConnectionType
        @param host: hostname
        @ptype host: string
        @param port: port number
        @ptype port: int
        @param create_connection: function to create a new connection object
        @ptype create_connection: callable
        @return: an open connection object, or the pool semaphore to wait
            on if the per-host connection limit has been reached
        @rtype: FTPConnection or HTTP(S)Connection or semaphore
        """
        assert type in ConnectionTypes, 'invalid type %r' % type
        # 65536 == 2**16
        assert 0 < port < 65536, 'invalid port number %r' % port
        key = (type, host, port)
        if key not in self.connections:
            return self._add(type, host, port, create_connection)
        lock, entries = self.connections[key]
        if not lock.acquire(False):
            # connection limit reached; the caller is expected to wait
            # on the returned semaphore
            log.debug(LOG_CACHE, "wait for %s connection to %s:%d",
                      type, host, port)
            return lock
        log.debug(LOG_CACHE, "Acquired lock for %s://%s:%d" % key)
        # either a connection is available or a new one can be created
        t = time.time()
        delete_entries = []
        try:
            for id, conn_data in entries.items():
                if conn_data[1] == ConnectionState.available:
                    if is_expired(t, conn_data):
                        # remember expired entries; they are removed in
                        # the finally clause after iteration
                        delete_entries.append(id)
                    else:
                        conn_data[1] = ConnectionState.busy
                        log.debug(LOG_CACHE,
                          "reusing connection %s timing out in %.01f seconds",
                          key, (conn_data[2] - t))
                        return conn_data[0]
        finally:
            for id in delete_entries:
                del entries[id]
        # make a new connection
        return self._add(type, host, port, create_connection)

    @synchronized(_lock)
    def release (self, type, host, port, connection, expiration=None):
        """Release a used connection back to the pool.
        If expiration is None the connection is not reusable and is
        removed; otherwise it is marked available until the given
        expiration time."""
        key = (type, host, port)
        if key in self.connections:
            lock, entries = self.connections[key]
            id = get_connection_id(connection)
            if id in entries:
                log.debug(LOG_CACHE, "Release lock for %s://%s:%d and expiration %s", type, host, port, expiration)
                # if the connection is reusable, set it to available, else delete it
                if expiration is None:
                    del entries[id]
                else:
                    entries[id][1] = ConnectionState.available
                    entries[id][2] = expiration
                lock.release()
            else:
                log.warn(LOG_CACHE, "Release unknown connection %s://%s:%d from entries %s", type, host, port, entries.keys())
        else:
            log.warn(LOG_CACHE, "Release unknown connection %s://%s:%d", type, host, port)

    @synchronized(_lock)
    def remove_expired (self):
        """Remove expired or soon to be expired connections from this pool."""
        t = time.time()
        for lock, entries in self.connections.values():
            delete_entries = []
            for id, conn_data in entries.items():
                # compare against ConnectionState.available (not the
                # 'available' string literal) and reuse is_expired() for
                # consistency with get()
                if conn_data[1] == ConnectionState.available and is_expired(t, conn_data):
                    try_close(conn_data[0])
                    # BUGFIX: delete_entries is a list; the previous code
                    # called .add() which raised AttributeError on the
                    # first expired connection (get() uses .append too)
                    delete_entries.append(id)
            for id in delete_entries:
                del entries[id]
                # free the semaphore slot held by the removed connection
                lock.release()
                log.debug(LOG_CACHE, "released lock for id %s", id)

    @synchronized(_lock)
    def clear (self):
        """Remove all connections from this cache, even if busy."""
        for lock, entries in self.connections.values():
            for conn_data in entries.values():
                try_close(conn_data[0])
        self.connections.clear()
def try_close (connection):
    """Close and remove a connection (not thread-safe, internal use only).

    Any exception raised while closing is swallowed, since the
    connection is being discarded anyway.
    """
    try:
        connection.close()
    except Exception:
        # ignore close errors
        pass

View file

@ -1,83 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Store and retrieve cookies.
"""
from .. import log, LOG_CACHE, cookies
from ..decorators import synchronized
from ..lock import get_lock
_lock = get_lock("cookie")
class CookieJar (object):
    """Cookie storage, implementing the cookie handling policy."""

    def __init__ (self):
        """Initialize empty cookie cache."""
        # Store all cookies in a set.
        self.cache = set()

    @synchronized(_lock)
    def add (self, headers, scheme, host, path):
        """Parse cookie values from the given response headers and add
        them to the cache, replacing equal cookies and dropping ones
        that are already expired.
        @return: list of error messages for cookies that failed to parse
        @rtype: list of strings
        """
        errors = []
        # The two supported cookie headers differ only in the cookie
        # class and the error message; handle both with one loop
        # instead of the previous duplicated code.
        # (header name, cookie class, error message template)
        cookie_types = (
            # RFC 2109 (Netscape) cookie type
            ("Set-Cookie", cookies.NetscapeCookie,
             "Invalid cookie %r for %s:%s%s: %s"),
            # RFC 2965 cookie type
            ("Set-Cookie2", cookies.Rfc2965Cookie,
             "Invalid cookie2 %r for %s:%s%s: %s"),
        )
        for headername, cookieclass, errmsg_tmpl in cookie_types:
            for h in headers.getallmatchingheaders(headername):
                # the cookie data is everything after the "Name:" prefix
                value = h.split(':', 1)[1]
                try:
                    cookie = cookieclass(value, scheme, host, path)
                    # replace a previously stored equal cookie
                    self.cache.discard(cookie)
                    if not cookie.is_expired():
                        self.cache.add(cookie)
                except cookies.CookieError as msg:
                    errors.append(errmsg_tmpl % (h, scheme, host, path, msg))
        return errors

    @synchronized(_lock)
    def get (self, scheme, host, port, path):
        """Cookie cache getter function. Return ordered list of cookies
        which match the given host, port and path.
        Cookies with more specific paths are listed first."""
        # named "matching" instead of "cookies" to avoid shadowing the
        # imported cookies module
        matching = [x for x in self.cache if x.check_expired() and
                    x.is_valid_for(scheme, host, port, path)]
        # order cookies with more specific (ie. longer) paths first
        matching.sort(key=lambda c: len(c.attributes['path']), reverse=True)
        log.debug(LOG_CACHE, "Found %d cookies for host %r path %r",
                  len(matching), host, path)
        return matching

    @synchronized(_lock)
    def __str__ (self):
        """Return stored cookies as string."""
        return "<CookieJar with %s>" % self.cache

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2012 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -23,9 +23,6 @@ from time import time as _time
from .. import log, LOG_CACHE
LARGE_QUEUE_THRESHOLD = 1000
FRONT_CHUNK_SIZE = 100
class Timeout (StandardError):
"""Raised by join()"""
pass
@ -55,8 +52,8 @@ class UrlQueue (object):
self.all_tasks_done = threading.Condition(self.mutex)
self.unfinished_tasks = 0
self.finished_tasks = 0
self.in_progress = {}
self.seen = {}
self.in_progress = 0
self.seen = set()
self.shutdown = False
# Each put() decreases the number of allowed puts.
# This way we can restrict the number of URLs that are checked.
@ -103,24 +100,29 @@ class UrlQueue (object):
if remaining <= 0.0:
raise Empty()
self.not_empty.wait(remaining)
url_data = self.queue.popleft()
if url_data.has_result:
# Already checked and copied from cache.
pass
else:
key = url_data.cache_url_key
assert key is not None
self.in_progress[key] = url_data
return url_data
self.in_progress += 1
return self.queue.popleft()
def put (self, item):
"""Put an item into the queue.
Block if necessary until a free slot is available.
"""
if self.put_denied(item):
return
with self.mutex:
self._put(item)
self.not_empty.notify()
def put_denied(self, url_data):
"""Determine if put() will not append the item on the queue.
@return True (reliable) or False (unreliable)
"""
if self.shutdown or self.allowed_puts == 0:
return True
if url_data.cache_url_key is not None and url_data.cache_url_key in self.seen:
return True
return False
def _put (self, url_data):
"""Put URL in queue, increase number of unfished tasks."""
if self.shutdown:
@ -133,17 +135,16 @@ class UrlQueue (object):
self.allowed_puts -= 1
log.debug(LOG_CACHE, "queueing %s", url_data)
key = url_data.cache_url_key
# cache key is None for URLs with invalid syntax
assert key is not None or url_data.has_result, "invalid cache key in %s" % url_data
if key in self.seen:
self.seen[key] += 1
if key is not None:
# do not check duplicate URLs
if key is not None:
if key in self.seen:
# don't check duplicate URLs
return
else:
self.seen[key] = 0
self.queue.append(url_data)
self.seen.add(key)
self.unfinished_tasks += 1
if url_data.has_result:
self.queue.appendleft(url_data)
else:
self.queue.append(url_data)
def task_done (self, url_data):
"""
@ -163,17 +164,11 @@ class UrlQueue (object):
with self.all_tasks_done:
log.debug(LOG_CACHE, "task_done %s", url_data)
# check for aliases (eg. through HTTP redirections)
if hasattr(url_data, "aliases"):
for key in url_data.aliases:
if key in self.seen:
self.seen[key] += 1
else:
self.seen[key] = 0
key = url_data.cache_url_key
if key in self.in_progress:
del self.in_progress[key]
if hasattr(url_data, "aliases") and url_data.aliases:
self.seen.update(url_data.aliases)
self.finished_tasks += 1
self.unfinished_tasks -= 1
self.in_progress -= 1
if self.unfinished_tasks <= 0:
if self.unfinished_tasks < 0:
raise ValueError('task_done() called too many times')
@ -216,7 +211,5 @@ class UrlQueue (object):
def status (self):
"""Get tuple (finished tasks, in progress, queue size)."""
with self.mutex:
return (self.finished_tasks,
len(self.in_progress), len(self.queue))
# no need to acquire self.mutex since the numbers are unreliable anyways.
return (self.finished_tasks, self.in_progress, len(self.queue))

View file

@ -101,43 +101,46 @@ def get_url_from (base_url, recursion_level, aggregate,
base_ref = strformat.unicode_safe(base_ref)
name = strformat.unicode_safe(name)
url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
scheme = None
if not (url or name):
# use filename as base url, with slash as path seperator
name = base_url.replace("\\", "/")
if parent_content_type == 'application/x-httpd-php' and \
'<?' in base_url and '?>' in base_url and url.startswith('file:'):
# ignore but warn about URLs from local PHP files with execution directives
elif ":" in url:
scheme = url.split(":", 1)[0].lower()
allowed_schemes = aggregate.config["allowedschemes"]
# ignore local PHP files with execution directives
local_php = (parent_content_type == 'application/x-httpd-php' and
'<?' in base_url and '?>' in base_url and scheme == 'file')
if local_php or (allowed_schemes and scheme not in allowed_schemes):
klass = ignoreurl.IgnoreUrl
else:
assume_local_file = recursion_level == 0
klass = get_urlclass_from(url, assume_local_file=assume_local_file)
assume_local_file = (recursion_level == 0)
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
return klass(base_url, recursion_level, aggregate,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name, extern=extern)
def get_urlclass_from (url, assume_local_file=False):
"""Return checker class for given URL. If URL does not start
with a URL scheme and assume_local_file is True, assume that
the given URL is a local file."""
if url.startswith("http:"):
def get_urlclass_from (scheme, assume_local_file=False):
"""Return checker class for given URL scheme. If the scheme
cannot be matched and assume_local_file is True, assume a local file.
"""
if scheme in ("http", "https"):
klass = httpurl.HttpUrl
elif url.startswith("ftp:"):
elif scheme == "ftp":
klass = ftpurl.FtpUrl
elif url.startswith("file:"):
elif scheme == "file":
klass = fileurl.FileUrl
elif url.startswith("telnet:"):
elif scheme == "telnet":
klass = telneturl.TelnetUrl
elif url.startswith("mailto:"):
elif scheme == "mailto":
klass = mailtourl.MailtoUrl
elif url.startswith("https:"):
klass = httpsurl.HttpsUrl
elif url.startswith(("nntp:", "news:", "snews:")):
elif scheme in ("nntp", "news", "snews"):
klass = nntpurl.NntpUrl
elif url.startswith('dns:'):
elif scheme == "dns":
klass = dnsurl.DnsUrl
elif unknownurl.is_unknown_url(url):
elif scheme and unknownurl.is_unknown_scheme(scheme):
klass = unknownurl.UnknownUrl
elif assume_local_file:
klass = fileurl.FileUrl
@ -168,4 +171,4 @@ def get_index_html (urls):
# all the URL classes
from . import (fileurl, unknownurl, ftpurl, httpurl, dnsurl,
httpsurl, mailtourl, telneturl, nntpurl, ignoreurl)
mailtourl, telneturl, nntpurl, ignoreurl)

View file

@ -21,8 +21,8 @@ import socket
import select
import nntplib
import ftplib
import httplib as orighttplib
from .. import LinkCheckerError, httplib2 as httplib
import requests
from .. import LinkCheckerError
from dns.exception import DNSException
# Catch these exception on syntax checks.
@ -45,9 +45,8 @@ ExcCacheList = [
nntplib.error_perm,
nntplib.error_proto,
EOFError,
# http error
httplib.error,
orighttplib.error,
# http errors
requests.exceptions.RequestException,
# ftp errors
ftplib.error_reply,
ftplib.error_temp,
@ -75,39 +74,25 @@ ExcList = ExcCacheList + ExcNoCacheList
# some constants
URL_MAX_LENGTH = 2000
URL_WARN_LENGTH = 255
URL_WARN_LENGTH = 1024
# the warnings
WARN_URL_EFFECTIVE_URL = "url-effective-url"
WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content"
WARN_URL_ANCHOR_NOT_FOUND = "url-anchor-not-found"
WARN_URL_WARNREGEX_FOUND = "url-warnregex-found"
WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
WARN_URL_CONTENT_SIZE_UNEQUAL = "url-content-size-unequal"
WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip"
WARN_URL_TOO_LONG = "url-too-long"
WARN_URL_WHITESPACE = "url-whitespace"
WARN_FILE_MISSING_SLASH = "file-missing-slash"
WARN_FILE_SYSTEM_PATH = "file-system-path"
WARN_FTP_MISSING_SLASH = "ftp-missing-slash"
WARN_HTTP_ROBOTS_DENIED = "http-robots-denied"
WARN_HTTP_MOVED_PERMANENT = "http-moved-permanent"
WARN_HTTP_EMPTY_CONTENT = "http-empty-content"
WARN_HTTP_COOKIE_STORE_ERROR = "http-cookie-store-error"
WARN_HTTP_DECOMPRESS_ERROR = "http-decompress-error"
WARN_HTTP_UNSUPPORTED_ENCODING = "http-unsupported-encoding"
WARN_HTTP_AUTH_UNKNOWN = "http-auth-unknonwn"
WARN_HTTP_AUTH_UNAUTHORIZED = "http-auth-unauthorized"
WARN_HTTPS_CERTIFICATE = "https-certificate-error"
WARN_IGNORE_URL = "ignore-url"
WARN_MAIL_NO_MX_HOST = "mail-no-mx-host"
WARN_MAIL_UNVERIFIED_ADDRESS = "mail-unverified-address"
WARN_MAIL_NO_CONNECTION = "mail-no-connection"
WARN_NNTP_NO_SERVER = "nntp-no-server"
WARN_NNTP_NO_NEWSGROUP = "nntp-no-newsgroup"
WARN_SYNTAX_HTML = "syntax-html"
WARN_SYNTAX_CSS = "syntax-css"
# registered warnings
Warnings = {
@ -115,41 +100,20 @@ Warnings = {
_("The effective URL is different from the original."),
WARN_URL_ERROR_GETTING_CONTENT:
_("Could not get the content of the URL."),
WARN_URL_ANCHOR_NOT_FOUND: _("URL anchor was not found."),
WARN_URL_WARNREGEX_FOUND:
_("The warning regular expression was found in the URL contents."),
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
WARN_URL_CONTENT_SIZE_UNEQUAL: _("The URL content size and download size are unequal."),
WARN_URL_TOO_LONG: _("The URL is longer than the recommended size."),
WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."),
WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."),
WARN_FILE_SYSTEM_PATH:
_("The file: path is not the same as the system specific path."),
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
WARN_HTTP_ROBOTS_DENIED: _("The http: URL checking has been denied."),
WARN_HTTP_MOVED_PERMANENT: _("The URL has moved permanently."),
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
WARN_HTTP_COOKIE_STORE_ERROR:
_("An error occurred while storing a cookie."),
WARN_HTTP_DECOMPRESS_ERROR:
_("An error occurred while decompressing the URL content."),
WARN_HTTP_UNSUPPORTED_ENCODING:
_("The URL content is encoded with an unknown encoding."),
WARN_HTTP_AUTH_UNKNOWN:
_("Unsupported HTTP authentication method."),
WARN_HTTP_AUTH_UNAUTHORIZED:
_("Unauthorized access without HTTP authentication."),
WARN_HTTPS_CERTIFICATE: _("The SSL certificate is invalid or expired."),
WARN_IGNORE_URL: _("The URL has been ignored."),
WARN_MAIL_NO_MX_HOST: _("The mail MX host could not be found."),
WARN_MAIL_UNVERIFIED_ADDRESS:
_("The mailto: address could not be verified."),
WARN_MAIL_NO_CONNECTION:
_("No connection to a MX host could be established."),
WARN_NNTP_NO_SERVER: _("No NNTP server was found."),
WARN_NNTP_NO_NEWSGROUP: _("The NNTP newsgroup could not be found."),
WARN_URL_OBFUSCATED_IP: _("The IP is obfuscated."),
WARN_SYNTAX_HTML: _("HTML syntax error."),
WARN_SYNTAX_CSS: _("CSS syntax error."),
}

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -25,7 +25,7 @@ import urllib
import urllib2
from datetime import datetime
from . import urlbase, get_index_html, get_url_from
from . import urlbase, get_index_html
from .. import log, LOG_CHECK, fileutil, LinkCheckerError, url as urlutil
from ..bookmarks import firefox
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
@ -163,8 +163,6 @@ class FileUrl (urlbase.UrlBase):
return
filename = self.get_os_filename()
self.size = fileutil.get_size(filename)
if self.dlsize == -1:
self.dlsize = self.size
self.modified = datetime.utcfromtimestamp(fileutil.get_mtime(filename))
def check_connection (self):
@ -203,16 +201,13 @@ class FileUrl (urlbase.UrlBase):
def read_content (self):
"""Return file content, or in case of directories a dummy HTML file
with links to the files."""
if self.size > self.MaxFilesizeBytes:
raise LinkCheckerError(_("File size too large"))
if self.is_directory():
data = get_index_html(get_files(self.get_os_filename()))
if isinstance(data, unicode):
data = data.encode("iso8859-1", "ignore")
size = len(data)
else:
data, size = super(FileUrl, self).read_content()
return data, size
data = super(FileUrl, self).read_content()
return data
def is_html (self):
"""Check if file is a HTML file."""
@ -272,27 +267,6 @@ class FileUrl (urlbase.UrlBase):
log.debug(LOG_CHECK, "File with content type %r is not parseable.", ctype)
return False
def parse_url (self):
"""Parse file contents for new links to check."""
if self.is_directory():
self.parse_html()
elif firefox.has_sqlite and firefox.extension.search(self.url):
self.parse_firefox()
else:
mime = self.get_content_type()
key = self.ContentMimetypes[mime]
getattr(self, "parse_"+key)()
self.add_num_url_info()
def parse_firefox (self):
"""Parse a Firefox3 bookmark file."""
log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self)
filename = self.get_os_filename()
for url, name in firefox.parse_bookmark_file(filename):
url_data = get_url_from(url, self.recursion_level+1,
self.aggregate, parent_url=self.url, name=name)
self.aggregate.urlqueue.put(url_data)
def get_content_type (self):
"""Return URL content type, or an empty string if content
type could not be found."""
@ -326,6 +300,5 @@ class FileUrl (urlbase.UrlBase):
webroot = self.aggregate.config["localwebroot"]
if webroot and url and url.startswith(u"/"):
url = webroot + url[1:]
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.",
webroot, url)
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
super(FileUrl, self).add_url(url, line=line, column=column, name=name, base=base)

View file

@ -22,11 +22,11 @@ import ftplib
from cStringIO import StringIO
from .. import log, LOG_CHECK, LinkCheckerError, fileutil
from . import proxysupport, httpurl, internpaturl, get_index_html, pooledconnection
from . import proxysupport, httpurl, internpaturl, get_index_html
from .const import WARN_FTP_MISSING_SLASH
class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledconnection.PooledConnection):
class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with ftp scheme.
"""
@ -70,14 +70,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
def login (self):
"""Log into ftp server and check the welcome message."""
def create_connection(scheme, host, port):
"""Create a new ftp connection."""
connection = ftplib.FTP(timeout=self.aggregate.config["timeout"])
if log.is_debug(LOG_CHECK):
connection.set_debuglevel(1)
return connection
scheme, host, port = self.get_netloc()
self.get_pooled_connection(scheme, host, port, create_connection)
self.url_connection = ftplib.FTP(timeout=self.aggregate.config["timeout"])
if log.is_debug(LOG_CHECK):
self.url_connection.set_debuglevel(1)
try:
self.url_connection.connect(self.host, self.port)
_user, _password = self.get_user_password()
@ -92,6 +87,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
# note that the info may change every time a user logs in,
# so don't add it to the url_data info.
log.debug(LOG_CHECK, "FTP info %s", info)
pass
else:
raise LinkCheckerError(_("Got no answer from FTP server"))
except EOFError as msg:
@ -105,6 +101,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
features = self.url_connection.sendcmd("FEAT")
except ftplib.error_perm as msg:
log.debug(LOG_CHECK, "Ignoring error when getting FTP features: %s" % msg)
pass
else:
log.debug(LOG_CHECK, "FTP features %s", features)
if " UTF-8" in features.splitlines():
@ -176,7 +173,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
"""See if URL target is parseable for recursion."""
if self.is_directory():
return True
ctype = self.get_content_type(self.get_content)
ctype = self.get_content_type()
if ctype in self.ContentMimetypes:
return True
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", ctype)
@ -188,20 +185,11 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
path = self.urlparts[2]
return (not path) or path.endswith('/')
def parse_url (self):
"""Parse URL target for links."""
if self.is_directory():
self.parse_html()
return
key = self.ContentMimetypes[self.get_content_type(self.get_content)]
getattr(self, "parse_"+key)()
self.add_num_url_info()
def get_content_type (self, read=None):
def get_content_type (self):
"""Return URL content type, or an empty string if content
type could not be found."""
if self.content_type is None:
self.content_type = fileutil.guess_mimetype(self.url, read=read)
self.content_type = fileutil.guess_mimetype(self.url, read=self.get_content)
return self.content_type
def read_content (self):
@ -210,6 +198,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
if self.is_directory():
self.url_connection.cwd(self.filename)
self.files = self.get_files()
# XXX limit number of files?
data = get_index_html(self.files)
else:
# download file in BINARY mode
@ -217,20 +206,20 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
buf = StringIO()
def stor_data (s):
"""Helper method storing given data"""
self.aggregate.add_download_data(self.cache_content_key, s)
# limit the download size
if (buf.tell() + len(s)) > self.MaxFilesizeBytes:
if (buf.tell() + len(s)) > self.max_size:
raise LinkCheckerError(_("FTP file size too large"))
buf.write(s)
self.url_connection.retrbinary(ftpcmd, stor_data)
data = buf.getvalue()
buf.close()
return data, len(data)
return data
def close_connection (self):
"""Release the open connection from the connection pool."""
if self.url_connection is None:
return
scheme, host, port = self.get_netloc()
self.aggregate.connections.release(scheme, host, port, self.url_connection)
self.url_connection = None
if self.url_connection is not None:
try:
self.url_connection.quit()
except Exception:
pass
self.url_connection = None

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2012 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,179 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Handle https links.
"""
import time
from . import httpurl
from .const import WARN_HTTPS_CERTIFICATE
from .. import log, LOG_CHECK, strformat
class HttpsUrl (httpurl.HttpUrl):
"""
Url link with https scheme.
"""
def local_check (self):
"""
Check connection if SSL is supported, else ignore.
"""
if httpurl.supportHttps:
super(HttpsUrl, self).local_check()
else:
self.add_info(_("%s URL ignored.") % self.scheme.capitalize())
def get_http_object (self, scheme, host, port):
"""Open a HTTP connection and check the SSL certificate."""
super(HttpsUrl, self).get_http_object(scheme, host, port)
self.check_ssl_certificate(self.url_connection.sock, host)
def check_ssl_certificate(self, ssl_sock, host):
"""Run all SSL certificate checks that have not yet been done.
OpenSSL already checked the SSL notBefore and notAfter dates.
"""
if not hasattr(ssl_sock, "getpeercert"):
# the URL was a HTTPS -> HTTP redirect
return
cert = ssl_sock.getpeercert()
log.debug(LOG_CHECK, "Got SSL certificate %s", cert)
if not cert:
return
if 'subject' in cert:
self.check_ssl_hostname(ssl_sock, cert, host)
else:
msg = _('certificate did not include "subject" information')
self.add_ssl_warning(ssl_sock, msg)
if 'notAfter' in cert:
self.check_ssl_valid_date(ssl_sock, cert)
else:
msg = _('certificate did not include "notAfter" information')
self.add_ssl_warning(ssl_sock, msg)
def check_ssl_hostname(self, ssl_sock, cert, host):
"""Check the hostname against the certificate according to
RFC2818.
"""
try:
match_hostname(cert, host)
except CertificateError as msg:
self.add_ssl_warning(ssl_sock, msg)
def check_ssl_valid_date(self, ssl_sock, cert):
"""Check if the certificate is still valid, or if configured check
if it's at least a number of days valid.
"""
import ssl
checkDaysValid = self.aggregate.config["warnsslcertdaysvalid"]
try:
notAfter = ssl.cert_time_to_seconds(cert['notAfter'])
except ValueError as msg:
msg = _('invalid certficate "notAfter" value %r') % cert['notAfter']
self.add_ssl_warning(ssl_sock, msg)
return
curTime = time.time()
# Calculate seconds until certifcate expires. Can be negative if
# the certificate is already expired.
secondsValid = notAfter - curTime
if secondsValid < 0:
msg = _('certficate is expired on %s') % cert['notAfter']
self.add_ssl_warning(ssl_sock, msg)
elif checkDaysValid > 0 and \
secondsValid < (checkDaysValid * strformat.SECONDS_PER_DAY):
strSecondsValid = strformat.strduration_long(secondsValid)
msg = _('certificate is only %s valid') % strSecondsValid
self.add_ssl_warning(ssl_sock, msg)
def add_ssl_warning(self, ssl_sock, msg):
"""Add a warning message about an SSL certificate error."""
cipher_name, ssl_protocol, secret_bits = ssl_sock.cipher()
err = _(u"SSL warning: %(msg)s. Cipher %(cipher)s, %(protocol)s.")
attrs = dict(msg=msg, cipher=cipher_name, protocol=ssl_protocol)
self.add_warning(err % attrs, tag=WARN_HTTPS_CERTIFICATE)
# Copied from ssl.py in Python 3:
# Wrapper module for _ssl, providing some additional facilities
# implemented in Python. Written by Bill Janssen.
import re
class CertificateError(ValueError):
"""Raised on certificate errors."""
pass
def _dnsname_to_pat(dn, max_wildcards=1):
"""Convert a DNS certificate name to a hostname matcher."""
pats = []
for frag in dn.split(r'.'):
if frag.count('*') > max_wildcards:
# Issue #17980: avoid denials of service by refusing more
# than one wildcard per fragment. A survery of established
# policy among SSL implementations showed it to be a
# reasonable choice.
raise CertificateError(
"too many wildcards in certificate DNS name: " + repr(dn))
if frag == '*':
# When '*' is a fragment by itself, it matches a non-empty dotless
# fragment.
pats.append('[^.]+')
else:
# Otherwise, '*' matches any dotless fragment.
frag = re.escape(frag)
pats.append(frag.replace(r'\*', '[^.]*'))
return re.compile(r'\A' + r'\.'.join(pats) + r'\Z', re.IGNORECASE)
def match_hostname(cert, hostname):
"""Verify that *cert* (in decoded format as returned by
SSLSocket.getpeercert()) matches the *hostname*. RFC 2818 rules
are mostly followed, but IP addresses are not accepted for *hostname*.
CertificateError is raised on failure. On success, the function
returns nothing.
"""
if not cert:
raise ValueError("empty or no certificate")
dnsnames = []
san = cert.get('subjectAltName', ())
for key, value in san:
if key == 'DNS':
if _dnsname_to_pat(value).match(hostname):
return
dnsnames.append(value)
if not dnsnames:
# The subject is only checked when there is no dNSName entry
# in subjectAltName
for sub in cert.get('subject', ()):
for key, value in sub:
# XXX according to RFC 2818, the most specific Common Name
# must be used.
if key == 'commonName':
if _dnsname_to_pat(value).match(hostname):
return
dnsnames.append(value)
if len(dnsnames) > 1:
raise CertificateError("hostname %r "
"doesn't match either of %s"
% (hostname, ', '.join(map(repr, dnsnames))))
elif len(dnsnames) == 1:
raise CertificateError("hostname %r "
"doesn't match %r"
% (hostname, dnsnames[0]))
else:
raise CertificateError("no appropriate commonName or "
"subjectAltName fields were found")

View file

@ -18,26 +18,14 @@
Handle http links.
"""
import urlparse
import os
import errno
import zlib
import socket
import rfc822
import time
import requests
from cStringIO import StringIO
from datetime import datetime
from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
httplib2 as httplib, LinkCheckerError, httputil, configuration)
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
get_url_from, pooledconnection)
from .. import (log, LOG_CHECK, strformat,
url as urlutil, LinkCheckerError)
from . import (internpaturl, proxysupport, httpheaders as headers)
# import warnings
from .const import WARN_HTTP_ROBOTS_DENIED, \
WARN_HTTP_MOVED_PERMANENT, \
WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
WARN_HTTP_AUTH_UNKNOWN, WARN_HTTP_AUTH_UNAUTHORIZED
from .const import WARN_HTTP_EMPTY_CONTENT
# assumed HTTP header encoding
HEADER_ENCODING = "iso-8859-1"
@ -46,18 +34,7 @@ HTTP_SCHEMAS = ('http://', 'https://')
# helper alias
unicode_safe = strformat.unicode_safe
supportHttps = hasattr(httplib, "HTTPSConnection")
SUPPORTED_ENCODINGS = ('x-gzip', 'gzip', 'deflate')
# Accept-Encoding header value
ACCEPT_ENCODING = ",".join(SUPPORTED_ENCODINGS)
# Accept-Charset header value
ACCEPT_CHARSET = "utf-8,ISO-8859-1;q=0.7,*;q=0.3"
# Accept mime type header value
ACCEPT = "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledconnection.PooledConnection):
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with http scheme.
"""
@ -67,28 +44,16 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
Initialize HTTP specific variables.
"""
super(HttpUrl, self).reset()
self.max_redirects = 5
self.has301status = False
# flag if connection is persistent
self.persistent = False
# URLs seen through 301/302 redirections
# URLs seen through redirections
self.aliases = []
# initialize check data
self.headers = None
self.headers = {}
self.auth = None
self.cookies = []
# temporary data filled when reading redirections
self._data = None
# flag telling if GET method is allowed; determined by robots.txt
self.method_get_allowed = True
# HttpResponse object
self.response = None
def allows_robots (self, url):
"""
Fetch and parse the robots.txt of given url. Checks if LinkChecker
can get the requested resource content. HEAD requests however are
still allowed.
can get the requested resource content.
@param url: the url to be requested
@type url: string
@ -98,9 +63,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
roboturl = self.get_robots_txt_url()
user, password = self.get_user_password()
rb = self.aggregate.robots_txt
callback = self.aggregate.connections.host_wait
return rb.allows_url(roboturl, url, self.proxy, user, password,
callback=callback)
#callback = self.aggregate.connections.host_wait
return rb.allows_url(roboturl, self.url, self.proxy, user, password)
def add_size_info (self):
"""Get size of URL content from HTTP header."""
@ -110,8 +74,6 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
# the content data is always decoded.
try:
self.size = int(self.getheader("Content-Length"))
if self.dlsize == -1:
self.dlsize = self.size
except (ValueError, OverflowError):
pass
else:
@ -134,164 +96,56 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
- 5xx: Server Error - The server failed to fulfill an apparently
valid request
"""
self.session = self.aggregate.get_request_session()
# set the proxy, so a 407 status after this is an error
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
self.construct_auth()
# check robots.txt
if not self.allows_robots(self.url):
# remove all previously stored results
self.add_warning(
_("Access denied by robots.txt, skipping content checks."),
tag=WARN_HTTP_ROBOTS_DENIED)
self.method_get_allowed = False
# first try with HEAD
self.method = "HEAD"
self.add_info(_("Access denied by robots.txt, checked only syntax."))
self.set_result(_("syntax OK"))
self.do_check_content = False
return
# check the http connection
self.check_http_connection()
# redirections might have changed the URL
self.url = urlutil.urlunsplit(self.urlparts)
# check response
if self.response is not None:
self.check_response()
self.close_response()
request = self.build_request()
self.send_request(request)
self.follow_redirections(request)
self.check_response()
def check_http_connection (self):
"""
Check HTTP connection and return get response and a flag
if the check algorithm had to fall back to the GET method.
def build_request(self):
"""Build a prepared request object."""
clientheaders = {
"User-Agent": self.aggregate.config["useragent"],
"DNT": "1",
}
if (self.parent_url and
self.parent_url.lower().startswith(HTTP_SCHEMAS)):
clientheaders["Referer"] = self.parent_url
kwargs = dict(
method='GET',
url=self.url,
headers=clientheaders,
)
if self.auth:
kwargs['auth'] = self.auth
log.debug(LOG_CHECK, "Prepare request with %s", kwargs)
request = requests.Request(**kwargs)
return self.session.prepare_request(request)
@return: response or None if url is already handled
@rtype: HttpResponse or None
"""
while True:
# XXX refactor this
self.close_response()
try:
self._try_http_response()
except httplib.BadStatusLine as msg:
# some servers send empty HEAD replies
if self.method == "HEAD" and self.method_get_allowed:
log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
self.fallback_to_get()
continue
raise
except socket.error as msg:
# some servers reset the connection on HEAD requests
if self.method == "HEAD" and self.method_get_allowed and \
msg[0] == errno.ECONNRESET:
self.fallback_to_get()
continue
raise
uheaders = unicode_safe(self.headers, encoding=HEADER_ENCODING)
log.debug(LOG_CHECK, "Headers: %s", uheaders)
# proxy enforcement (overrides standard proxy)
if self.response.status == 305 and self.headers:
oldproxy = (self.proxy, self.proxyauth)
newproxy = self.getheader("Location")
if newproxy:
self.add_info(_("Enforced proxy `%(name)s'.") %
{"name": newproxy})
self.set_proxy(newproxy)
self.close_response()
if self.proxy is None:
self.set_result(
_("Missing 'Location' header with enforced proxy status 305, aborting."),
valid=False)
return
elif not self.proxy:
self.set_result(
_("Empty 'Location' header value with enforced proxy status 305, aborting."),
valid=False)
return
self._try_http_response()
# restore old proxy settings
self.proxy, self.proxyauth = oldproxy
try:
tries = self.follow_redirections()
except httplib.BadStatusLine as msg:
# some servers send empty HEAD replies
if self.method == "HEAD" and self.method_get_allowed:
log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
self.fallback_to_get()
continue
raise
if tries == -1:
log.debug(LOG_CHECK, "already handled")
self.close_response()
self.do_check_content = False
return
if tries >= self.max_redirects:
if self.method == "HEAD" and self.method_get_allowed:
# Microsoft servers tend to recurse HEAD requests
self.fallback_to_get()
continue
self.set_result(_("more than %d redirections, aborting") %
self.max_redirects, valid=False)
self.close_response()
self.do_check_content = False
return
if self.do_fallback(self.response.status):
self.fallback_to_get()
continue
# user authentication
if self.response.status == 401:
authenticate = self.getheader('WWW-Authenticate')
if authenticate is None:
# Either the server intentionally blocked this request,
# or there is a form on this page which requires
# manual user/password input.
# Either way, this is a warning.
self.add_warning(_("Unauthorized access without HTTP authentication."),
tag=WARN_HTTP_AUTH_UNAUTHORIZED)
return
if not authenticate.startswith("Basic"):
# LinkChecker only supports Basic authorization
args = {"auth": authenticate}
self.add_warning(
_("Unsupported HTTP authentication `%(auth)s', " \
"only `Basic' authentication is supported.") % args,
tag=WARN_HTTP_AUTH_UNKNOWN)
return
if not self.auth:
self.construct_auth()
if self.auth:
continue
break
def do_fallback(self, status):
"""Check for fallback according to response status.
@param status: The HTTP response status
@ptype status: int
@return: True if checker should use GET, else False
@rtype: bool
"""
if self.method == "HEAD":
# Some sites do not support HEAD requests, for example
# youtube sends a 404 with HEAD, 200 with GET. Doh.
# A 405 "Method not allowed" status should also use GET.
if status >= 400:
log.debug(LOG_CHECK, "Method HEAD error %d, falling back to GET", status)
return True
# Other sites send 200 with HEAD, but 404 with GET. Bummer.
poweredby = self.getheader('X-Powered-By', u'')
server = self.getheader('Server', u'')
# Some servers (Zope, Apache Coyote/Tomcat, IIS have wrong
# content type with HEAD. This seems to be a common problem.
if (poweredby.startswith('Zope') or server.startswith('Zope')
or server.startswith('Apache-Coyote')
or ('ASP.NET' in poweredby and 'Microsoft-IIS' in server)):
return True
return False
def fallback_to_get(self):
"""Set method to GET and clear aliases."""
self.close_response()
self.close_connection()
self.method = "GET"
self.aliases = []
self.urlparts = strformat.url_unicode_split(self.url)
self.build_url_parts()
def send_request(self, request):
"""Send request and store response in self.url_connection."""
# throttle the number of requests to each host
self.aggregate.wait_for_host(self.urlparts[1])
kwargs = dict(
stream=True,
timeout=self.aggregate.config["timeout"],
allow_redirects=False,
)
if self.scheme == "https" and self.aggregate.config["sslverify"]:
kwargs["verify"] = self.aggregate.config["sslverify"]
log.debug(LOG_CHECK, "Send request with %s", kwargs)
self.url_connection = self.session.send(request, **kwargs)
self.headers = self.url_connection.headers
def construct_auth (self):
"""Construct HTTP Basic authentication credentials if there
@ -301,162 +155,34 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
return
_user, _password = self.get_user_password()
if _user is not None and _password is not None:
credentials = httputil.encode_base64("%s:%s" % (_user, _password))
self.auth = "Basic " + credentials
log.debug(LOG_CHECK, "Using basic authentication")
self.auth = (_user, _password)
def get_content_type (self):
"""Return content MIME type or empty string."""
if self.content_type is None:
if self.headers:
self.content_type = headers.get_content_type(self.headers)
else:
self.content_type = u""
if not self.content_type:
self.content_type = headers.get_content_type(self.headers)
return self.content_type
def follow_redirections (self, set_result=True):
def follow_redirections(self, request):
"""Follow all redirections of http response."""
log.debug(LOG_CHECK, "follow all redirections")
redirected = self.url
tries = 0
while self.response.status in [301, 302] and self.headers and \
tries < self.max_redirects:
num = self.follow_redirection(set_result, redirected)
if num == -1:
return num
redirected = urlutil.urlunsplit(self.urlparts)
tries += num
return tries
def follow_redirection (self, set_result, redirected):
"""Follow one redirection of http response."""
newurl = self.getheader("Location",
self.getheader("Uri", u""))
# make new url absolute and unicode
newurl = urlparse.urljoin(redirected, unicode_safe(newurl))
log.debug(LOG_CHECK, "Redirected to %r", newurl)
self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
# norm base url - can raise UnicodeError from url.idna_encode()
redirected, is_idn = urlbase.url_norm(newurl)
log.debug(LOG_CHECK, "Norm redirected to %r", redirected)
urlparts = strformat.url_unicode_split(redirected)
if not self.check_redirection_scheme(redirected, urlparts, set_result):
return -1
if not self.check_redirection_newscheme(redirected, urlparts, set_result):
return -1
if not self.check_redirection_domain(redirected, urlparts,
set_result):
return -1
if not self.check_redirection_robots(redirected, set_result):
return -1
num = self.check_redirection_recursion(redirected, set_result)
if num != 0:
return num
if set_result:
self.check301status()
self.close_response()
self.close_connection()
# remember redirected url as alias
self.aliases.append(redirected)
if self.anchor:
urlparts[4] = self.anchor
# note: urlparts has to be a list
self.urlparts = urlparts
self.build_url_parts()
# store cookies from redirect response
self.store_cookies()
# new response data
self._try_http_response()
return 1
def check_redirection_scheme (self, redirected, urlparts, set_result):
"""Return True if redirection scheme is ok, else False."""
if urlparts[0] in ('ftp', 'http', 'https'):
return True
# For security reasons do not allow redirects to protocols
# other than HTTP, HTTPS or FTP.
if set_result:
self.add_warning(
_("Redirection to url `%(newurl)s' is not allowed.") %
{'newurl': redirected})
self.set_result(_("syntax OK"))
return False
def check_redirection_domain (self, redirected, urlparts, set_result):
"""Return True if redirection domain is ok, else False."""
# XXX does not support user:pass@netloc format
if urlparts[1] != self.urlparts[1]:
# URL domain changed
if self.recursion_level == 0 and urlparts[0] in ('http', 'https'):
# Add intern patterns for redirection of URLs given by the
# user for HTTP schemes.
self.add_intern_pattern(url=redirected)
return True
# check extern filter again
self.extern = None
self.set_extern(redirected)
if self.extern[0] and self.extern[1]:
if set_result:
self.check301status()
self.add_info(_("The redirected URL is outside of the domain "
"filter, checked only syntax."))
self.set_result(_("filtered"))
return False
return True
def check_redirection_robots (self, redirected, set_result):
"""Check robots.txt allowance for redirections. Return True if
allowed, else False."""
if self.allows_robots(redirected):
return True
if set_result:
self.add_warning(
_("Access to redirected URL denied by robots.txt, "
"checked only syntax."), tag=WARN_HTTP_ROBOTS_DENIED)
self.set_result(_("syntax OK"))
return False
def check_redirection_recursion (self, redirected, set_result):
    """Detect redirect loops.

    Returns 0 when the target URL has not been seen before,
    max_redirects to request a GET retry for recursing HEAD requests,
    and -1 for a genuine redirect loop."""
    seen = [self.cache_url_key]
    seen.extend(self.aliases)
    if redirected in seen:
        if self.method == "HEAD" and self.method_get_allowed:
            # Microsoft servers tend to recurse HEAD requests;
            # fall back to the original url and use GET
            return self.max_redirects
        if set_result:
            urls = "\n => ".join(seen + [redirected])
            self.set_result(_("recursive redirection encountered:\n %(urls)s") %
                {"urls": urls}, valid=False)
        return -1
    return 0
def check_redirection_newscheme (self, redirected, urlparts, set_result):
    """Check for HTTP(S)/FTP redirection. Return True for
    redirection with same scheme, else False.

    On a scheme change a new URL object is built and queued for a
    separate check instead of following the redirect in place.
    NOTE(review): control flow reconstructed from a diff rendering
    that stripped indentation; the raise appears to be the
    set_result=False branch -- verify against the original file.
    """
    if urlparts[0] != self.urlparts[0]:
        # changed scheme
        newobj = get_url_from(
            redirected, self.recursion_level, self.aggregate,
            parent_url=self.parent_url, base_ref=self.base_ref,
            line=self.line, column=self.column, name=self.name)
        if set_result:
            self.set_result(_("syntax OK"))
            # append new object to queue
            self.aggregate.urlqueue.put(newobj)
            return False
        raise LinkCheckerError(_('Cannot redirect to different scheme without result'))
    return True
def check301status (self):
    """If response page has been permanently moved add a warning.

    The warning is emitted at most once per URL (guarded by
    self.has301status)."""
    if self.response.status == 301 and not self.has301status:
        self.add_warning(_("HTTP 301 (moved permanent) encountered: you"
                           " should update this link."),
                         tag=WARN_HTTP_MOVED_PERMANENT)
        # remember so repeated redirects do not warn again
        self.has301status = True
kwargs = dict(
stream=True,
)
response = None
for response in self.session.resolve_redirects(self.url_connection, request, **kwargs):
newurl = response.url
log.debug(LOG_CHECK, "Redirected to %r", newurl)
self.aliases.append(newurl)
self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
urlparts = strformat.url_unicode_split(newurl)
if response is not None:
self.urlparts = urlparts
self.build_url_parts()
self.url_connection = response
self.headers = response.headers
self.url = urlutil.urlunsplit(urlparts)
self.scheme = urlparts[0].lower()
def getheader (self, name, default=None):
"""Get decoded header value.
@ -471,271 +197,29 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
def check_response (self):
"""Check final result and log it."""
if self.response.status >= 400:
self.set_result(u"%r %s" % (self.response.status, self.response.reason),
if self.url_connection.status_code >= 400:
self.set_result(u"%d %s" % (self.url_connection.status_code, self.url_connection.reason),
valid=False)
else:
if self.response.status == 204:
if self.url_connection.status_code == 204:
# no content
self.add_warning(self.response.reason,
self.add_warning(self.url_connection.reason,
tag=WARN_HTTP_EMPTY_CONTENT)
# store cookies for valid links
self.store_cookies()
if self.response.status >= 200:
self.set_result(u"%r %s" % (self.response.status, self.response.reason))
if self.url_connection.status_code >= 200:
self.set_result(u"%r %s" % (self.url_connection.status_code, self.url_connection.reason))
else:
self.set_result(_("OK"))
modified = rfc822.parsedate(self.getheader('Last-Modified', u''))
if modified:
self.modified = datetime.utcfromtimestamp(time.mktime(modified))
def _try_http_response (self):
"""Try to get a HTTP response object. For persistent
connections that the server closed unexpected, a new connection
will be opened.
"""
try:
self._get_http_response()
except socket.error as msg:
if msg.args[0] == 32 and self.persistent:
# server closed persistent connection - retry
log.debug(LOG_CHECK, "Server closed connection: retry")
self.persistent = False
self._get_http_response()
else:
raise
except httplib.BadStatusLine as msg:
if self.persistent:
# server closed connection - retry
log.debug(LOG_CHECK, "Empty status line: retry")
self.persistent = False
self._get_http_response()
else:
raise
def _get_http_response (self):
    """Send HTTP request and get response object.

    Opens (or reuses) a connection, sends request line and headers,
    stores the response, its headers and the persistence flag on self,
    and normalizes the response reason phrase."""
    scheme, host, port = self.get_netloc()
    log.debug(LOG_CHECK, "Connecting to %r", host)
    self.get_http_object(scheme, host, port)
    self.add_connection_request()
    self.add_connection_headers()
    self.response = self.url_connection.getresponse(buffering=True)
    self.headers = self.response.msg
    # content type is determined lazily later
    self.content_type = None
    self.persistent = not self.response.will_close
    if self.persistent and self.method == "HEAD":
        # Some servers send page content after a HEAD request,
        # but only after making the *next* request. This breaks
        # protocol synchronisation. Workaround here is to close
        # the connection after HEAD.
        # Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk)
        self.persistent = False
        # Note that for POST method the connection should also be closed,
        # but this method is never used.
    # If possible, use official W3C HTTP response name
    if self.response.status in httplib.responses:
        self.response.reason = httplib.responses[self.response.status]
    if self.response.reason:
        self.response.reason = unicode_safe(self.response.reason)
    log.debug(LOG_CHECK, "Response: %s %s", self.response.status, self.response.reason)
def add_connection_request(self):
    """Send the HTTP request line.

    The anchor fragment is never part of the request path, see
    http://tools.ietf.org/html/rfc2616#section-3.2.2. Proxied
    requests carry the absolute URL in the request line."""
    anchor = ''
    if self.proxy:
        parts = (self.urlparts[0], self.urlparts[1],
                 self.urlparts[2], self.urlparts[3], anchor)
    else:
        parts = ('', '', self.urlparts[2], self.urlparts[3], anchor)
    path = urlutil.urlunsplit(parts)
    self.url_connection.putrequest(self.method, path, skip_host=True,
                                   skip_accept_encoding=True)
def add_connection_headers(self):
    """Send all request headers and terminate the header section.

    Sends Host (original host even when proxied), optional
    (proxy-)authorization, Referer, User-Agent, content-negotiation
    headers, DNT, and -- if enabled -- cookies."""
    # be sure to use the original host as header even for proxies
    self.url_connection.putheader("Host", self.urlparts[1])
    if self.auth:
        # HTTP authorization
        self.url_connection.putheader("Authorization", self.auth)
    if self.proxyauth:
        self.url_connection.putheader("Proxy-Authorization",
            self.proxyauth)
    if (self.parent_url and
        self.parent_url.lower().startswith(HTTP_SCHEMAS)):
        self.url_connection.putheader("Referer", self.parent_url)
    self.url_connection.putheader("User-Agent",
        self.aggregate.config["useragent"])
    # prefer compressed content
    self.url_connection.putheader("Accept-Encoding", ACCEPT_ENCODING)
    # prefer UTF-8 encoding
    self.url_connection.putheader("Accept-Charset", ACCEPT_CHARSET)
    # prefer parseable mime types
    self.url_connection.putheader("Accept", ACCEPT)
    # send do-not-track header
    self.url_connection.putheader("DNT", "1")
    if self.aggregate.config['sendcookies']:
        self.send_cookies()
    # headers complete
    self.url_connection.endheaders()
def store_cookies (self):
    """Save cookies from response headers into the aggregate cookie
    store; a no-op unless cookie storing is enabled in the config."""
    if self.aggregate.config['storecookies']:
        for c in self.cookies:
            # record which cookies were sent with the request
            self.add_info(_("Sent Cookie: %(cookie)s.") %
                          {"cookie": c.client_header_value()})
        errors = self.aggregate.cookies.add(self.headers,
            self.urlparts[0], self.urlparts[1], self.urlparts[2])
        if errors:
            self.add_warning(
                _("Could not store cookies from headers: %(error)s.") %
                {'error': "\n".join(errors)},
                tag=WARN_HTTP_COOKIE_STORE_ERROR)
def send_cookies (self):
    """Add cookie headers to the pending request.

    Cookies matching scheme/host/port/path are packed into a single
    Cookie header bounded by the maximum header length; explicitly
    versioned cookies each get their own header line. Over-long
    cookies are dropped with a debug message."""
    scheme = self.urlparts[0]
    host = self.urlparts[1]
    port = urlutil.default_ports.get(scheme, 80)
    host, port = urlutil.splitport(host, port)
    path = self.urlparts[2] or u"/"
    self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
    if not self.cookies:
        return
    # add one cookie header with all cookie data
    # this is limited by maximum header length
    headername = "Cookie"
    headervalue = ""
    # byte budget: total header size minus name and ": " separator
    max_value_len = headers.MAX_HEADER_BYTES - len(headername) - 2
    for c in self.cookies:
        cookievalue = c.client_header_value()
        if "version" in c.attributes:
            # add separate header for explicit versioned cookie
            if headervalue:
                self.url_connection.putheader(headername, headervalue)
            self.url_connection.putheader(headername, cookievalue)
            headervalue = ""
            continue
        if headervalue:
            cookievalue = "; " + cookievalue
        if (len(headervalue) + len(cookievalue)) < max_value_len:
            headervalue += cookievalue
        else:
            log.debug(LOG_CHECK, "Discard too-long cookie %r", cookievalue)
    if headervalue:
        # flush the accumulated cookie header
        log.debug(LOG_CHECK, "Sending cookie header %s:%s", headername, headervalue)
        self.url_connection.putheader(headername, headervalue)
def get_http_object (self, scheme, host, port):
    """
    Open a HTTP connection, reusing a pooled one when available.

    @param host: the host to connect to
    @ptype host: string of the form <host>[:<port>]
    @param scheme: 'http' or 'https'
    @ptype scheme: string
    @return: None
    @raise LinkCheckerError: for unsupported schemes
    """
    # release any previously held connection first
    self.close_connection()

    def create_connection(scheme, host, port):
        """Create a new http or https connection."""
        kwargs = dict(port=port, strict=True, timeout=self.aggregate.config["timeout"])
        if scheme == "http":
            h = httplib.HTTPConnection(host, **kwargs)
        elif scheme == "https" and supportHttps:
            # use the bundled CA file unless an explicit certificate
            # file was configured via sslverify
            devel_dir = os.path.join(configuration.configdata.install_data, "config")
            sslverify = self.aggregate.config["sslverify"]
            if sslverify:
                if sslverify is not True:
                    kwargs["ca_certs"] = sslverify
                else:
                    kwargs["ca_certs"] = configuration.get_share_file(devel_dir, 'ca-certificates.crt')
            h = httplib.HTTPSConnection(host, **kwargs)
        else:
            msg = _("Unsupported HTTP url scheme `%(scheme)s'") % {"scheme": scheme}
            raise LinkCheckerError(msg)
        if log.is_debug(LOG_CHECK):
            h.set_debuglevel(1)
        return h

    self.get_pooled_connection(scheme, host, port, create_connection)
    self.url_connection.connect()
def read_content (self):
    """Get content of the URL target. The content data is cached after
    the first call to this method.

    Re-requests with GET when the previous request was not a GET
    or no response is available.

    @return: URL content, decompressed and decoded
    @rtype: string
    @raise LinkCheckerError: on redirection errors or oversized content
    """
    assert self.method_get_allowed, 'unallowed content read'
    if self.method != "GET" or self.response is None:
        self.method = "GET"
        self._try_http_response()
        num = self.follow_redirections(set_result=False)
        if not (0 <= num <= self.max_redirects):
            raise LinkCheckerError(_("Redirection error"))
        # Re-read size info, since the GET request result could be different
        # than a former HEAD request.
        self.add_size_info()
    # NOTE(review): nesting reconstructed from a diff rendering that
    # stripped indentation -- verify against the original file.
    if self.size > self.MaxFilesizeBytes:
        raise LinkCheckerError(_("File size too large"))
    self.charset = headers.get_charset(self.headers)
    return self._read_content()
def _read_content (self):
    """Read the response body and decompress it when needed.

    @return: tuple (data, dlsize) where dlsize is the downloaded
        (wire) byte count measured before decompression
    @rtype: tuple (string, int)
    @raise LinkCheckerError: when the body exceeds MaxFilesizeBytes
    """
    # read one extra byte so an over-limit body is detectable
    data = self.response.read(self.MaxFilesizeBytes+1)
    if len(data) > self.MaxFilesizeBytes:
        raise LinkCheckerError(_("File size too large"))
    dlsize = len(data)
    self.aggregate.add_download_data(self.cache_content_key, data)
    encoding = headers.get_content_encoding(self.headers)
    if encoding in SUPPORTED_ENCODINGS:
        try:
            if encoding == 'deflate':
                f = StringIO(zlib.decompress(data))
            else:
                f = gzip.GzipFile('', 'rb', 9, StringIO(data))
        except zlib.error as msg:
            # fall back to the raw data when decompression fails
            log.debug(LOG_CHECK, "Error %s data of len %d", encoding, len(data))
            self.add_warning(_("Decompress error %(err)s") %
                             {"err": str(msg)},
                             tag=WARN_HTTP_DECOMPRESS_ERROR)
            f = StringIO(data)
        try:
            data = f.read()
        finally:
            f.close()
    return data, dlsize
def encoding_supported (self):
    """Return False (and add a warning) when the response declares a
    content encoding this checker cannot decode; True otherwise."""
    encoding = headers.get_content_encoding(self.headers)
    unsupported = (encoding and encoding != 'identity'
                   and encoding not in SUPPORTED_ENCODINGS)
    if unsupported:
        self.add_warning(_("Unsupported content encoding `%(encoding)s'.") %
                         {"encoding": encoding},
                         tag=WARN_HTTP_UNSUPPORTED_ENCODING)
        return False
    return True
def can_get_content(self):
    """Tell whether content may be downloaded for this URL
    (i.e. a GET request is permitted)."""
    allowed = self.method_get_allowed
    return allowed
def content_allows_robots (self):
    """Check robots meta information in the page content, but only
    when downloading content (GET) is allowed; otherwise deny."""
    if not self.method_get_allowed:
        return False
    return super(HttpUrl, self).content_allows_robots()
def check_warningregex (self):
    """Scan page content for the configured warning regex, but only
    when downloading content (GET) is allowed."""
    # NOTE(review): former docstring was copy-pasted from
    # content_allows_robots and did not describe this method.
    if self.method_get_allowed:
        super(HttpUrl, self).check_warningregex()
def read_content(self):
    """Download the response body in chunks and return it as a string.

    Aborts with LinkCheckerError once the configured maximum download
    size would be exceeded. Can be overridden in subclasses.
    NOTE(review): unlike the former docstring claimed, only the data
    is returned, not a (data, size) pair -- confirm against callers.
    """
    maxbytes = self.aggregate.config["maxfilesizedownload"]
    buf = StringIO()
    for data in self.url_connection.iter_content(chunk_size=self.ReadChunkBytes):
        # check the limit before buffering the next chunk
        if buf.tell() + len(data) > maxbytes:
            raise LinkCheckerError(_("File size too large"))
        buf.write(data)
    return buf.getvalue()
def is_html (self):
"""
@ -748,22 +232,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
if not self.valid:
return False
mime = self.get_content_type()
if self.ContentMimetypes.get(mime) != "html":
return False
if self.headers:
return self.encoding_supported()
return True
return self.ContentMimetypes.get(mime) == "html"
def is_css (self):
"""Return True iff content of this url is CSS stylesheet."""
if not self.valid:
return False
mime = self.get_content_type()
if self.ContentMimetypes.get(mime) != "css":
return False
if self.headers:
return self.encoding_supported()
return True
return self.ContentMimetypes.get(mime) == "css"
def is_http (self):
"""
@ -781,30 +257,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
@return: True if content is parseable
@rtype: bool
"""
if not (self.valid and self.headers):
if not self.valid:
return False
ctype = self.get_content_type()
if ctype not in self.ContentMimetypes:
log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
return False
return self.encoding_supported()
def parse_url (self):
    """Dispatch to the parser matching this document's content type
    and collect new links to check."""
    ctype = self.get_content_type()
    if self.is_html():
        self.parse_html()
    elif self.is_css():
        self.parse_css()
    else:
        # content types without a dedicated is_*() predicate
        dispatch = {
            "application/x-shockwave-flash": self.parse_swf,
            "application/msword": self.parse_word,
            "text/vnd.wap.wml": self.parse_wml,
        }
        handler = dispatch.get(ctype)
        if handler is not None:
            handler()
    self.add_num_url_info()
    return True
def get_robots_txt_url (self):
"""
@ -814,28 +273,3 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
@rtype: string
"""
return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])
def close_response(self):
    """Close and forget the HTTP response object, if one is held."""
    response, self.response = self.response, None
    if response is not None:
        response.close()
def close_connection (self):
    """Release the connection from the connection pool. Persistent
    connections will not be closed.

    Idle persistent connections are handed back with a keep-alive
    expiration time; all others are closed first (expiration None).
    """
    log.debug(LOG_CHECK, "Closing %s", self.url_connection)
    if self.url_connection is None:
        # no connection is open
        return
    # add to cached connections
    scheme, host, port = self.get_netloc()
    if self.persistent and self.url_connection.is_idle():
        # reusable: keep alive until the server-advertised timeout
        expiration = time.time() + headers.http_keepalive(self.headers)
    else:
        self.close_response()
        expiration = None
    self.aggregate.connections.release(scheme, host, port, self.url_connection, expiration=expiration)
    self.url_connection = None

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2012 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -21,16 +21,13 @@ Handle for mailto: links.
import re
import urllib
import urlparse
import smtplib
import socket
from email._parseaddr import AddressList
from . import urlbase
from .. import log, LOG_CHECK, strformat, url as urlutil
from dns import resolver
from ..network import iputil
from .const import WARN_MAIL_NO_MX_HOST, \
WARN_MAIL_UNVERIFIED_ADDRESS, WARN_MAIL_NO_CONNECTION
from .const import WARN_MAIL_NO_MX_HOST
def getaddresses (addr):
@ -287,78 +284,9 @@ class MailtoUrl (urlbase.UrlBase):
# debug output
log.debug(LOG_CHECK, "found %d MX mailhosts:", len(answers))
for preference, host in mxdata:
log.debug(LOG_CHECK,
"MX host %r, preference %d", host, preference)
# connect
self.check_smtp_connect(mxdata, username, domain)
def check_smtp_connect (self, mxdata, username, domain):
    """
    Connect to SMTP servers and check emails.

    Tries each MX host in turn until one accepts a connection, then
    uses SMTP VRFY to verify the address and records the outcome.

    @param mxdata: list of (preference, host) tuples to check for
    @type mxdata: list
    @param username: the username to verify
    @type username: string
    @param domain: the mail domain of the address
    @type domain: string
    """
    smtpconnect = 0
    for preference, host in mxdata:
        try:
            log.debug(LOG_CHECK,
                "SMTP check for %r (preference %d)", host, preference)
            self.url_connection = smtplib.SMTP(timeout=self.aggregate.config["timeout"])
            if log.is_debug(LOG_CHECK):
                self.url_connection.set_debuglevel(1)
            self.url_connection.connect(host)
            log.debug(LOG_CHECK, "SMTP connected!")
            smtpconnect = 1
            self.url_connection.helo()
            mailaddress = "%s@%s" % (username, domain)
            status, info = self.url_connection.verify(mailaddress)
            log.debug(LOG_CHECK, "SMTP info %d %r", status, info)
            d = {
                'info': "%d %s" % (status, str(info)),
                'mail': mailaddress,
            }
            if status == 250:
                self.add_info(_("Verified address %(mail)s: %(info)s.") % d)
            # check for 25x status code which means that the address
            # could not be verified, but is sent anyway
            elif 250 < status < 260:
                self.add_info(_("Unverified but presumably valid"
                                " address %(mail)s: %(info)s.") % d)
            else:
                self.add_warning(_("Unverified address: %(info)s.") % d,
                                 tag=WARN_MAIL_UNVERIFIED_ADDRESS)
        except smtplib.SMTPException as msg:
            self.add_warning(
                _("MX mail host %(host)s did not accept connections: "
                  "%(error)s.") % {'host': host, 'error': str(msg)},
                tag=WARN_MAIL_NO_CONNECTION)
        if smtpconnect:
            # stop after the first host that accepted a connection
            break
    if not smtpconnect:
        self.set_result(_("Could not connect, but syntax is correct"),
                        overwrite=True)
    else:
        self.set_result(_("Found MX mail host %(host)s") % {'host': host},
                        overwrite=True)
def close_connection (self):
"""
Close a possibly opened SMTP connection.
"""
if self.url_connection is None:
# no connection is open
return
connection = self.url_connection
self.url_connection = None
try:
connection.quit()
except (smtplib.SMTPException, socket.error):
# ignore close errors
# socket.error is raised for example on timeouts
log.debug(LOG_CHECK, "MX host %r, preference %d", host, preference)
pass
self.set_result(_("Valid mail address syntax"))
def set_cache_keys (self):
"""

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,40 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Mixin class for URLs that pool connections.
"""
class PooledConnection (object):
    """Mixin providing pooled connection retrieval for URL checkers."""

    def get_pooled_connection(self, scheme, host, port, create_connection):
        """Fetch a connection for (scheme, host, port) from the
        aggregate's pool, blocking on a pool lock until one is free,
        and store it in self.url_connection."""
        pool_get = self.aggregate.connections.get
        connection = pool_get(scheme, host, port, create_connection)
        while hasattr(connection, 'acquire'):
            # The pool handed back a lock object instead of a
            # connection: wait (without polling) until another
            # connection is released by acquiring that lock.
            connection.acquire()
            # Release immediately; the next connections.get() call
            # acquires it again.
            connection.release()
            connection = pool_get(scheme, host, port, create_connection)
        self.url_connection = connection

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -20,7 +20,6 @@ Handle uncheckable URLs.
import re
from . import urlbase
from .const import WARN_IGNORE_URL
# from http://www.iana.org/assignments/uri-schemes.html
ignored_schemes_permanent = r"""
@ -124,7 +123,7 @@ ignored_schemes_other = r"""
"""
ignored_schemes = "^(%s%s%s%s):" % (
ignored_schemes = "^(%s%s%s%s)$" % (
ignored_schemes_permanent,
ignored_schemes_provisional,
ignored_schemes_historical,
@ -132,7 +131,7 @@ ignored_schemes = "^(%s%s%s%s):" % (
)
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
is_unknown_url = ignored_schemes_re.search
is_unknown_scheme = ignored_schemes_re.match
class UnknownUrl (urlbase.UrlBase):
@ -140,19 +139,16 @@ class UnknownUrl (urlbase.UrlBase):
def local_check (self):
"""Only logs that this URL is unknown."""
if self.extern[0] and self.extern[1]:
self.add_info(_("Outside of domain filter, checked only syntax."))
elif self.ignored():
self.add_warning(_("%(scheme)s URL ignored.") %
{"scheme": self.scheme.capitalize()},
tag=WARN_IGNORE_URL)
if self.ignored():
self.add_info(_("%(scheme)s URL ignored.") %
{"scheme": self.scheme.capitalize()})
else:
self.set_result(_("URL is unrecognized or has invalid syntax"),
valid=False)
def ignored (self):
"""Return True if this URL scheme is ignored."""
return ignored_schemes_re.search(self.url)
return is_unknown_scheme(self.scheme)
def can_get_content (self):
"""Unknown URLs have no content.

View file

@ -26,21 +26,19 @@ import time
import errno
import socket
import select
from cStringIO import StringIO
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
strformat, LinkCheckerError, url as urlutil, trace, clamav, winutil, geoip,
fileutil, get_link_pat)
from .. import (log, LOG_CHECK, LOG_CACHE,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat, parser)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND,
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_UNEQUAL, WARN_URL_WHITESPACE,
WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE,
WARN_URL_WHITESPACE,
WARN_URL_TOO_LONG, URL_MAX_LENGTH, URL_WARN_LENGTH,
WARN_SYNTAX_HTML, WARN_SYNTAX_CSS,
ExcList, ExcSyntaxList, ExcNoCacheList)
# helper alias
@ -71,17 +69,6 @@ def url_norm (url, encoding=None):
raise LinkCheckerError(msg)
def getXmlText (parent, tag):
    """Return the concatenated text-node content of the first <tag>
    child element of the given parent element."""
    elem = parent.getElementsByTagName(tag)[0]
    # The DOM API scatters text over multiple child nodes; join them.
    texts = [node.data for node in elem.childNodes
             if node.nodeType == node.TEXT_NODE]
    return ''.join(texts)
class UrlBase (object):
"""An URL with additional information like validity etc."""
@ -103,8 +90,8 @@ class UrlBase (object):
"text/vnd.wap.wml": "wml",
}
# Set maximum file size for downloaded files in bytes.
MaxFilesizeBytes = 1024*1024*5
# Read in 16kb chunks
ReadChunkBytes = 1024*16
def __init__ (self, base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=-1, column=-1,
@ -173,8 +160,6 @@ class UrlBase (object):
self.urlparts = None
# the scheme, host, port and anchor part of url
self.scheme = self.host = self.port = self.anchor = None
# list of parsed anchors
self.anchors = []
# the result message string and flag
self.result = u""
self.has_result = False
@ -190,8 +175,6 @@ class UrlBase (object):
self.modified = None
# download time
self.dltime = -1
# download size
self.dlsize = -1
# check time
self.checktime = 0
# connection object
@ -211,8 +194,6 @@ class UrlBase (object):
self.do_check_content = True
# MIME content type
self.content_type = None
# number of URLs in page content
self.num_urls = 0
def set_result (self, msg, valid=True, overwrite=False):
"""
@ -229,6 +210,8 @@ class UrlBase (object):
log.warn(LOG_CHECK, "Empty result for %s", self)
self.result = msg
self.valid = valid
# free content data
self.data = None
def get_title (self):
"""Return title of page the URL refers to.
@ -246,30 +229,6 @@ class UrlBase (object):
self.title = title
return self.title
def set_title_from_content (self):
    """Set the title of the page this URL refers to from its page
    content, using the TitleFinder SAX handler. No-op for invalid
    URLs or when the handler cannot be constructed."""
    if not self.valid:
        return
    try:
        handler = linkparse.TitleFinder()
    except tuple(ExcList):
        # best effort: a missing title is not an error
        return
    parser = htmlsax.parser(handler)
    handler.parser = parser
    if self.charset:
        parser.encoding = self.charset
    # parse
    try:
        parser.feed(self.get_content())
        parser.flush()
    except linkparse.StopParse as msg:
        # handler stops parsing as soon as a title was found
        log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
    # break cyclic dependencies
    handler.parser = None
    parser.handler = None
    if handler.title:
        self.title = handler.title
def is_parseable (self):
"""
Return True iff content of this url is parseable.
@ -287,15 +246,15 @@ class UrlBase (object):
return False
def is_http (self):
"""
Return True for http:// URLs.
"""
"""Return True for http:// URLs."""
return False
def is_file (self):
"""
Return True for file:// URLs.
"""
"""Return True for file:// URLs."""
return False
def is_directory(self):
"""Return True if current URL represents a directory."""
return False
def is_local(self):
@ -318,45 +277,6 @@ class UrlBase (object):
if s not in self.info:
self.info.append(s)
def copy_from_cache (self, cache_data):
    """
    Fill attributes from cache data.

    Anchor-not-found warnings are not copied when the current anchor
    differs from the cached one; in that case the anchor is rechecked.
    """
    self.url = cache_data["url"]
    self.result = cache_data["result"]
    self.has_result = True
    anchor_changed = (self.anchor != cache_data["anchor"])
    for tag, msg in cache_data["warnings"]:
        # do not copy anchor warnings, since the current anchor
        # might have changed
        if anchor_changed and tag == WARN_URL_ANCHOR_NOT_FOUND:
            continue
        self.add_warning(msg, tag=tag)
    for info in cache_data["info"]:
        self.add_info(info)
    self.valid = cache_data["valid"]
    self.dltime = cache_data["dltime"]
    self.dlsize = cache_data["dlsize"]
    self.anchors = cache_data["anchors"]
    self.content_type = cache_data["content_type"]
    if anchor_changed and self.valid:
        # recheck anchor
        self.check_anchor()
def get_cache_data (self):
    """Assemble and return the dictionary of values that should be
    stored in the URL cache for this object."""
    cache_entry = dict(
        url=self.url,
        result=self.result,
        warnings=self.warnings,
        info=self.info,
        valid=self.valid,
        dltime=self.dltime,
        dlsize=self.dlsize,
        anchors=self.anchors,
        anchor=self.anchor,
        content_type=self.get_content_type(),
    )
    return cache_entry
def set_cache_keys (self):
"""
Set keys for URL checking and content recursion.
@ -367,11 +287,7 @@ class UrlBase (object):
assert isinstance(self.cache_content_key, unicode), self
log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key)
# construct cache key
if self.aggregate.config["anchors"]:
# add anchor to cache key
self.cache_url_key = urlutil.urlunsplit(self.urlparts[:4]+[self.anchor or u""])
else:
self.cache_url_key = self.cache_content_key
self.cache_url_key = self.cache_content_key
assert isinstance(self.cache_url_key, unicode), self
log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key)
@ -442,9 +358,9 @@ class UrlBase (object):
self.url = urlutil.urlunsplit(urlparts)
# split into (modifiable) list
self.urlparts = strformat.url_unicode_split(self.url)
self.build_url_parts()
# and unsplit again
self.url = urlutil.urlunsplit(self.urlparts)
self.build_url_parts()
def build_url_parts (self):
"""Set userinfo, host, port and anchor from self.urlparts.
@ -452,22 +368,28 @@ class UrlBase (object):
"""
# check userinfo@host:port syntax
self.userinfo, host = urllib.splituser(self.urlparts[1])
# set host lowercase
if self.userinfo:
self.urlparts[1] = "%s@%s" % (self.userinfo, host.lower())
else:
self.urlparts[1] = host.lower()
# safe anchor for later checking
self.anchor = self.urlparts[4]
port = urlutil.default_ports.get(self.scheme, 0)
self.host, self.port = urlutil.splitport(host, port=port)
if self.port is None:
host, port = urlutil.splitport(host, port=port)
if port is None:
raise LinkCheckerError(_("URL host %(host)r has invalid port") %
{"host": host})
self.port = port
# set host lowercase
self.host = host.lower()
if self.scheme in scheme_requires_host:
if not self.host:
raise LinkCheckerError(_("URL has empty hostname"))
self.check_obfuscated_ip()
if not self.port or self.port == urlutil.default_ports.get(self.scheme):
host = self.host
else:
host = "%s:%d" % (self.host, self.port)
if self.userinfo:
self.urlparts[1] = "%s@%s" % (self.userinfo, host)
else:
self.urlparts[1] = host
# safe anchor for later checking
self.anchor = self.urlparts[4]
def check_obfuscated_ip (self):
"""Warn if host of this URL is obfuscated IP address."""
@ -476,9 +398,10 @@ class UrlBase (object):
if iputil.is_obfuscated_ip(self.host):
ips = iputil.resolve_host(self.host)
if ips:
self.host = ips[0]
self.add_warning(
_("URL %(url)s has obfuscated IP address %(ip)s") % \
{"url": self.base_url, "ip": ips.pop()},
{"url": self.base_url, "ip": ips[0]},
tag=WARN_URL_OBFUSCATED_IP)
def check (self):
@ -499,19 +422,6 @@ class UrlBase (object):
# close/release possible open connection
self.close_connection()
def add_country_info (self):
    """Add an informational message naming the GeoIP country of the
    host, when a host is set and the lookup yields a result."""
    host = self.host
    if not host:
        return
    country = geoip.get_country(host)
    if country:
        self.add_info(_("URL is located in %(country)s.") %
                      {"country": _(country)})
def add_size_info (self):
    """Store size of URL content from meta info into self.size.
    Must be implemented in subclasses.
    This base implementation is intentionally a no-op."""
    pass
def local_check (self):
"""Local check function can be overridden in subclasses."""
log.debug(LOG_CHECK, "Checking %s", self)
@ -524,35 +434,28 @@ class UrlBase (object):
try:
self.check_connection()
self.add_size_info()
self.add_country_info()
self.aggregate.plugin_manager.run_connection_plugins(self)
except tuple(ExcList) as exc:
value = self.handle_exception()
# make nicer error msg for unknown hosts
if isinstance(exc, socket.error) and exc.args[0] == -2:
value = _('Hostname not found')
# make nicer error msg for bad status line
elif isinstance(exc, httplib.BadStatusLine):
value = _('Bad HTTP response %(line)r') % {"line": str(value)}
elif isinstance(exc, UnicodeError):
# idna.encode(host) failed
value = _('Bad hostname %(host)r: %(msg)s') % {'host': self.host, 'msg': str(value)}
self.set_result(unicode_safe(value), valid=False)
self.checktime = time.time() - check_start
if self.do_check_content:
# check content and recursion
try:
self.check_content()
if self.valid and self.can_get_content():
self.aggregate.plugin_manager.run_content_plugins(self)
if self.allows_recursion():
self.parse_url()
# check content size
self.check_size()
parser.parse_url(self)
except tuple(ExcList):
value = self.handle_exception()
# make nicer error msg for bad status line
if isinstance(value, httplib.BadStatusLine):
value = _('Bad HTTP response %(line)r') % {"line": str(value)}
self.add_warning(_("could not get content: %(msg)s") %
{"msg": str(value)}, tag=WARN_URL_ERROR_GETTING_CONTENT)
self.checktime = time.time() - check_start
def close_connection (self):
"""
@ -595,6 +498,17 @@ class UrlBase (object):
"""
self.url_connection = urllib2.urlopen(self.url)
def add_size_info (self):
    """Warn when the reported content size exceeds the configured
    maximum download size. Should be overridden in subclasses."""
    maxbytes = self.aggregate.config["maxfilesizedownload"]
    if self.size <= maxbytes:
        return
    self.add_warning(
        _("Content size %(size)s is larger than %(maxbytes)s.") %
        dict(size=strformat.strsize(self.size),
             maxbytes=strformat.strsize(maxbytes)),
        tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
def allows_recursion (self):
"""
Return True iff we can recurse into the url's content.
@ -617,6 +531,9 @@ class UrlBase (object):
if self.extern[0]:
log.debug(LOG_CHECK, "... no, extern.")
return False
if self.size > self.aggregate.config["maxfilesizeparse"]:
log.debug(LOG_CHECK, "... no, maximum parse size.")
return False
if not self.content_allows_robots():
log.debug(LOG_CHECK, "... no, robots.")
return False
@ -628,6 +545,7 @@ class UrlBase (object):
Return False if the content of this URL forbids robots to
search for recursive links.
"""
# XXX cleanup
if not self.is_html():
return True
if not (self.is_http() or self.is_file()):
@ -644,63 +562,12 @@ class UrlBase (object):
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
# break cyclic dependencies
handler.parser = None
parser.handler = None
return handler.follow
def get_anchors (self):
"""Store anchors for this URL. Precondition: this URL is
an HTML resource."""
log.debug(LOG_CHECK, "Getting HTML anchors %s", self)
self.find_links(self.add_anchor, tags=linkparse.AnchorTags)
def find_links (self, callback, tags=None):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
# construct parser object
handler = linkparse.LinkFinder(callback, tags=tags)
parser = htmlsax.parser(handler)
if self.charset:
parser.encoding = self.charset
handler.parser = parser
# parse
try:
parser.feed(self.get_content())
parser.flush()
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
# break cyclic dependencies
handler.parser = None
parser.handler = None
def add_anchor (self, url, line, column, name, base):
"""Add anchor URL."""
self.anchors.append((url, line, column, name, base))
def check_anchor (self):
"""If URL is valid, parseable and has an anchor, check it.
A warning is logged and True is returned if the anchor is not found.
"""
if not (self.anchor and self.aggregate.config["anchors"] and
self.valid and self.is_html()):
return
log.debug(LOG_CHECK, "checking anchor %r in %s", self.anchor, self.anchors)
enc = lambda anchor: urlutil.url_quote_part(anchor, encoding=self.encoding)
if any(x for x in self.anchors if enc(x[0]) == self.anchor):
return
if self.anchors:
anchornames = sorted(set(u"`%s'" % x[0] for x in self.anchors))
anchors = u", ".join(anchornames)
else:
anchors = u"-"
args = {"name": self.anchor, "anchors": anchors}
msg = u"%s %s" % (_("Anchor `%(name)s' not found.") % args,
_("Available anchors: %(anchors)s.") % args)
self.add_warning(msg, tag=WARN_URL_ANCHOR_NOT_FOUND)
return True
def set_extern (self, url):
"""
Match URL against extern and intern link patterns. If no pattern
@ -728,9 +595,15 @@ class UrlBase (object):
log.debug(LOG_CHECK, "Intern URL %r", url)
self.extern = (0, 0)
return
log.debug(LOG_CHECK, "Explicit extern URL %r", url)
self.extern = (1, 0)
return
if self.aggregate.config['checkextern']:
self.extern = (1, 0)
else:
self.extern = (1, 1)
if self.extern[0] and self.extern[1]:
self.add_info(_("The URL is outside of the domain "
"filter, checked only syntax."))
if not self.has_result:
self.set_result(_("filtered"))
def get_content_type (self):
"""Return content MIME type or empty string.
@ -741,188 +614,35 @@ class UrlBase (object):
def can_get_content (self):
"""Indicate wether url get_content() can be called."""
return True
return self.size <= self.aggregate.config["maxfilesizedownload"]
def get_content (self):
"""Precondition: url_connection is an opened URL."""
if self.data is None:
log.debug(LOG_CHECK, "Get content of %r", self.url)
t = time.time()
self.data, self.dlsize = self.read_content()
self.data = self.read_content()
self.size = len(self.data)
self.dltime = time.time() - t
if self.size == 0:
self.add_warning(_("Content size is zero."),
tag=WARN_URL_CONTENT_SIZE_ZERO)
return self.data
def read_content (self):
"""Return data and data size for this URL.
Can be overridden in subclasses."""
if self.size > self.MaxFilesizeBytes:
raise LinkCheckerError(_("File size too large"))
data = self.url_connection.read(self.MaxFilesizeBytes+1)
if len(data) > self.MaxFilesizeBytes:
raise LinkCheckerError(_("File size too large"))
if not self.is_local():
self.aggregate.add_download_data(self.cache_content_key, data)
return data, len(data)
def read_content(self):
"""Return data for this URL. Can be overridden in subclasses."""
buf = StringIO()
data = self.read_content_chunk()
while data:
if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]:
raise LinkCheckerError(_("File size too large"))
buf.write(data)
data = self.read_content_chunk()
return buf.getvalue()
def check_content (self):
"""Check content data for warnings, syntax errors, viruses etc."""
if not (self.valid and self.can_get_content()):
return
if self.is_html():
self.set_title_from_content()
if self.aggregate.config["anchors"]:
self.get_anchors()
self.check_anchor()
self.check_warningregex()
# is it an intern URL?
if not self.extern[0]:
# check HTML/CSS syntax
if self.aggregate.config["checkhtml"] and self.is_html():
self.check_html()
if self.aggregate.config["checkcss"] and self.is_css():
self.check_css()
# check with clamav
if self.aggregate.config["scanvirus"]:
self.scan_virus()
def check_warningregex (self):
"""Check if content matches a given regular expression."""
config = self.aggregate.config
warningregex = config["warningregex"]
if not (warningregex and self.valid and self.is_parseable()):
return
log.debug(LOG_CHECK, "checking content for warning regex")
try:
content = self.get_content()
curpos = 0
curline = 1
# add warnings for found matches, up to the maximum allowed number
for num, match in enumerate(warningregex.finditer(content)):
# calculate line number for match
curline += content.count('\n', curpos, match.start())
curpos = match.start()
# add a warning message
msg = _("Found %(match)r at line %(line)d in link contents.")
self.add_warning(msg %
{"match": match.group(), "line": curline},
tag=WARN_URL_WARNREGEX_FOUND)
# check for maximum number of warnings
if num >= config["warningregex_max"]:
break
except tuple(ExcList):
value = self.handle_exception()
self.set_result(unicode_safe(value), valid=False)
def check_size (self):
"""Check content size if it is zero or larger than a given
maximum size.
"""
if self.dlsize == 0:
self.add_warning(_("Content size is zero."),
tag=WARN_URL_CONTENT_SIZE_ZERO)
else:
maxbytes = self.aggregate.config["warnsizebytes"]
if maxbytes is not None and self.dlsize >= maxbytes:
self.add_warning(
_("Content size %(dlsize)s is larger than %(maxbytes)s.") %
{"dlsize": strformat.strsize(self.dlsize),
"maxbytes": strformat.strsize(maxbytes)},
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
if self.size != -1 and self.dlsize != -1 and self.dlsize != self.size:
self.add_warning(_("Download size (%(dlsize)d Byte) "
"does not equal content size (%(size)d Byte).") %
{"dlsize": self.dlsize,
"size": self.size},
tag=WARN_URL_CONTENT_SIZE_UNEQUAL)
def check_w3_errors (self, xml, w3type):
"""Add warnings for W3C HTML or CSS errors in xml format.
w3type is either "W3C HTML" or "W3C CSS"."""
from xml.dom.minidom import parseString
dom = parseString(xml)
for error in dom.getElementsByTagName('m:error'):
warnmsg = _("%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s")
attrs = {
"w3type": w3type,
"line": getXmlText(error, "m:line"),
"column": getXmlText(error, "m:col"),
"msg": getXmlText(error, "m:message"),
}
tag = WARN_SYNTAX_HTML if w3type == "W3C HTML" else WARN_SYNTAX_CSS
self.add_warning(warnmsg % attrs, tag=tag)
def check_html (self):
"""Check HTML syntax of this page (which is supposed to be HTML)
with the online W3C HTML validator documented at
http://validator.w3.org/docs/api.html
"""
self.aggregate.check_w3_time()
try:
body = {'fragment': self.get_content(), 'output': 'soap12'}
data = urllib.urlencode(body)
u = urllib2.urlopen('http://validator.w3.org/check', data)
if u.headers.get('x-w3c-validator-status', 'Invalid') == 'Valid':
self.add_info(u"W3C Validator: %s" % _("valid HTML syntax"))
return
self.check_w3_errors(u.read(), "W3C HTML")
except Exception:
# catch _all_ exceptions since we dont want third party module
# errors to propagate into this library
err = str(sys.exc_info()[1])
log.warn(LOG_CHECK,
_("HTML W3C validation caused error: %(msg)s ") %
{"msg": err})
def check_css (self):
"""Check CSS syntax of this page (which is supposed to be CSS)
with the online W3C CSS validator documented at
http://jigsaw.w3.org/css-validator/manual.html#expert
"""
self.aggregate.check_w3_time()
try:
host = 'jigsaw.w3.org'
path = '/css-validator/validator'
params = {
'text': "div {}",
'warning': '2',
'output': 'soap12',
}
fields = params.items()
content_type, body = httputil.encode_multipart_formdata(fields)
h = httplib.HTTPConnection(host)
h.putrequest('POST', path)
h.putheader('Content-Type', content_type)
h.putheader('Content-Length', str(len(body)))
h.endheaders()
h.send(body)
r = h.getresponse(True)
if r.getheader('X-W3C-Validator-Status', 'Invalid') == 'Valid':
self.add_info(u"W3C Validator: %s" % _("valid CSS syntax"))
return
self.check_w3_errors(r.read(), "W3C HTML")
except Exception:
# catch _all_ exceptions since we dont want third party module
# errors to propagate into this library
err = str(sys.exc_info()[1])
log.warn(LOG_CHECK,
_("CSS W3C validation caused error: %(msg)s ") %
{"msg": err})
def scan_virus (self):
"""Scan content for viruses."""
infected, errors = clamav.scan(self.get_content())
for msg in infected:
self.add_warning(u"Virus scan infection: %s" % msg)
for msg in errors:
self.add_warning(u"Virus scan error: %s" % msg)
def parse_url (self):
"""
Parse url content and search for recursive links.
Default parse type is html.
"""
self.parse_html()
self.add_num_url_info()
def read_content_chunk(self):
"""Read one chunk of content from this URL."""
return self.url_connection.read(self.ReadChunkBytes)
def get_user_password (self):
"""Get tuple (user, password) from configured authentication.
@ -933,16 +653,8 @@ class UrlBase (object):
return urllib.splitpasswd(self.userinfo)
return self.aggregate.config.get_user_password(self.url)
def parse_html (self):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
log.debug(LOG_CHECK, "Parsing HTML %s", self)
self.find_links(self.add_url)
def add_url (self, url, line=0, column=0, name=u"", base=None):
"""Queue URL data for checking."""
self.num_urls += 1
if base:
base_ref = urlutil.url_norm(base)[0]
else:
@ -954,108 +666,6 @@ class UrlBase (object):
# Only queue URLs which have a result or are not strict extern.
self.aggregate.urlqueue.put(url_data)
def add_num_url_info(self):
"""Add number of URLs parsed to info."""
if self.num_urls > 0:
attrs = {"num": self.num_urls}
msg = _n("%(num)d URL parsed.", "%(num)d URLs parsed.", self.num_urls)
self.add_info(msg % attrs)
def parse_opera (self):
"""Parse an opera bookmark file."""
log.debug(LOG_CHECK, "Parsing Opera bookmarks %s", self)
from ..bookmarks.opera import parse_bookmark_data
for url, name, lineno in parse_bookmark_data(self.get_content()):
self.add_url(url, line=lineno, name=name)
def parse_chromium (self):
"""Parse a Chromium or Google Chrome bookmark file."""
log.debug(LOG_CHECK, "Parsing Chromium bookmarks %s", self)
from ..bookmarks.chromium import parse_bookmark_data
for url, name in parse_bookmark_data(self.get_content()):
self.add_url(url, name=name)
def parse_safari (self):
"""Parse a Safari bookmark file."""
log.debug(LOG_CHECK, "Parsing Safari bookmarks %s", self)
from ..bookmarks.safari import parse_bookmark_data
for url, name in parse_bookmark_data(self.get_content()):
self.add_url(url, name=name)
def parse_text (self):
"""Parse a text file with one url per line; comment and blank
lines are ignored."""
log.debug(LOG_CHECK, "Parsing text %s", self)
lineno = 0
for line in self.get_content().splitlines():
lineno += 1
line = line.strip()
if not line or line.startswith('#'):
continue
self.add_url(line, line=lineno)
def parse_css (self):
"""
Parse a CSS file for url() patterns.
"""
log.debug(LOG_CHECK, "Parsing CSS %s", self)
lineno = 0
linkfinder = linkparse.css_url_re.finditer
strip_comments = linkparse.strip_c_comments
for line in strip_comments(self.get_content()).splitlines():
lineno += 1
for mo in linkfinder(line):
column = mo.start("url")
url = strformat.unquote(mo.group("url").strip())
self.add_url(url, line=lineno, column=column)
def parse_swf (self):
"""Parse a SWF file for URLs."""
linkfinder = linkparse.swf_url_re.finditer
for mo in linkfinder(self.get_content()):
url = mo.group()
self.add_url(url)
def parse_word (self):
"""Parse a word file for hyperlinks."""
if not winutil.has_word():
return
filename = self.get_temp_filename()
# open word file and parse hyperlinks
try:
app = winutil.get_word_app()
try:
doc = winutil.open_wordfile(app, filename)
if doc is None:
raise winutil.Error("could not open word file %r" % filename)
try:
for link in doc.Hyperlinks:
self.add_url(link.Address, name=link.TextToDisplay)
finally:
winutil.close_wordfile(doc)
finally:
winutil.close_word_app(app)
except winutil.Error, msg:
log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
def parse_wml (self):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
log.debug(LOG_CHECK, "Parsing WML %s", self)
self.find_links(self.add_url, tags=linkparse.WmlTags)
def get_temp_filename (self):
"""Get temporary filename for content to parse."""
# store content in temporary file
fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc',
prefix='lc_')
try:
fd.write(self.get_content())
finally:
fd.close()
return filename
def serialized (self, sep=os.linesep):
"""
Return serialized url check data as unicode string.
@ -1103,7 +713,7 @@ class UrlBase (object):
if pat:
log.debug(LOG_CHECK, "Add intern pattern %r", pat)
self.aggregate.config['internlinks'].append(get_link_pat(pat))
except UnicodeError, msg:
except UnicodeError as msg:
res = _("URL has unparsable domain name: %(domain)s") % \
{"domain": msg}
self.set_result(res, valid=False)
@ -1151,7 +761,7 @@ class UrlBase (object):
Number of seconds needed to check this link, default: zero.
- url_data.dltime: int
Number of seconds needed to download URL content, default: -1
- url_data.dlsize: int
- url_data.size: int
Size of downloaded URL content, default: -1
- url_data.info: list of unicode
Additional information about this URL.
@ -1181,7 +791,7 @@ class UrlBase (object):
domain=(self.urlparts[1] if self.urlparts else u""),
checktime=self.checktime,
dltime=self.dltime,
dlsize=self.dlsize,
size=self.size,
info=self.info,
line=self.line,
column=self.column,
@ -1211,7 +821,7 @@ urlDataAttr = [
'domain',
'checktime',
'dltime',
'dlsize',
'size',
'info',
'modified',
'line',

View file

@ -20,7 +20,7 @@ Utility functions suitable for command line clients.
from __future__ import print_function
import sys
import argparse
from . import checker, fileutil, strformat
from . import checker, fileutil, strformat, plugins
from .director import console
@ -42,6 +42,19 @@ def print_version(exit_code=0):
sys.exit(exit_code)
def print_plugins(folders, exit_code=0):
"""Print available plugins and exit."""
modules = plugins.get_plugin_modules(folders)
pluginclasses = sorted(plugins.get_plugin_classes(modules), key=lambda x: x.__name__)
for pluginclass in pluginclasses:
print(pluginclass.__name__)
doc = strformat.wrap(pluginclass.__doc__, 80)
print(strformat.indent(doc))
print()
sys.exit(exit_code)
def print_usage (msg, exit_code=2):
"""Print a program msg text to stderr and exit."""
program = sys.argv[0]

View file

@ -27,7 +27,7 @@ import urlparse
import shutil
import socket
import _LinkChecker_configdata as configdata
from .. import (log, LOG_CHECK, LOG_ROOT, ansicolor, lognames, clamav,
from .. import (log, LOG_CHECK, LOG_ROOT, ansicolor, lognames,
get_config_dir, fileutil, configdict)
from . import confparse
from ..decorators import memoized
@ -75,6 +75,9 @@ Modules = (
def get_modules_info ():
"""Return list of unicode strings with detected module info."""
lines = []
# requests
import requests
lines.append(u"Requests: %s" % requests.__version__)
# PyQt
try:
from PyQt4 import QtCore
@ -129,53 +132,48 @@ class Configuration (dict):
Initialize the default options.
"""
super(Configuration, self).__init__()
self['trace'] = False
self["verbose"] = False
self["complete"] = False
self["warnings"] = True
self["ignorewarnings"] = []
self['quiet'] = False
self["anchors"] = False
self["externlinks"] = []
self["internlinks"] = []
# on ftp, password is set by Pythons ftplib
## checking options
self["allowedschemes"] = []
self['cookiefile'] = None
self["debugmemory"] = False
self["localwebroot"] = None
self["maxfilesizeparse"] = 1*1024*1024
self["maxfilesizedownload"] = 5*1024*1024
self["maxnumurls"] = None
self["maxrunseconds"] = None
self["maxrequestspersecond"] = 10
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
self["proxy"] = urllib.getproxies()
self["sslverify"] = True
self["threads"] = 100
self["timeout"] = 60
self["aborttimeout"] = 300
self["recursionlevel"] = -1
self["useragent"] = UserAgent
## authentication
self["authentication"] = []
self["loginurl"] = None
self["loginuserfield"] = "login"
self["loginpasswordfield"] = "password"
self["loginextrafields"] = {}
self["proxy"] = urllib.getproxies()
self["recursionlevel"] = -1
self["wait"] = 0
self['sendcookies'] = False
self['storecookies'] = False
self['cookiefile'] = None
self["status"] = False
self["status_wait_seconds"] = 5
## filtering
self["externlinks"] = []
self["ignorewarnings"] = []
self["internlinks"] = []
self["checkextern"] = False
## plugins
self["pluginfolders"] = get_plugin_folders()
self["enabledplugins"] = []
## output
self['trace'] = False
self['quiet'] = False
self["verbose"] = False
self["warnings"] = True
self["fileoutput"] = []
self['output'] = 'text'
self["status"] = False
self["status_wait_seconds"] = 5
self['logger'] = None
self["warningregex"] = None
self["warningregex_max"] = 5
self["warnsizebytes"] = None
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
self["threads"] = 100
# socket timeout in seconds
self["timeout"] = 60
self["checkhtml"] = False
self["checkcss"] = False
self["scanvirus"] = False
self["clamavconf"] = clamav.canonical_clamav_conf()
self["useragent"] = UserAgent
self["debugmemory"] = False
self["localwebroot"] = None
self["sslverify"] = True
self["warnsslcertdaysvalid"] = 14
self["maxrunseconds"] = None
self["maxnumurls"] = None
self["maxconnectionshttp"] = 10
self["maxconnectionshttps"] = 10
self["maxconnectionsftp"] = 2
self.loggers = {}
from ..logger import LoggerClasses
for c in LoggerClasses:
@ -302,29 +300,15 @@ class Configuration (dict):
def sanitize (self):
"Make sure the configuration is consistent."
if self["anchors"]:
self.sanitize_anchors()
if self['logger'] is None:
self.sanitize_logger()
if self['scanvirus']:
self.sanitize_scanvirus()
if self['storecookies'] or self['cookiefile']:
self.sanitize_cookies()
if self['loginurl']:
self.sanitize_loginurl()
self.sanitize_proxies()
self.sanitize_plugins()
# set default socket timeout
socket.setdefaulttimeout(self['timeout'])
def sanitize_anchors (self):
"""Make anchor configuration consistent."""
if not self["warnings"]:
self["warnings"] = True
from ..checker.const import Warnings
self["ignorewarnings"] = Warnings.keys()
if 'url-anchor-not-found' in self["ignorewarnings"]:
self["ignorewarnings"].remove('url-anchor-not-found')
def sanitize_logger (self):
"""Make logger configuration consistent."""
if not self['output']:
@ -332,24 +316,6 @@ class Configuration (dict):
self['output'] = 'text'
self['logger'] = self.logger_new(self['output'])
def sanitize_scanvirus (self):
"""Ensure clamav is installed for virus checking."""
try:
clamav.init_clamav_conf(self['clamavconf'])
except clamav.ClamavError:
log.warn(LOG_CHECK,
_("Clamav could not be initialized"))
self['scanvirus'] = False
def sanitize_cookies (self):
"""Make cookie configuration consistent."""
if not self['sendcookies']:
log.warn(LOG_CHECK, _("activating sendcookies."))
self['sendcookies'] = True
if not self['storecookies']:
log.warn(LOG_CHECK, _("activating storecookies."))
self['storecookies'] = True
def sanitize_loginurl (self):
"""Make login configuration consistent."""
url = self["loginurl"]
@ -377,9 +343,6 @@ class Configuration (dict):
log.warn(LOG_CHECK,
_("disabling login URL %(url)s.") % {"url": url})
self["loginurl"] = None
elif not self['storecookies']:
# login URL implies storing and sending cookies
self['storecookies'] = self['sendcookies'] = True
def sanitize_proxies (self):
"""Try to read additional proxy settings which urllib does not
@ -395,6 +358,39 @@ class Configuration (dict):
if ftp_proxy:
self["proxy"]["ftp"] = ftp_proxy
def sanitize_plugins(self):
"""Ensure each plugin is configurable."""
for plugin in self["enabledplugins"]:
if plugin not in self:
self[plugin] = {}
def get_plugin_folders():
"""Get linkchecker plugin folders. Default is ~/.linkchecker/plugins/."""
folders = []
defaultfolder = normpath("~/.linkchecker/plugins")
if not os.path.exists(defaultfolder) and not Portable:
try:
make_userdir(defaultfolder)
except StandardError as errmsg:
msg = _("could not create plugin directory %(dirname)r: %(errmsg)r")
args = dict(dirname=defaultfolder, errmsg=errmsg)
log.warn(LOG_CHECK, msg % args)
if os.path.exists(defaultfolder):
folders.append(defaultfolder)
return folders
def make_userdir(child):
"""Create a child directory."""
userdir = os.path.dirname(child)
if not os.path.isdir(userdir):
if os.name == 'nt':
# Windows forbids filenames with leading dot unless
# a trailing dot is added.
userdir += "."
os.mkdir(userdir, 0700)
def get_user_config():
"""Get the user configuration filename.
@ -413,13 +409,7 @@ def get_user_config():
not Portable:
# copy the initial configuration to the user configuration
try:
userdir = os.path.dirname(userconf)
if not os.path.isdir(userdir):
if os.name == 'nt':
# Windows forbids filenames with leading dot unless
# a trailing dot is added.
userdir += "."
os.mkdir(userdir, 0700)
make_userdir(userconf)
shutil.copy(initialconf, userconf)
except StandardError as errmsg:
msg = _("could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r")
@ -445,6 +435,7 @@ def get_gconf_http_proxy ():
return "%s:%d" % (host, port)
except StandardError as msg:
log.debug(LOG_CHECK, "error getting HTTP proxy from gconf: %s", msg)
pass
return None
@ -464,6 +455,7 @@ def get_gconf_ftp_proxy ():
return "%s:%d" % (host, port)
except StandardError as msg:
log.debug(LOG_CHECK, "error getting FTP proxy from gconf: %s", msg)
pass
return None
@ -478,6 +470,7 @@ def get_kde_http_proxy ():
return data.get("http_proxy")
except StandardError as msg:
log.debug(LOG_CHECK, "error getting HTTP proxy from KDE: %s", msg)
pass
def get_kde_ftp_proxy ():
@ -491,6 +484,7 @@ def get_kde_ftp_proxy ():
return data.get("ftp_proxy")
except StandardError as msg:
log.debug(LOG_CHECK, "error getting FTP proxy from KDE: %s", msg)
pass
# The following KDE functions are largely ported and ajusted from
# Google Chromium:

View file

@ -17,9 +17,8 @@
"""Parse configuration files"""
import ConfigParser
import re
import os
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins
def read_multiline (value):
@ -53,16 +52,17 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
failed_files = set(files) - set(self.read_ok)
log.warn(LOG_CHECK, "Could not read configuration files %s.", failed_files)
# Read all the configuration parameters from the given files.
self.read_output_config()
self.read_checking_config()
self.read_authentication_config()
self.read_filtering_config()
self.read_output_config()
self.read_plugin_config()
except Exception as msg:
raise LinkCheckerError(
_("Error parsing configuration: %s") % unicode(msg))
def read_string_option (self, section, option, allowempty=False):
"""Read a sring option."""
"""Read a string option."""
if self.has_option(section, option):
value = self.get(section, option)
if not allowempty and not value:
@ -106,11 +106,6 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
if self.getboolean(section, "verbose"):
self.config["verbose"] = True
self.config["warnings"] = True
if self.has_option(section, "complete"):
if self.getboolean(section, "complete"):
self.config["complete"] = True
self.config["verbose"] = True
self.config["warnings"] = True
if self.has_option(section, "quiet"):
if self.getboolean(section, "quiet"):
self.config['output'] = 'none'
@ -141,37 +136,24 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
self.read_int_option(section, "threads", min=-1)
self.config['threads'] = max(0, self.config['threads'])
self.read_int_option(section, "timeout", min=1)
self.read_boolean_option(section, "anchors")
self.read_int_option(section, "aborttimeout", min=1)
self.read_int_option(section, "recursionlevel", min=-1)
if self.has_option(section, "warningregex"):
val = self.get(section, "warningregex")
if val:
self.config["warningregex"] = re.compile(val)
self.read_int_option(section, "warnsizebytes", min=1)
self.read_string_option(section, "nntpserver")
self.read_string_option(section, "useragent")
self.read_int_option(section, "pause", key="wait", min=0)
for name in ("http", "https", "ftp"):
self.read_int_option(section, "maxconnections%s" % name, min=1)
self.read_check_options(section)
def read_check_options (self, section):
"""Read check* options."""
self.read_boolean_option(section, "checkhtml")
self.read_boolean_option(section, "checkcss")
self.read_boolean_option(section, "scanvirus")
self.read_boolean_option(section, "clamavconf")
self.read_int_option(section, "maxrequestspersecond", min=1)
self.read_int_option(section, "maxnumurls", min=0)
self.read_int_option(section, "maxfilesizeparse", min=1)
self.read_int_option(section, "maxfilesizedownload", min=1)
if self.has_option(section, "allowedschemes"):
self.config['allowedschemes'] = [x.strip().lower() for x in \
self.get(section, 'allowedschemes').split(',')]
self.read_boolean_option(section, "debugmemory")
if self.has_option(section, "cookies"):
self.config["sendcookies"] = self.config["storecookies"] = \
self.getboolean(section, "cookies")
self.read_string_option(section, "cookiefile")
self.read_string_option(section, "localwebroot")
try:
self.read_boolean_option(section, "sslverify")
except ValueError:
self.read_string_option(section, "sslverify")
self.read_int_option(section, "warnsslcertdaysvalid", min=1)
self.read_int_option(section, "maxrunseconds", min=0)
def read_authentication_config (self):
@ -198,7 +180,6 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
raise LinkCheckerError(_("invalid login URL `%s'. Only " \
"HTTP and HTTPS URLs are supported.") % val)
self.config["loginurl"] = val
self.config["storecookies"] = self.config["sendcookies"] = True
self.read_string_option(section, "loginuserfield")
self.read_string_option(section, "loginpasswordfield")
# read login extra fields
@ -231,7 +212,7 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
"""
section = "filtering"
if self.has_option(section, "ignorewarnings"):
self.config['ignorewarnings'] = [f.strip() for f in \
self.config['ignorewarnings'] = [f.strip().lower() for f in \
self.get(section, 'ignorewarnings').split(',')]
if self.has_option(section, "ignore"):
for line in read_multiline(self.get(section, "ignore")):
@ -244,3 +225,14 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
if self.has_option(section, "internlinks"):
pat = get_link_pat(self.get(section, "internlinks"))
self.config["internlinks"].append(pat)
self.read_boolean_option(section, "checkextern")
def read_plugin_config(self):
"""Read plugin-specific configuration values."""
folders = self.config["pluginfolders"]
modules = plugins.get_plugin_modules(folders)
for pluginclass in plugins.get_plugin_classes(modules):
section = pluginclass.__name__
if self.has_section(section):
self.config["enabledplugins"].append(section)
self.config[section] = pluginclass.read_config(self)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2012 Bastian Kleineidam
# Copyright (C) 2004-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -15,510 +15,13 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Parsing and storing of cookies. See [1]RFC 2965 and [2]RFC 2109.
The reason for this module is that neither the cookielib nor the Cookie
modules included in the Python standard library provide a usable interface
for programmable cookie handling.
This module provides parsing of cookies for all formats specified by
the above RFCs, plus smart methods handling data conversion and formatting.
And a cookie storage class is provided.
[1] http://www.faqs.org/rfcs/rfc2965.html
[2] http://www.faqs.org/rfcs/rfc2109.html
Parsing of cookies.
"""
import time
import string
import re
import cookielib
import httplib
import requests
from cStringIO import StringIO
from . import strformat
_nulljoin = ''.join
_semispacejoin = '; '.join
_spacejoin = ' '.join
class CookieError (StandardError):
"""Thrown for invalid cookie syntax or conflicting/impossible values."""
pass
_LegalChars = string.ascii_letters + string.digits + "!#$%&'*+-.^_`|~:"
_Translator = {
'\000' : '\\000', '\001' : '\\001', '\002' : '\\002',
'\003' : '\\003', '\004' : '\\004', '\005' : '\\005',
'\006' : '\\006', '\007' : '\\007', '\010' : '\\010',
'\011' : '\\011', '\012' : '\\012', '\013' : '\\013',
'\014' : '\\014', '\015' : '\\015', '\016' : '\\016',
'\017' : '\\017', '\020' : '\\020', '\021' : '\\021',
'\022' : '\\022', '\023' : '\\023', '\024' : '\\024',
'\025' : '\\025', '\026' : '\\026', '\027' : '\\027',
'\030' : '\\030', '\031' : '\\031', '\032' : '\\032',
'\033' : '\\033', '\034' : '\\034', '\035' : '\\035',
'\036' : '\\036', '\037' : '\\037',
# Because of the way browsers really handle cookies (as opposed
# to what the RFC says) we also encode , and ;
',' : '\\054', ';' : '\\073',
'"' : '\\"', '\\' : '\\\\',
'\177' : '\\177', '\200' : '\\200', '\201' : '\\201',
'\202' : '\\202', '\203' : '\\203', '\204' : '\\204',
'\205' : '\\205', '\206' : '\\206', '\207' : '\\207',
'\210' : '\\210', '\211' : '\\211', '\212' : '\\212',
'\213' : '\\213', '\214' : '\\214', '\215' : '\\215',
'\216' : '\\216', '\217' : '\\217', '\220' : '\\220',
'\221' : '\\221', '\222' : '\\222', '\223' : '\\223',
'\224' : '\\224', '\225' : '\\225', '\226' : '\\226',
'\227' : '\\227', '\230' : '\\230', '\231' : '\\231',
'\232' : '\\232', '\233' : '\\233', '\234' : '\\234',
'\235' : '\\235', '\236' : '\\236', '\237' : '\\237',
'\240' : '\\240', '\241' : '\\241', '\242' : '\\242',
'\243' : '\\243', '\244' : '\\244', '\245' : '\\245',
'\246' : '\\246', '\247' : '\\247', '\250' : '\\250',
'\251' : '\\251', '\252' : '\\252', '\253' : '\\253',
'\254' : '\\254', '\255' : '\\255', '\256' : '\\256',
'\257' : '\\257', '\260' : '\\260', '\261' : '\\261',
'\262' : '\\262', '\263' : '\\263', '\264' : '\\264',
'\265' : '\\265', '\266' : '\\266', '\267' : '\\267',
'\270' : '\\270', '\271' : '\\271', '\272' : '\\272',
'\273' : '\\273', '\274' : '\\274', '\275' : '\\275',
'\276' : '\\276', '\277' : '\\277', '\300' : '\\300',
'\301' : '\\301', '\302' : '\\302', '\303' : '\\303',
'\304' : '\\304', '\305' : '\\305', '\306' : '\\306',
'\307' : '\\307', '\310' : '\\310', '\311' : '\\311',
'\312' : '\\312', '\313' : '\\313', '\314' : '\\314',
'\315' : '\\315', '\316' : '\\316', '\317' : '\\317',
'\320' : '\\320', '\321' : '\\321', '\322' : '\\322',
'\323' : '\\323', '\324' : '\\324', '\325' : '\\325',
'\326' : '\\326', '\327' : '\\327', '\330' : '\\330',
'\331' : '\\331', '\332' : '\\332', '\333' : '\\333',
'\334' : '\\334', '\335' : '\\335', '\336' : '\\336',
'\337' : '\\337', '\340' : '\\340', '\341' : '\\341',
'\342' : '\\342', '\343' : '\\343', '\344' : '\\344',
'\345' : '\\345', '\346' : '\\346', '\347' : '\\347',
'\350' : '\\350', '\351' : '\\351', '\352' : '\\352',
'\353' : '\\353', '\354' : '\\354', '\355' : '\\355',
'\356' : '\\356', '\357' : '\\357', '\360' : '\\360',
'\361' : '\\361', '\362' : '\\362', '\363' : '\\363',
'\364' : '\\364', '\365' : '\\365', '\366' : '\\366',
'\367' : '\\367', '\370' : '\\370', '\371' : '\\371',
'\372' : '\\372', '\373' : '\\373', '\374' : '\\374',
'\375' : '\\375', '\376' : '\\376', '\377' : '\\377'
}
def quote(str, LegalChars=_LegalChars):
    r"""Return a cookie-header-safe representation of a string.

    A string consisting only of characters from LegalChars is returned
    unchanged.  Anything else is wrapped in double quotes, with special
    characters backslash-escaped via the module translation table.
    """
    needs_quoting = any(c not in LegalChars for c in str)
    if not needs_quoting:
        return str
    escaped = _nulljoin(_Translator.get(ch, ch) for ch in str)
    return '"%s"' % escaped
# \012-style octal escape inside a quoted cookie value
_OctalPatt = re.compile(r"\\[0-3][0-7][0-7]")
# \x-style single-character escape
_QuotePatt = re.compile(r"[\\].")


def unquote(str):
    """Remove string quoting.

    Strips the surrounding double quotes and decodes backslash escapes:
    octal sequences like \\012 and quoted characters like \\" (see
    RFC 2109).  Strings that are not double-quoted are returned as-is.
    """
    # Too short to be quoted, or not wrapped in double quotes: no
    # special characters possible, return unchanged.
    if len(str) < 2 or str[0] != '"' or str[-1] != '"':
        return str
    # Work on the contents between the quotes.
    inner = str[1:-1]
    pieces = []
    pos = 0
    length = len(inner)
    while 0 <= pos < length:
        octal = _OctalPatt.search(inner, pos)
        quoted = _QuotePatt.search(inner, pos)
        if not octal and not quoted:
            # No remaining escape sequences; keep the rest verbatim.
            pieces.append(inner[pos:])
            break
        oct_start = octal.start(0) if octal else -1
        quo_start = quoted.start(0) if quoted else -1
        if quoted and (not octal or quo_start < oct_start):
            # \x --> x
            pieces.append(inner[pos:quo_start])
            pieces.append(inner[quo_start + 1])
            pos = quo_start + 2
        else:
            # \012 --> chr(0o12)
            pieces.append(inner[pos:oct_start])
            pieces.append(chr(int(inner[oct_start + 1:oct_start + 4], 8)))
            pos = oct_start + 4
    return _nulljoin(pieces)
# Predicate: string contains letter/digit, then a dot, then a letter
# (used to recognize dotted domain names).
has_embedded_dot = re.compile(r"[a-zA-Z0-9]\.[a-zA-Z]").search
# Pattern for finding cookie snatched from Pythons Cookie.py
# Modification: allow whitespace in values.
_LegalCharsPatt = r"[\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=]"
# Verbose (?x) regex matching one key[=value] pair of a cookie header.
# NOTE: the "#" comments inside the triple-quoted strings below are
# regex-mode comments and part of the pattern string itself.
_CookiePattern = re.compile(r"""
(?x) # This is a verbose pattern
(?P<key> # Start of group 'key'
""" + _LegalCharsPatt + r"""+? # Any word of at least one letter
) # End of group 'key'
( # Optional group: there may not be a value.
\s*=\s* # Equal Sign
(?P<val> # Start of group 'val'
"(?:[^\\"]|\\.)*" # Any doublequoted string
| # or
\w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr
| # or
""" + _LegalCharsPatt + r"""* # Any word or empty string
) # End of group 'val'
)? # End of optional value group
\s* # Any number of spaces.
(\s+|;|$) # Ending either at space, semicolon, or EOS.
""")
class HttpCookie (object):
"""A cookie consists of one name-value pair with attributes.
Each attribute consists of a predefined name (see attribute_names)
and a value (which is optional for some attributes)."""
# A mapping from the lowercase variant on the left to the
# appropriate traditional formatting on the right.
attribute_names = {
# Old Netscape attribute
"expires": "expires",
# Defined by RFC 2109
"path": "Path",
"comment": "Comment",
"domain": "Domain",
"max-age": "Max-Age",
"secure": "secure",
"version": "Version",
# Additional attributes defined by RFC 2965
"commenturl": "CommentURL",
"discard": "Discard",
"port": "Port",
# httponly to protect against XSS attacks
"httponly": "httponly",
}
def __init__ (self, name, value, attributes=None):
"""Store name, value and attributes. Also calculates expiration
if given in attributes."""
self.name = name
self.value = value
if attributes is None:
self.attributes = {}
else:
self.attributes = attributes
self.calculate_expiration()
def calculate_expiration (self):
"""If "max-age" or "expires" attributes are given, calculate
the time when this cookie expires.
Stores the time value in self.expires, or None if this cookie
does not expire.
"""
# default: do not expire
self.expire = None
if "max-age" in self.attributes:
now = time.time()
try:
maxage = int(self.attributes["max-age"])
if maxage == 0:
# Expire immediately: subtract 1 to be sure since
# some clocks have only full second precision.
self.expire = now - 1
else:
self.expire = now + maxage
except (ValueError, OverflowError):
# note: even self.now + maxage can overflow
pass
elif "expires" in self.attributes:
expiration_date = self.attributes["expires"]
try:
self.expire = cookielib.http2time(expiration_date)
except ValueError:
# see http://bugs.python.org/issue16181
raise CookieError("Invalid expiration date in %r" % expiration_date)
def is_expired (self, now=None):
"""Return True if this cookie is expired, else False."""
if self.expire is None:
# Does not expire.
return False
if now is None:
now = time.time()
return now > self.expire
def __repr__ (self):
"""Return cookie name, value and attributes as string."""
attrs = "; ".join("%s=%r"%(k, v) for k, v in self.attributes.items())
return "<%s %s=%r; %s>" % (self.__class__.__name__,
self.name, self.value, attrs)
def is_valid_for (self, scheme, host, port, path):
"""Check validity of this cookie against the desired scheme,
host and path."""
if self.check_expired() and \
self.check_domain(host) and \
self.check_port(port) and \
self.check_path(path) and \
self.check_secure(scheme):
return True
return False
def check_expired (self):
"""Return False if cookie is expired, else True."""
return not self.is_expired()
def check_domain (self, domain):
"""Return True if given domain matches this cookie, else False."""
if "domain" not in self.attributes:
return False
cdomain = self.attributes["domain"]
if domain == cdomain:
# equality matches
return True
if "." not in domain and domain == cdomain[1:]:
# "localhost" and ".localhost" match
return True
if not domain.endswith(cdomain):
# any suffix matches
return False
if "." in domain[:-(len(cdomain)+1)]:
# prefix must be dot-free
return False
return True
def check_port (self, port):
"""Return True if given port matches this cookie, else False.
For now, this returns always True."""
return True
def check_path (self, path):
"""Return True if given path matches this cookie, else False."""
if "path" not in self.attributes:
return False
return path.startswith(self.attributes["path"])
def check_secure (self, scheme):
"""Return True if given Scheme is allowed for this cookie, else
False."""
if "secure" in self.attributes:
return scheme == "https"
return True
def set_attribute (self, key, value):
"""Helper method to set attribute values. Called when parsing
cookie data.
The attribute key and value are checked, and CookieError is
raised in these cases."""
if self.attributes is None:
raise CookieError("no NAME=VALUE before attributes found")
key = key.lower()
if key not in self.attribute_names:
raise CookieError("invalid attribute %r" % key)
if value:
value = unquote(value)
else:
value = ""
if key == "domain":
value = value.lower()
if not value.startswith(".") and not has_embedded_dot(value):
if "." in value:
raise CookieError("invalid dot in domain %r" % value)
# supply a leading dot
value = "."+value
if key == "max-age":
try:
if int(value) < 0:
raise ValueError("Negative Max-Age")
except (OverflowError, ValueError):
raise CookieError("invalid Max-Age number: %r" % value)
if key == "port":
ports = value.split(",")
for port in ports:
try:
if not (0 <= int(port) <= 65535):
raise ValueError("Invalid port number")
except (OverflowError, ValueError):
raise CookieError("invalid port number: %r" % port)
self.attributes[key] = value
def parse (self, text, patt=_CookiePattern):
"""Parse cookie data."""
text = strformat.ascii_safe(text.rstrip('\r\n'))
# reset values
self.name = None
self.value = None
self.attributes = None
# Our starting point
i = 0
# Length of string
n = len(text)
while 0 <= i < n:
# Start looking for a key-value pair.
match = patt.search(text, i)
if not match:
# No more key-value pairs.
break
key, value = match.group("key"), match.group("val")
if value is None:
value = ""
i = match.end()
# Parse the key, value in case it's metainfo.
if self.name is None:
# Set name and value.
self.name = key
self.value = unquote(value)
self.attributes = {}
else:
if key.startswith("$"):
key = key[1:]
self.set_attribute(key, value)
if self.name is None:
raise CookieError("missing cookie name in %r" % text)
self.calculate_expiration()
def set_default_attributes (self, scheme, host, path):
"""Set domain and path attributes for given scheme, host and
path."""
scheme = strformat.ascii_safe(scheme)
host = strformat.ascii_safe(host)
path = strformat.ascii_safe(path)
if "domain" not in self.attributes:
self.attributes["domain"] = host.lower()
if "path" not in self.attributes:
i = path.rfind("/")
if i == -1:
path = "/"
else:
path = path[:i]
if not path:
path = "/"
self.attributes["path"] = path
if not self.check_domain(host):
cdomain = self.attributes["domain"]
raise CookieError("domain %r not for cookie %r" % (cdomain, host))
if not self.check_path(path):
cpath = self.attributes["path"]
raise CookieError("domain %r not for cookie %r" % (cpath, path))
if not self.check_secure(scheme):
raise CookieError("no secure scheme %r" % scheme)
def quote (self, key, value):
"""Quote value for given key."""
return quote(value)
def server_header_value (self):
"""Return HTTP header value to send to server."""
parts = ["%s=%s" % (self.name, quote(self.value))]
parts.extend(["%s=%s"% (self.attribute_names[k], self.quote(k, v)) \
for k, v in self.attributes.items()])
return "; ".join(parts)
def client_header_value (self):
"""Return HTTP header value to send to client."""
parts = []
if "version" in self.attributes:
parts.append("$Version=%s" % quote(self.attributes["version"]))
parts.append("%s=%s" % (self.name, quote(self.value)))
parts.extend(["$%s=%s"% (self.attribute_names[k], self.quote(k, v)) \
for k, v in self.attributes.items() if k != "version"])
return "; ".join(parts)
class NetscapeCookie (HttpCookie):
    """Parses RFC 2109 (Netscape) cookies."""

    def __init__ (self, text, scheme, host, path):
        """Parse the raw cookie data and fill in default attributes."""
        self.parse(text)
        self.set_default_attributes(scheme, host, path)

    def server_header_name (self):
        """Return "Set-Cookie" as server header name."""
        return "Set-Cookie"

    def __eq__ (self, other):
        """Two Netscape cookies are equal when the names match
        case-insensitively and domain and path are identical."""
        if not isinstance(other, NetscapeCookie):
            return False
        return (self.name.lower() == other.name.lower()
                and self.attributes['domain'] == other.attributes['domain']
                and self.attributes['path'] == other.attributes['path'])

    def __hash__ (self):
        """Hash over lowercase name, domain and path."""
        return hash((
            self.name.lower(),
            self.attributes['domain'],
            self.attributes['path'],
        ))
class Rfc2965Cookie (HttpCookie):
    """Parses RFC 2965 cookies."""

    def __init__ (self, text, scheme, host, path):
        """Parse the raw cookie data and fill in default attributes."""
        self.parse(text)
        self.set_default_attributes(scheme, host, path)

    def check_port (self, port):
        """Return True if given port matches this cookie, else False.
        Without a "port" attribute every port is accepted."""
        try:
            cport = self.attributes["port"]
        except KeyError:
            return True
        allowed = [int(p) for p in cport.split(",")]
        return port in allowed

    def server_header_name (self):
        """Return "Set-Cookie2" as server header name."""
        return "Set-Cookie2"

    def quote (self, key, value):
        """Quote value for given key; port lists are always quoted."""
        if key == "port":
            return quote(value, LegalChars="")
        return quote(value)

    def __eq__ (self, other):
        """Two RFC 2965 cookies are equal when name and domain match
        case-insensitively and the paths are identical."""
        if not isinstance(other, Rfc2965Cookie):
            return False
        return (self.name.lower() == other.name.lower()
                and (self.attributes['domain'].lower()
                     == other.attributes['domain'].lower())
                and self.attributes['path'] == other.attributes['path'])

    def __hash__ (self):
        """Hash over lowercase name, lowercase domain and path."""
        return hash((
            self.name.lower(),
            self.attributes['domain'].lower(),
            self.attributes['path'],
        ))
def from_file (filename):
@ -545,92 +48,21 @@ def from_file (filename):
def from_headers (strheader):
"""Parse cookie data from a string in HTTP header (RFC 2616) format.
@return: tuple (headers, scheme, host, path)
@return: list of cookies
@raises: ValueError for incomplete or invalid data
"""
res = []
fp = StringIO(strheader)
headers = httplib.HTTPMessage(fp, seekable=True)
if "Host" not in headers:
raise ValueError("Required header 'Host:' missing")
host = headers["Host"]
scheme = headers.get("Scheme", "http")
path= headers.get("Path", "/")
return (headers, scheme, host, path)
## Taken and adapted from the _mechanize package included in Twill.
def cookie_str(cookie):
    """Return string representation of Cookie.

    The part order (name/value first, then path, domain, the optional
    flags, sorted nonstandard attributes, and version last) is fixed.
    """
    parts = [
        (cookie.name, unquote(cookie.value)),
        ("path", cookie.path),
        ("domain", cookie.domain),
    ]
    if cookie.port is not None:
        parts.append(("port", cookie.port))
    #if cookie.path_specified: parts.append(("path_spec", None))
    #if cookie.port_specified: parts.append(("port_spec", None))
    #if cookie.domain_initial_dot: parts.append(("domain_dot", None))
    if cookie.secure:
        parts.append(("secure", None))
    if cookie.httponly:
        parts.append(("httponly", None))
    if cookie.expires:
        parts.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        parts.append(("discard", None))
    if cookie.comment:
        parts.append(("comment", cookie.comment))
    if cookie.comment_url:
        parts.append(("commenturl", cookie.comment_url))
    #if cookie.rfc2109: parts.append(("rfc2109", None))
    # nonstandard attributes are emitted in sorted key order
    for key in sorted(cookie.nonstandard_attr_keys()):
        parts.append((key, str(cookie.get_nonstandard_attr(key))))
    parts.append(("version", str(cookie.version)))
    return join_header_words([parts])
def time2isoz(t=None):
    """Format a seconds-since-epoch value as an ISO-style UTC string.

    Called without an argument, the current time is used.
    The result looks like "YYYY-MM-DD hh:mm:ssZ", representing
    Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z
    """
    if t is None:
        t = time.time()
    parts = time.gmtime(t)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % tuple(parts[:6])
# characters that must be backslash-escaped inside a quoted value
join_escape_re = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single
    header value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'
    """
    segments = []
    for pairs in lists:
        words = []
        for key, value in pairs:
            if value is not None:
                if not re.search(r"^\w+$", value):
                    # escape embedded " and \, then wrap in quotes
                    value = join_escape_re.sub(r"\\\1", value)
                    value = '"%s"' % value
                if key is None:
                    # Netscape cookies may have no name
                    word = value
                else:
                    word = "%s=%s" % (key, value)
            else:
                word = key
            words.append(word)
        if words:
            segments.append("; ".join(words))
    return ", ".join(segments)
for header in headers.getallmatchingheaders("Set-Cookie"):
headervalue = header.split(':', 1)[1]
for pairs in cookielib.split_header_words([headervalue]):
for name, value in pairs:
cookie = requests.cookies.create_cookie(name, value,
domain=host, path=path)
res.append(cookie)
return res

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2012 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -19,13 +19,11 @@ Management of checking a queue of links with several threads.
"""
import os
import thread
import urlparse
from cStringIO import StringIO
from .. import log, LOG_CHECK, LinkCheckerInterrupt, cookies, dummy, \
fileutil, strformat
from ..cache import urlqueue, robots_txt, cookie, connection
import time
from .. import log, LOG_CHECK, LinkCheckerInterrupt, dummy, \
fileutil, strformat, plugins
from ..cache import urlqueue, robots_txt
from . import aggregator, console
from ..httplib2 import HTTPMessage
def visit_loginurl (aggregate):
@ -53,7 +51,7 @@ def visit_loginurl (aggregate):
log.warn(LOG_CHECK, _("Error posting form at login URL %(url)s.") % \
{"url": url})
return
store_cookies(tc.get_browser().cj, aggregate.cookies, url)
#XXX store_cookies(tc.get_browser().cj, aggregate.cookies, url)
resulturl = tc.get_browser().get_url()
log.debug(LOG_CHECK, u"URL after POST is %s" % resulturl)
# add result URL to check list
@ -107,18 +105,6 @@ def search_formname (fieldnames, tc):
return None
def store_cookies (cookiejar, cookiecache, url):
"""Store cookies in cookiejar into the cookiecache."""
cookielst = []
for c in cookiejar:
cookielst.append("Set-Cookie2: %s" % cookies.cookie_str(c))
log.debug(LOG_CHECK, "Store cookies %s", cookielst)
headers = HTTPMessage(StringIO("\r\n".join(cookielst)))
urlparts = urlparse.urlsplit(url)
scheme, host, path = urlparts[0:3]
cookiecache.add(headers, scheme, host, path)
def check_urls (aggregate):
"""Main check function; checks all configured URLs until interrupted
with Ctrl-C.
@ -194,14 +180,17 @@ def abort (aggregate):
break
except KeyboardInterrupt:
log.warn(LOG_CHECK, _("user abort; force shutdown"))
aggregate.logger.end_log_output()
abort_now()
def abort_now ():
"""Force exit of current process without cleanup."""
if os.name == 'posix':
# Unix systems can use sigkill
# Unix systems can use signals
import signal
os.kill(os.getpid(), signal.SIGTERM)
time.sleep(1)
os.kill(os.getpid(), signal.SIGKILL)
elif os.name == 'nt':
# NT has os.abort()
@ -214,8 +203,6 @@ def abort_now ():
def get_aggregate (config):
"""Get an aggregator instance with given configuration."""
_urlqueue = urlqueue.UrlQueue(max_allowed_puts=config["maxnumurls"])
connections = connection.ConnectionPool(config.get_connectionlimits(), wait=config["wait"])
cookies = cookie.CookieJar()
_robots_txt = robots_txt.RobotsTxt()
return aggregator.Aggregate(config, _urlqueue, connections,
cookies, _robots_txt)
plugin_manager = plugins.PluginManager(config)
return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager)

View file

@ -17,54 +17,93 @@
"""
Aggregate needed object instances for checker threads.
"""
import time
import threading
from .. import log, LOG_CHECK, strformat
import thread
import requests
import time
import random
from .. import log, LOG_CHECK, strformat, cookies
from ..decorators import synchronized
from ..cache import urlqueue
from . import logger, status, checker, cleanup
from . import logger, status, checker, interrupt
_w3_time_lock = threading.Lock()
_threads_lock = threading.RLock()
_download_lock = threading.Lock()
_hosts_lock = threading.RLock()
def new_request_session(config):
    """Create a new requests.Session for a checker thread.

    If a cookie file is configured, its cookies are merged into the
    session's cookie jar.
    @param config: the configuration mapping; only "cookiefile" is
        read here
    @return: a configured requests.Session instance
    """
    session = requests.Session()
    # XXX proxies
    if config["cookiefile"]:
        # merge_cookies returns the merged jar, so reassign on each
        # iteration to accumulate every cookie from the file.
        for cookie in cookies.from_file(config["cookiefile"]):
            session.cookies = requests.cookies.merge_cookies(session.cookies, cookie)
    return session
class Aggregate (object):
"""Store thread-safe data collections for checker threads."""
def __init__ (self, config, urlqueue, connections, cookies, robots_txt):
def __init__ (self, config, urlqueue, robots_txt, plugin_manager):
"""Store given link checking objects."""
self.config = config
self.urlqueue = urlqueue
self.connections = connections
self.cookies = cookies
self.robots_txt = robots_txt
self.logger = logger.Logger(config)
self.threads = []
self.last_w3_call = 0
self.downloaded_bytes = 0
self.request_sessions = {}
self.robots_txt = robots_txt
self.plugin_manager = plugin_manager
self.times = {}
requests_per_second = config["maxrequestspersecond"]
self.wait_time_min = 1.0 / requests_per_second
self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
@synchronized(_threads_lock)
def start_threads (self):
"""Spawn threads for URL checking and status printing."""
if self.config["status"]:
t = status.Status(self.urlqueue, self.config.status_logger,
self.config["status_wait_seconds"],
self.config["maxrunseconds"])
self.config["status_wait_seconds"])
t.start()
self.threads.append(t)
if self.config["maxrunseconds"]:
t = interrupt.Interrupt(self.config["maxrunseconds"])
t.start()
self.threads.append(t)
t = cleanup.Cleanup(self.connections)
t.start()
self.threads.append(t)
num = self.config["threads"]
if num > 0:
for dummy in range(num):
t = checker.Checker(self.urlqueue, self.logger)
t.start()
t = checker.Checker(self.urlqueue, self.logger, self.add_request_session)
self.threads.append(t)
t.start()
else:
self.request_sessions[thread.get_ident()] = new_request_session(self.config)
checker.check_url(self.urlqueue, self.logger)
    @synchronized(_threads_lock)
    def add_request_session(self):
        """Create and register a requests session for the current
        thread, keyed by thread.get_ident() in self.request_sessions.
        """
        session = new_request_session(self.config)
        self.request_sessions[thread.get_ident()] = session

    @synchronized(_threads_lock)
    def get_request_session(self):
        """Return the request session registered for the current
        thread.
        Raises KeyError if add_request_session() was not called from
        this thread first.
        """
        return self.request_sessions[thread.get_ident()]

    @synchronized(_hosts_lock)
    def wait_for_host(self, host):
        """Throttle requests to one host.

        If the host's next allowed request time lies in the future,
        sleep until then.  Afterwards schedule the following slot a
        random interval in [wait_time_min, wait_time_max] ahead, so
        requests to one host are spread out with some jitter.
        """
        t = time.time()
        if host in self.times:
            due_time = self.times[host]
            if due_time > t:
                wait = due_time - t
                time.sleep(wait)
                t = time.time()
        wait_time = random.uniform(self.wait_time_min, self.wait_time_max)
        self.times[host] = t + wait_time
@synchronized(_threads_lock)
def print_active_threads (self):
"""Log all currently active threads."""
@ -77,8 +116,8 @@ class Aggregate (object):
first = False
log.info(LOG_CHECK, name[12:])
args = dict(
num=len(self.threads),
timeout=strformat.strduration_long(self.config["timeout"]),
num=len([x for x in self.threads if x.getName().startswith("CheckThread-")]),
timeout=strformat.strduration_long(self.config["aborttimeout"]),
)
log.info(LOG_CHECK, _("%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop.") % args)
@ -98,7 +137,7 @@ class Aggregate (object):
"""Print still-active URLs and empty the URL queue."""
self.print_active_threads()
self.cancel()
timeout = self.config["timeout"]
timeout = self.config["aborttimeout"]
try:
self.urlqueue.join(timeout=timeout)
except urlqueue.Timeout:
@ -118,36 +157,9 @@ class Aggregate (object):
self.cancel()
for t in self.threads:
t.stop()
self.connections.clear()
self.gather_statistics()
@synchronized(_threads_lock)
def is_finished (self):
"""Determine if checking is finished."""
self.remove_stopped_threads()
return self.urlqueue.empty() and not self.threads
@synchronized(_w3_time_lock)
def check_w3_time (self):
"""Make sure the W3C validators are at most called once a second."""
if time.time() - self.last_w3_call < 1:
time.sleep(1)
self.last_w3_call = time.time()
@synchronized(_download_lock)
def add_download_data(self, url, data):
"""Add given downloaded data.
@param url: URL which data belongs to
@ptype url: unicode
@param data: downloaded data
@ptype data: string
"""
self.downloaded_bytes += len(data)
def gather_statistics(self):
"""Gather download and cache statistics and send them to the
logger.
"""
robots_txt_stats = self.robots_txt.hits, self.robots_txt.misses
download_stats = self.downloaded_bytes
self.logger.add_statistics(robots_txt_stats, download_stats)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2011 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -36,14 +36,17 @@ def check_url (urlqueue, logger):
class Checker (task.LoggedCheckedTask):
"""URL check thread."""
def __init__ (self, urlqueue, logger):
def __init__ (self, urlqueue, logger, add_request_session):
"""Store URL queue and logger."""
super(Checker, self).__init__(logger)
self.urlqueue = urlqueue
self.origname = self.getName()
self.add_request_session = add_request_session
def run_checked (self):
"""Check URLs in the queue."""
# construct per-thread HTTP/S requests session
self.add_request_session()
while not self.stopped(0):
self.check_url()

View file

@ -1,40 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2007-2011 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Cleanup task."""
import time
from . import task, console
class Cleanup (task.CheckedTask):
"""Cleanup task performing periodic cleanup of cached connections."""
def __init__ (self, connections):
"""Store urlqueue object."""
super(Cleanup, self).__init__()
self.connections = connections
def run_checked (self):
"""Print periodic status messages."""
self.start_time = time.time()
self.setName("Cleanup")
# clean every 15 seconds
while not self.stopped(15):
self.connections.remove_expired()
def internal_error (self):
"""Print internal error to console."""
console.internal_error()

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2013 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -0,0 +1,46 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Raise KeyboardInterrupt after a configurable timeout, as a portable SIGALRM replacement."""
import time
from . import task
from .. import log, LOG_CHECK, strformat
class Interrupt (task.CheckedTask):
    """Thread that raises KeyboardInterrupt after a specified duration.
    This gives us a portable SIGALRM implementation.
    The duration is checked every 5 seconds.
    """

    # polling interval in seconds
    WaitSeconds = 5

    def __init__ (self, duration):
        """Initialize the task.
        @param duration: raise KeyboardInterrupt after given number of seconds
        @ptype duration: int
        """
        super(Interrupt, self).__init__()
        self.duration = duration

    def run_checked (self):
        """Sleep in WaitSeconds slices and raise KeyboardInterrupt once
        the configured duration has elapsed."""
        self.start_time = time.time()
        self.setName("Interrupt")
        while not self.stopped(self.WaitSeconds):
            elapsed = time.time() - self.start_time
            if elapsed > self.duration:
                log.warn(LOG_CHECK, "Interrupt after %s" % strformat.strduration_long(elapsed))
                raise KeyboardInterrupt()

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2012 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -29,7 +29,6 @@ class Logger (object):
self.loggers = [config['logger']]
self.loggers.extend(config['fileoutput'])
self.verbose = config["verbose"]
self.complete = config["complete"]
self.warnings = config["warnings"]
def start_log_output (self):
@ -46,15 +45,8 @@ class Logger (object):
for logger in self.loggers:
logger.end_output()
def add_statistics(self, robots_txt_stats, download_stats):
"""Add statistics to logger."""
for logger in self.loggers:
logger.add_statistics(robots_txt_stats, download_stats)
def do_print (self, url_data):
"""Determine if URL entry should be logged or not."""
if self.complete:
return True
if self.verbose:
return True
if self.warnings and url_data.warnings:

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2012 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -22,7 +22,7 @@ from . import task
class Status (task.LoggedCheckedTask):
"""Thread that gathers and logs the status periodically."""
def __init__ (self, urlqueue, logger, wait_seconds, max_duration):
def __init__ (self, urlqueue, logger, wait_seconds):
"""Initialize the status logger task.
@param urlqueue: the URL queue
@ptype urlqueue: Urlqueue
@ -30,33 +30,27 @@ class Status (task.LoggedCheckedTask):
@ptype logger: console.StatusLogger
@param wait_seconds: interval in seconds to report status
@ptype wait_seconds: int
@param max_duration: abort checking after given number of seconds
@ptype max_duration: int or None
"""
super(Status, self).__init__(logger)
self.urlqueue = urlqueue
self.wait_seconds = wait_seconds
assert self.wait_seconds >= 1
self.first_wait = True
self.max_duration = max_duration
def run_checked (self):
"""Print periodic status messages."""
self.start_time = time.time()
self.setName("Status")
if not self.first_wait:
wait_seconds = self.wait_seconds
else:
# the first status should be after a second
self.first_wait = False
wait_seconds = 1
# the first status should be after a second
wait_seconds = 1
first_wait = True
while not self.stopped(wait_seconds):
self.log_status()
if first_wait:
wait_seconds = self.wait_seconds
first_wait = False
def log_status (self):
"""Log a status message."""
duration = time.time() - self.start_time
if self.max_duration is not None and duration > self.max_duration:
raise KeyboardInterrupt()
checked, in_progress, queue = self.urlqueue.status()
self.logger.log_status(checked, in_progress, queue, duration)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2011 Bastian Kleineidam
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -16,7 +16,7 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import thread
from ..decorators import notimplemented
from .. import log, LOG_CHECK, threader
from .. import threader
from . import console
@ -28,7 +28,6 @@ class CheckedTask (threader.StoppableThread):
try:
self.run_checked()
except KeyboardInterrupt:
log.warn(LOG_CHECK, "interrupt did not reach the main thread")
thread.interrupt_main()
except Exception:
self.internal_error()

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2011 Bastian Kleineidam
# Copyright (C) 2005-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -275,6 +275,12 @@ def is_accessable_by_others(filename):
return mode & (stat.S_IRWXG | stat.S_IRWXO)
def is_writable_by_others(filename):
"""Check if file or directory is world writable."""
mode = os.stat(filename)[stat.ST_MODE]
return mode & stat.S_IWOTH
@memoized
def is_writable(filename):
"""Check if

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2010 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -23,7 +23,7 @@ from PyQt4 import QtCore, QtGui
from .linkchecker_ui_main import Ui_MainWindow
from .properties import set_properties, clear_properties
from .statistics import set_statistics, clear_statistics
from .debug import LinkCheckerDebug, LinkCheckerDebugMemory
from .debug import LinkCheckerDebug
from .logger import SignalLogger, GuiLogHandler, StatusLogger
from .help import HelpWindow
from .options import LinkCheckerOptions
@ -37,7 +37,7 @@ from .settings import Settings
from .recentdocs import RecentDocumentModel
from .projects import openproject, saveproject, loadproject, ProjectExt
from .. import configuration, checker, director, get_link_pat, \
strformat, fileutil, LinkCheckerError, memoryutil
strformat, fileutil, LinkCheckerError
from ..containers import enum
from .. import url as urlutil
from ..checker import httpheaders
@ -99,7 +99,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
# init subdialogs
self.options = LinkCheckerOptions(parent=self)
self.debug = LinkCheckerDebug(parent=self)
self.debugmemory = LinkCheckerDebugMemory(parent=self)
self.checker = CheckerThread(parent=self)
self.contextmenu = ContextMenu(parent=self)
self.editor = EditorWindow(parent=self)
@ -175,8 +174,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
def set_idle ():
"""Set application status to idle."""
self.status = Status.idle
if self.config["debugmemory"]:
self.dump_memory()
self.set_statusmsg(_("Check finished."))
self.controlButton.clicked.disconnect(self.checker.cancel)
self.checker.finished.connect(set_idle)
@ -250,7 +247,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
self.config["threads"] = 1
else:
self.config.reset_loglevel()
self.config["debugmemory"] = data["debugmemory"]
if data["warninglines"]:
lines = data["warninglines"].splitlines()
ro = re.compile(warninglines2regex(lines))
@ -313,7 +309,6 @@ class LinkCheckerMain (QtGui.QMainWindow, Ui_MainWindow):
elif status == Status.checking:
self.treeView.setSortingEnabled(False)
self.debug.reset()
self.debugmemory.reset()
self.set_statusmsg(u"Checking site...")
# disable commands
self.menubar.setEnabled(False)
@ -423,7 +418,7 @@ Version 2 or later.
def cancel (self):
"""Note that checking is canceled."""
self.controlButton.setEnabled(False)
duration = strformat.strduration_long(self.config["timeout"])
duration = strformat.strduration_long(self.config["aborttimeout"])
self.set_statusmsg(_(u"Closing active URLs with timeout %s...") % duration)
@QtCore.pyqtSlot()
@ -436,16 +431,6 @@ Version 2 or later.
else:
raise ValueError("Invalid application status %r" % self.status)
def dump_memory (self):
"""Dump memory to temporary file and inform user with a modal
dialog where the file is."""
self.set_statusmsg(_(u"Dumping memory statistics..."))
filename = memoryutil.write_memory_dump()
title = _(u"LinkChecker memory dump written")
message = _(u"The memory dump has been written to `%(filename)s'.")
attrs = dict(filename=filename)
QtGui.QMessageBox.information(self, title, message % attrs)
def get_url (self):
"""Return URL to check from the urlinput widget."""
url = strformat.stripurl(unicode(self.urlinput.text()))
@ -524,9 +509,10 @@ Version 2 or later.
"""View URL source in editor window."""
self.editor.setWindowTitle(u"View %s" % url)
self.editor.setUrl(url)
info, data = urlutil.get_content(url, proxy=self.config["proxy"])
if (info, data) == (None, None):
self.editor.setText(u"An error occurred retreiving URL `%s'." % url)
data, info = urlutil.get_content(url, proxy=self.config["proxy"])
if data is None:
msg = u"An error occurred retreiving URL `%s': %s." % (url, info)
self.editor.setText(msg)
else:
content_type = httpheaders.get_content_type(info)
if not content_type:

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2008-2011 Bastian Kleineidam
# Copyright (C) 2008-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2011 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2012 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -41,23 +41,3 @@ class LinkCheckerDebug (QtGui.QDialog, Ui_DebugDialog):
def getText (self):
"""Get debug info as string."""
return self.textEdit.toPlainText()
class LinkCheckerDebugMemory (QtGui.QDialog, Ui_DebugDialog):
"""Show memory debugging output."""
def __init__ (self, parent=None):
"""Setup the debug memory dialog."""
super(LinkCheckerDebugMemory, self).__init__(parent)
self.setupUi(self)
font = QtGui.QFont("Consolas", 11)
font.setFixedPitch(True)
self.textEdit.document().setDefaultFont(font)
def reset (self):
"""Clear memory info."""
self.textEdit.clear()
def setText (self, text):
"""Set memory debug info."""
return self.textEdit.setPlainText(text)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2011 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -2,8 +2,8 @@
# Form implementation generated from reading ui file 'ui/debug.ui'
#
# Created: Mon Dec 12 19:00:37 2011
# by: PyQt4 UI code generator 4.8.6
# Created: Fri Feb 28 21:24:59 2014
# by: PyQt4 UI code generator 4.9.3
#
# WARNING! All changes made in this file will be lost!
@ -19,7 +19,6 @@ class Ui_DebugDialog(object):
DebugDialog.setObjectName(_fromUtf8("DebugDialog"))
DebugDialog.setWindowModality(QtCore.Qt.ApplicationModal)
DebugDialog.resize(564, 547)
DebugDialog.setWindowTitle(_("LinkChecker debug log"))
self.verticalLayout = QtGui.QVBoxLayout(DebugDialog)
self.verticalLayout.setObjectName(_fromUtf8("verticalLayout"))
self.frame = QtGui.QFrame(DebugDialog)
@ -40,5 +39,5 @@ class Ui_DebugDialog(object):
QtCore.QMetaObject.connectSlotsByName(DebugDialog)
def retranslateUi(self, DebugDialog):
pass
DebugDialog.setWindowTitle(_("LinkChecker debug log"))

View file

@ -2,7 +2,7 @@
# Form implementation generated from reading ui file 'ui/main.ui'
#
# Created: Tue Nov 6 21:47:39 2012
# Created: Fri Feb 28 21:24:58 2014
# by: PyQt4 UI code generator 4.9.3
#
# WARNING! All changes made in this file will be lost!
@ -679,29 +679,6 @@ class Ui_MainWindow(object):
self.stats_url_maxlen.setOpenExternalLinks(True)
self.stats_url_maxlen.setObjectName(_fromUtf8("stats_url_maxlen"))
self.gridLayout_3.addWidget(self.stats_url_maxlen, 1, 1, 1, 1)
self.label_14 = QtGui.QLabel(self.groupBox_2)
sizePolicy = QtGui.QSizePolicy(QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Preferred)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(self.label_14.sizePolicy().hasHeightForWidth())
self.label_14.setSizePolicy(sizePolicy)
self.label_14.setAlignment(QtCore.Qt.AlignRight|QtCore.Qt.AlignTrailing|QtCore.Qt.AlignVCenter)
self.label_14.setObjectName(_fromUtf8("label_14"))
self.gridLayout_3.addWidget(self.label_14, 1, 2, 1, 1)
self.stats_domains = QtGui.QLabel(self.groupBox_2)
sizePolicy = QtGui.QSizePolicy(QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Preferred)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(self.stats_domains.sizePolicy().hasHeightForWidth())
self.stats_domains.setSizePolicy(sizePolicy)
self.stats_domains.setMinimumSize(QtCore.QSize(30, 0))
self.stats_domains.setFrameShape(QtGui.QFrame.StyledPanel)
self.stats_domains.setFrameShadow(QtGui.QFrame.Sunken)
self.stats_domains.setText(_fromUtf8(""))
self.stats_domains.setTextFormat(QtCore.Qt.RichText)
self.stats_domains.setOpenExternalLinks(True)
self.stats_domains.setObjectName(_fromUtf8("stats_domains"))
self.gridLayout_3.addWidget(self.stats_domains, 1, 3, 1, 1)
self.verticalLayout_2.addWidget(self.groupBox_2)
self.horizontalLayout.addWidget(self.statistics)
self.verticalLayout.addLayout(self.horizontalLayout)
@ -831,7 +808,6 @@ class Ui_MainWindow(object):
self.label_18.setText(_("Min. length"))
self.label_20.setText(_("Avg. length"))
self.label_19.setText(_("Max. length"))
self.label_14.setText(_("Domains"))
self.menuEdit.setTitle(_("&Edit"))
self.menuFile.setTitle(_("&File"))
self.menuHelp.setTitle(_("&Help"))

View file

@ -2,8 +2,8 @@
# Form implementation generated from reading ui file 'ui/options.ui'
#
# Created: Sun Jun 10 11:51:42 2012
# by: PyQt4 UI code generator 4.9.1
# Created: Fri Feb 28 21:24:59 2014
# by: PyQt4 UI code generator 4.9.3
#
# WARNING! All changes made in this file will be lost!
@ -28,6 +28,7 @@ class Ui_Options(object):
self.widget = QtGui.QWidget(self.groupBox_2)
self.widget.setObjectName(_fromUtf8("widget"))
self.formLayout = QtGui.QFormLayout(self.widget)
self.formLayout.setFieldGrowthPolicy(QtGui.QFormLayout.ExpandingFieldsGrow)
self.formLayout.setMargin(0)
self.formLayout.setObjectName(_fromUtf8("formLayout"))
self.label = QtGui.QLabel(self.widget)
@ -60,14 +61,6 @@ class Ui_Options(object):
self.debug.setText(_fromUtf8(""))
self.debug.setObjectName(_fromUtf8("debug"))
self.formLayout.setWidget(2, QtGui.QFormLayout.FieldRole, self.debug)
self.label_7 = QtGui.QLabel(self.widget)
self.label_7.setToolTip(_fromUtf8(""))
self.label_7.setObjectName(_fromUtf8("label_7"))
self.formLayout.setWidget(3, QtGui.QFormLayout.LabelRole, self.label_7)
self.debugmemory = QtGui.QCheckBox(self.widget)
self.debugmemory.setText(_fromUtf8(""))
self.debugmemory.setObjectName(_fromUtf8("debugmemory"))
self.formLayout.setWidget(3, QtGui.QFormLayout.FieldRole, self.debugmemory)
self.verticalLayout.addWidget(self.widget)
spacerItem = QtGui.QSpacerItem(20, 10, QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Expanding)
self.verticalLayout.addItem(spacerItem)
@ -143,7 +136,6 @@ class Ui_Options(object):
self.label_2.setText(_("Verbose output"))
self.verbose.setToolTip(_("Log all checked URLs once. Default is to log only errors and warnings."))
self.label_4.setText(_("Debug"))
self.label_7.setText(_("Debug memory usage"))
self.label_5.setText(_("Warn when one of these strings are found (one per line):"))
self.label_6.setText(_("Ignore URLs matching one of these patterns (one per line):"))
self.groupBox.setTitle(_("Configuration file"))

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2009-2012 Bastian Kleineidam
# Copyright (C) 2009-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -19,7 +19,7 @@ import os
from PyQt4 import QtGui
from .linkchecker_ui_options import Ui_Options
from .editor import EditorWindow
from ..fileutil import is_writable, has_module
from ..fileutil import is_writable
from .. import configuration
@ -46,11 +46,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
self.recursionlevel.setValue(-1)
self.verbose.setChecked(False)
self.debug.setChecked(False)
self.debugmemory.setChecked(False)
if not has_module("meliae"):
self.debugmemory.setEnabled(False)
from ..memoryutil import MemoryDebugMsg
self.debugmemory.setToolTip(MemoryDebugMsg)
self.warninglines.setPlainText(u"")
self.ignorelines.setPlainText(u"")
@ -69,7 +64,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
"""Return option data as dictionary."""
return dict(
debug=self.debug.isChecked(),
debugmemory=self.debugmemory.isChecked(),
verbose=self.verbose.isChecked(),
recursionlevel=self.recursionlevel.value(),
warninglines=unicode(self.warninglines.toPlainText()),
@ -80,8 +74,6 @@ class LinkCheckerOptions (QtGui.QDialog, Ui_Options):
"""Set GUI options from given data."""
if data.get("debug") is not None:
self.debug.setChecked(data["debug"])
if data.get("debugmemory") is not None:
self.debugmemory.setChecked(data["debugmemory"])
if data.get("verbose") is not None:
self.verbose.setChecked(data["verbose"])
if data.get("recursionlevel") is not None:

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -57,9 +57,6 @@ class ProjectParser (confparse.LCConfigParser):
return
data = {}
option = "debug"
if self.has_option(section, option):
data[option] = self.getboolean(section, option)
option = "debugmemory"
if self.has_option(section, option):
data[option] = self.getboolean(section, option)
option = "verbose"

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2012 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -38,8 +38,8 @@ def set_properties (widget, data):
widget.prop_dltime.setText(_("%.3f seconds") % data.dltime)
else:
widget.prop_dltime.setText(u"")
if data.dlsize >= 0:
widget.prop_size.setText(strformat.strsize(data.dlsize))
if data.size >= 0:
widget.prop_size.setText(strformat.strsize(data.size))
else:
widget.prop_size.setText(u"")
if data.modified:

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -85,10 +85,10 @@ class Settings (object):
def read_options (self):
"""Return stored GUI options."""
data = dict(debug=None, debugmemory=None, verbose=None,
data = dict(debug=None, verbose=None,
recursionlevel=None, warninglines=None, ignorelines=None)
self.settings.beginGroup('output')
for key in ("debug", "debugmemory", "verbose"):
for key in ("debug", "verbose"):
if self.settings.contains(key):
data[key] = self.settings.value(key).toBool()
self.settings.endGroup()
@ -116,7 +116,7 @@ class Settings (object):
def save_options (self, data):
"""Save GUI options."""
self.settings.beginGroup('output')
for key in ("debug", "debugmemory", "verbose"):
for key in ("debug", "verbose"):
self.settings.setValue(key, QtCore.QVariant(data[key]))
self.settings.endGroup()
self.settings.beginGroup('checking')

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2011 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -19,7 +19,6 @@ from ..logger import ContentTypes
def set_statistics (widget, statistics):
"""Set statistic information in given widget."""
widget.stats_domains.setText(u"%d" % len(statistics.domains))
widget.stats_url_minlen.setText(u"%d" % statistics.min_url_length)
widget.stats_url_maxlen.setText(u"%d" % statistics.max_url_length)
widget.stats_url_avglen.setText(u"%d" % statistics.avg_url_length)
@ -38,7 +37,6 @@ def set_statistics (widget, statistics):
def clear_statistics (widget):
"""Reset statistic information in given widget."""
widget.stats_domains.setText(u"")
widget.stats_url_minlen.setText(u"")
widget.stats_url_maxlen.setText(u"")
widget.stats_url_avglen.setText(u"")

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1402,53 +1402,6 @@
</property>
</widget>
</item>
<item row="1" column="2">
<widget class="QLabel" name="label_14">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="text">
<string>Domains</string>
</property>
<property name="alignment">
<set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
</property>
</widget>
</item>
<item row="1" column="3">
<widget class="QLabel" name="stats_domains">
<property name="sizePolicy">
<sizepolicy hsizetype="Minimum" vsizetype="Preferred">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="minimumSize">
<size>
<width>30</width>
<height>0</height>
</size>
</property>
<property name="frameShape">
<enum>QFrame::StyledPanel</enum>
</property>
<property name="frameShadow">
<enum>QFrame::Sunken</enum>
</property>
<property name="text">
<string/>
</property>
<property name="textFormat">
<enum>Qt::RichText</enum>
</property>
<property name="openExternalLinks">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View file

@ -29,6 +29,9 @@
<item>
<widget class="QWidget" name="widget" native="true">
<layout class="QFormLayout" name="formLayout">
<property name="fieldGrowthPolicy">
<enum>QFormLayout::ExpandingFieldsGrow</enum>
</property>
<item row="0" column="0">
<widget class="QLabel" name="label">
<property name="toolTip">
@ -104,23 +107,6 @@
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="label_7">
<property name="toolTip">
<string extracomment="When checking finishes, write a memory dump to a temporary file. The memory dump is written both when checking finishes normally and when checking gets canceled."/>
</property>
<property name="text">
<string>Debug memory usage</string>
</property>
</widget>
</item>
<item row="3" column="1">
<widget class="QCheckBox" name="debugmemory">
<property name="text">
<string/>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2011 Bastian Kleineidam
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2008-2009 Bastian Kleineidam
# Copyright (C) 2008-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2001-2010 Bastian Kleineidam
# Copyright (C) 2001-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -201,9 +201,7 @@ class LinkFinder (TagFinder):
def start_element (self, tag, attrs):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d",
self.parser.lineno(), self.parser.column(),
self.parser.last_lineno(), self.parser.last_column())
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
if tag == "base" and not self.base_ref:
self.base_ref = unquote(attrs.get_true("href", u''))
tagattrs = self.tags.get(tag, [])
@ -282,7 +280,6 @@ class LinkFinder (TagFinder):
return
for u in urls:
assert isinstance(u, unicode) or u is None, repr(u)
log.debug(LOG_CHECK,
u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
log.debug(LOG_CHECK, u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
self.callback(u, self.parser.last_lineno(),
self.parser.last_column(), name, base)

File diff suppressed because it is too large Load diff

View file

@ -1,86 +1,9 @@
# -*- coding: iso-8859-1 -*-
# Various HTTP utils with a free license
from cStringIO import StringIO
from . import gzip2 as gzip
from . import httplib2 as httplib
from . import log, LOG_CHECK, fileutil
import re
import zlib
import urllib
import urllib2
from . import fileutil
import base64
###########################################################################
# urlutils.py - Simplified urllib handling
#
# Written by Chris Lawrence <lawrencc@debian.org>
# (C) 1999-2002 Chris Lawrence
#
# This program is freely distributable per the following license:
#
## Permission to use, copy, modify, and distribute this software and its
## documentation for any purpose and without fee is hereby granted,
## provided that the above copyright notice appears in all copies and that
## both that copyright notice and this permission notice appear in
## supporting documentation.
##
## I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL I
## BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
## DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
## SOFTWARE.
def decode (page):
"""Gunzip or deflate a compressed page."""
log.debug(LOG_CHECK, "page info %d %s", page.code, str(page.info()))
encoding = page.info().get("Content-Encoding")
if encoding in ('gzip', 'x-gzip', 'deflate'):
# cannot seek in socket descriptors, so must get content now
content = page.read()
try:
if encoding == 'deflate':
fp = StringIO(zlib.decompress(content))
else:
fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
except zlib.error as msg:
log.debug(LOG_CHECK, "uncompressing had error "
"%s, assuming non-compressed content", str(msg))
fp = StringIO(content)
# remove content-encoding header
headers = httplib.HTTPMessage(StringIO(""))
ceheader = re.compile(r"(?i)content-encoding:")
for h in page.info().keys():
if not ceheader.match(h):
headers[h] = page.info()[h]
newpage = urllib.addinfourl(fp, headers, page.geturl())
newpage.code = page.code
newpage.msg = page.msg
return newpage
return page
class HttpWithGzipHandler (urllib2.HTTPHandler):
"""Support gzip encoding."""
def http_open (self, req):
"""Send request and decode answer."""
return decode(urllib2.HTTPHandler.http_open(self, req))
if hasattr(httplib, 'HTTPS'):
class HttpsWithGzipHandler (urllib2.HTTPSHandler):
"""Support gzip encoding."""
def https_open (self, req):
"""Send request and decode answer."""
return decode(urllib2.HTTPSHandler.https_open(self, req))
# end of urlutils.py routines
###########################################################################
def encode_multipart_formdata(fields, files=None):
"""
From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -172,7 +172,7 @@ def get_configuration(form, out):
config["logger"] = config.logger_new('html', fd=out, encoding=HTML_ENCODING)
config["threads"] = 2
if "anchors" in form:
config["anchors"] = True
config["enabledplugins"].append("AnchorCheck")
if "errors" not in form:
config["verbose"] = True
# avoid checking of local files or other nasty stuff
@ -246,15 +246,16 @@ def format_error (why):
@return: HTML page content
@rtype: unicode
"""
return _("""<html><head>
return _("""<!DOCTYPE HTML>
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>LinkChecker Online Error</title></head>
<body text=#192c83 bgcolor=#fff7e5 link=#191c83 vlink=#191c83 alink=#191c83>
<blockquote>
<b>Error: %s</b><br>
<b>Error: %s</b><br/>
The LinkChecker Online script has encountered an error. Please ensure
that your provided URL link begins with <code>http://</code> and
contains only these characters: <code>A-Za-z0-9./_~-</code><br><br>
contains only these characters: <code>A-Za-z0-9./_~-</code><br/><br/>
Errors are logged.
</blockquote>
</body>

Some files were not shown because too many files have changed in this diff Show more