From e6ad32c0282e623c537a56a5f80e6cfd3fa98b07 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Wed, 23 Jan 2013 19:42:29 +0100 Subject: [PATCH] Catch UnicodeError for invalid host names. --- doc/changelog.txt | 8 +++++--- linkcheck/checker/const.py | 2 ++ linkcheck/checker/urlbase.py | 9 ++++++--- tests/checker/data/http.html | 3 +++ tests/checker/data/http.html.result | 8 +++++++- 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/doc/changelog.txt b/doc/changelog.txt index 7e9eea42..5cc7364b 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -3,9 +3,9 @@ Features: - checking: Support URLs. - logging: Sending SIGUSR1 signal prints the stack trace of all current - running threads. This makes it easier to debug deadlocks. -- gui: Added support of Drag-and-Drop of local files. If the local file is - a LinkChecker project (.lcp) it is loaded automatically, else the check + running threads. This makes debugging deadlocks easier. +- gui: Support Drag-and-Drop of local files. If the local file is + a LinkChecker project (.lcp) file it is loaded, else the check URL is set to the local file URL. Changes: @@ -14,6 +14,8 @@ Changes: Fixes: - checking: Fix a crash when closing a Word document after scanning failed. Closes: GH bug #369 +- checking: Catch UnicodeError from idna.encode() fixing an internal error when + trying to connect to certain invalid hostnames. 8.3 "Mahna Mahna Killer" (released 6.1.2013) diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index 81557c59..53dd2483 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -53,6 +53,8 @@ ExcCacheList = [ ftplib.error_temp, ftplib.error_perm, ftplib.error_proto, + # idna.encode(), called from socket.create_connection() + UnicodeError, ] # Exceptions that do not put the URL in the cache so that the URL can diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index b8b0fb50..a813bdf8 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -526,14 +526,17 @@ class UrlBase (object): self.check_connection() self.add_size_info() self.add_country_info() - except tuple(ExcList): + except tuple(ExcList) as exc: value = self.handle_exception() # make nicer error msg for unknown hosts - if isinstance(value, socket.error) and value.args[0] == -2: + if isinstance(exc, socket.error) and exc.args[0] == -2: value = _('Hostname not found') # make nicer error msg for bad status line - if isinstance(value, httplib.BadStatusLine): + elif isinstance(exc, httplib.BadStatusLine): value = _('Bad HTTP response %(line)r') % {"line": str(value)} + elif isinstance(exc, UnicodeError): + # idna.encode(host) failed + value = _('Bad hostname %(host)r: %(msg)s') % {'host': self.host, 'msg': str(value)} self.set_result(unicode_safe(value), valid=False) self.checktime = time.time() - check_start if self.do_check_content: diff --git a/tests/checker/data/http.html b/tests/checker/data/http.html index a9ea63a3..6dae00da 100644 --- a/tests/checker/data/http.html +++ b/tests/checker/data/http.html @@ -21,3 +21,6 @@ + + +UnicodeError diff --git a/tests/checker/data/http.html.result b/tests/checker/data/http.html.result index da4a91f7..29ff3001 100644 --- a/tests/checker/data/http.html.result +++ b/tests/checker/data/http.html.result @@ -1,7 +1,7 @@ url http://localhost:%(port)d/%(datadir)s/http.html cache key http://localhost:%(port)d/%(datadir)s/http.html real url http://localhost:%(port)d/%(datadir)s/http.html -info 13 URLs parsed. +info 14 URLs parsed. valid url dns://www.example.org @@ -76,3 +76,9 @@ info Redirected to `http://www.iana.org/domains/example'. warning Anchor `a%%3D1%%2C2%%2C3' not found. Available anchors: -. valid +url http://.example.org/ +cache key http://.example.org/ +real url http://.example.org/ +name UnicodeError +warning Access denied by robots.txt, skipping content checks. +error