Do not catch UnicodeError; handle it internally

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3269 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-28 07:43:59 +00:00 · 2006-05-19 17:13:16 +00:00 · 2006-05-19 17:13:16 +00:00 · 7e1e01bd36
commit 7e1e01bd36
parent 98d5387ac0
6 changed files with 28 additions and 7 deletions
--- a/6
+++ b/6
@ -65,6 +65,12 @@
    Removed: linkcheck/checker/{ignored,error}url.py
    Added: linkcheck/checker/unknownurl.py

+  * Convert the "label too long" domain name parse error into
+    a more friendly error message.
+    Type: bugfix
+    Changed: linkcheck/checker/{__init__,urlbase,httpurl,fileurl}.py,
+      linkchecker
+
 3.4 "The Chumscrubbers" (released 4.2.2006)

  * Ignore decoding errors when retrieving the robots.txt URL.
--- a/linkcheck/checker/init.py
+++ b/linkcheck/checker/init.py
@ -34,8 +34,6 @@ import linkcheck.dns.exception
 # Catch these exception on syntax checks.
 ExcSyntaxList = [
    linkcheck.LinkCheckerError,
-    # .encode('idna') raises this
-    UnicodeError,
 ]

 # Catch these exceptions on content and connect checks. All other
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@ -107,7 +107,7 @@ class FileUrl (urlbase.UrlBase):
        base_url = re.sub("^file://(/?)([a-zA-Z]):", r"file:///\2|", base_url)
        # norm base url again after changing
        if self.base_url != base_url:
-            base_url, is_idn = linkcheck.url.url_norm(base_url)
+            base_url, is_idn = linkcheck.checker.urlbase.url_norm(base_url)
            self.base_url = unicode(base_url)

    def build_url (self):
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@ -310,7 +310,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
                "Redirected to %r", newurl)
            self.add_info(_("Redirected to %(url)s.") % {'url': newurl},
                          tag="http-redirect")
-            redirected, is_idn = linkcheck.url.url_norm(newurl)
+            # norm redirect url; UnicodeError is converted to LinkCheckerError
+            redirected, is_idn = linkcheck.checker.urlbase.url_norm(newurl)
            assert None == linkcheck.log.debug(linkcheck.LOG_CHECK,
                "Norm redirected to %r", redirected)
            urlparts = linkcheck.strformat.url_unicode_split(redirected)
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@ -53,6 +53,17 @@ def urljoin (parent, url, scheme):
    return urlparse.urljoin(parent, url)


+def url_norm (url):
+    """
+    Wrapper for url.url_norm() to convert UnicodeError into LinkCheckerError.
+    """
+    try:
+        return linkcheck.url.url_norm(url)
+    except UnicodeError:
+        msg = _("URL has unparsable domain name: %s") % sys.exc_info()[1]
+        raise linkcheck.LinkCheckerError(msg)
+
+
 class UrlBase (object):
    """
    An URL with additional information like validity etc.
@ -294,8 +305,8 @@ class UrlBase (object):
        Construct self.url and self.urlparts out of the given base
        url information self.base_url, self.parent_url and self.base_ref.
        """
-        # norm base url
-        base_url, is_idn = linkcheck.url.url_norm(self.base_url)
+        # norm base url; UnicodeError is converted to LinkCheckerError
+        base_url, is_idn = url_norm(self.base_url)
        if is_idn:
            self.add_warning(_("""URL %r has a unicode domain name which
                          is not yet widely supported. You should use
--- a/7
+++ b/7
@ -715,7 +715,12 @@ for url in args:
        # syntactic sugar
        url = "ftp://%s" % url
    url_data = get_url_from(url, 0, aggregate, assume_local=True)
-    linkcheck.add_intern_pattern(url_data, config)
+    try:
+        linkcheck.add_intern_pattern(url_data, config)
+    except UnicodeError:
+        linkcheck.log.error(linkcheck.LOG_CMDLINE,
+            _("URL has unparsable domain name: %s"), sys.exc_info()[1])
+        sys.exit(1)
    aggregate.urlqueue.put(url_data)
 # set up profiling/psyco
 if do_profile and not has_profile: