diff --git a/doc/changelog.txt b/doc/changelog.txt index 3d9bb294..9d01495a 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -4,7 +4,8 @@ Fixes: - checking: Recheck extern status on HTTP redirects even if domain did not change. Patch by Charles Jones. Closes: SF bug #3495407 - +- checking: Fix non-ascii HTTP header handling. + Closes: SF bug #3495621 7.5 "Kukushka" (released 13.02.2012) diff --git a/linkcheck/checker/httpheaders.py b/linkcheck/checker/httpheaders.py index 8eb55d4b..62b38034 100644 --- a/linkcheck/checker/httpheaders.py +++ b/linkcheck/checker/httpheaders.py @@ -17,7 +17,6 @@ """ Helper functions dealing with HTTP headers. """ -from ..containers import CaselessDict DEFAULT_TIMEOUT_SECS = 300 @@ -100,18 +99,3 @@ def get_content_encoding (headers): @rtype: string """ return headers.get("Content-Encoding", "").strip() - - -def decode_headers (headers): - """Decode ISO-8859-1 headers to unicode. Since a dictionary is - returned, multiple header entries are not preserved. - - @return: decoded keys and values - @rtype: CaselessDict(unicode -> unicode) - """ - headers_encoded = CaselessDict() - for key, value in headers.items(): - key = key.decode("iso-8859-1", "replace") - value = value.decode("iso-8859-1", "replace") - headers_encoded[key] = value - return headers_encoded diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index d84fea48..56749582 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -102,7 +102,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # Note that content-encoding causes size differences since # the content data is always decoded. 
try: - self.size = int(self.headers["Content-Length"]) + self.size = int(self.getheader("Content-Length")) if self.dlsize == -1: self.dlsize = self.size except (ValueError, OverflowError): @@ -147,7 +147,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # check the http connection response = self.check_http_connection() if self.headers and "Server" in self.headers: - server = self.headers['Server'] + server = self.getheader('Server') else: server = _("unknown") if self.fallback_get: @@ -201,7 +201,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # proxy enforcement (overrides standard proxy) if response.status == 305 and self.headers: oldproxy = (self.proxy, self.proxyauth) - newproxy = self.headers.get("Location") + newproxy = self.getheader("Location") self.add_info(_("Enforced proxy `%(name)s'.") % {"name": newproxy}) self.set_proxy(newproxy) @@ -243,7 +243,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): return response # user authentication if response.status == 401: - authenticate = self.headers.get('WWW-Authenticate') + authenticate = self.getheader('WWW-Authenticate') if not authenticate or not authenticate.startswith("Basic"): # LinkChecker only supports Basic authorization args = {"auth": authenticate} @@ -267,15 +267,15 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # The squid proxy reports valid 200 instead of 404 due to # garbage sent from the server at the start of the GET # request. 
See http://www.aldec.com/Products - if u'squid' not in self.headers.get('Via', '').lower(): + if u'squid' not in self.getheader('Via', u'').lower(): self.method = "GET" self.aliases = [] continue elif self.headers and self.method == "HEAD" and self.method_get_allowed: # test for HEAD support mime = self.get_content_type() - poweredby = self.headers.get('X-Powered-By', '') - server = self.headers.get('Server', '') + poweredby = self.getheader('X-Powered-By', u'') + server = self.getheader('Server', u'') if ((mime in ('application/octet-stream', 'text/plain') and (poweredby.startswith('Zope') or server.startswith('Zope'))) or ('ASP.NET' in poweredby and 'Microsoft-IIS' in server)): @@ -313,8 +313,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): def follow_redirection (self, response, set_result, redirected): """Follow one redirection of http response.""" - newurl = self.headers.get("Location", - self.headers.get("Uri", "")) + newurl = self.getheader("Location", + self.getheader("Uri", u"")) # make new url absolute and unicode newurl = urlparse.urljoin(redirected, unicode_safe(newurl)) log.debug(LOG_CHECK, "Redirected to %r", newurl) @@ -444,6 +444,17 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): tag=WARN_HTTP_MOVED_PERMANENT) self.has301status = True + def getheader (self, name, default=None): + """Get decoded header value. 
+ 
+ @return: decoded header value or default if not found + @rtype: unicode or type of default + """ + value = self.headers.get(name) + if value is None: + return default + return value.decode("iso-8859-1", "replace") + def get_alias_cache_data (self): """ Return all data values that should be put in the cache, @@ -481,7 +492,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.set_result(u"%r %s" % (response.status, response.reason)) else: self.set_result(u"OK") - modified = self.headers.get('Last-Modified', '') + modified = self.getheader('Last-Modified', u'') if modified: self.add_info(_("Last modified %(date)s.") % {"date": modified}) @@ -555,7 +566,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.url_connection.endheaders() response = self.url_connection.getresponse(True) self.timeout = headers.http_timeout(response) - self.headers = headers.decode_headers(response.msg) + self.headers = response.msg self.content_type = None self.persistent = not response.will_close if self.persistent and self.method == "HEAD": @@ -568,7 +579,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): # Note that for POST method the connection should also be closed, # but this method is never used. if self.persistent and (self.method == "GET" or - self.headers.get("Content-Length") != "0"): + self.getheader("Content-Length") != u"0"): # always read content from persistent connections self._read_content(response) assert not response.will_close @@ -650,7 +661,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.method = "GET" response = self._try_http_response() response = self.follow_redirections(response, set_result=False)[1] - self.headers = headers.decode_headers(response.msg) + self.headers = response.msg self.content_type = None # Re-read size info, since the GET request result could be different # than a former HEAD request.