From 4c15fc6a8b061fb6f021e2df15e8b5ccf5635be3 Mon Sep 17 00:00:00 2001
From: Bastian Kleineidam
Date: Sat, 14 Jan 2012 11:01:09 +0100
Subject: [PATCH] Properly handle non-ASCII HTTP header values.

---
 doc/changelog.txt                |  4 ++++
 linkcheck/checker/httpheaders.py | 16 ++++++++++++++++
 linkcheck/checker/httpurl.py     | 12 ++++++------
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/doc/changelog.txt b/doc/changelog.txt
index 8b9ad735..4eb71d07 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -1,5 +1,9 @@
 7.5 "" (released xx.xx.2012)
 
+Fixes:
+- checking: Properly handle non-ascii HTTP header values.
+  Closes: SF bug #3473359
+
 Changes:
 - checking: Add steam:// URIs to the list of ignored URIs.
   Closes: SF bug #3471570
diff --git a/linkcheck/checker/httpheaders.py b/linkcheck/checker/httpheaders.py
index b6f2b260..1490a83a 100644
--- a/linkcheck/checker/httpheaders.py
+++ b/linkcheck/checker/httpheaders.py
@@ -17,6 +17,7 @@
 """
 Helper functions dealing with HTTP headers.
 """
+from ..containers import CaselessDict
 
 DEFAULT_TIMEOUT_SECS = 300
 
@@ -99,3 +100,18 @@ def get_content_encoding (headers):
     @rtype: string
     """
     return headers.get("Content-Encoding", "").strip()
+
+
+def decode_headers (headers):
+    """Decode ISO-8859-1 headers to unicode. Since a dictionary is
+    returned, multiple header entries are not preserved.
+
+    @return: decoded keys and values
+    @rtype: CaselessDict(unicode -> unicode)
+    """
+    headers_encoded = CaselessDict()
+    for key, value in headers.items():
+        key = key.decode("iso-8859-1", "replace")
+        value = value.decode("iso-8859-1", "replace")
+        headers_encoded[key] = value
+    return headers_encoded
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index daeb0455..74da3087 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -201,7 +201,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # proxy enforcement (overrides standard proxy)
         if response.status == 305 and self.headers:
             oldproxy = (self.proxy, self.proxyauth)
-            newproxy = self.headers.getheader("Location")
+            newproxy = self.headers.get("Location")
             self.add_info(_("Enforced proxy `%(name)s'.") %
                          {"name": newproxy})
             self.set_proxy(newproxy)
@@ -307,8 +307,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
 
     def follow_redirection (self, response, set_result, redirected):
         """Follow one redirection of http response."""
-        newurl = self.headers.getheader("Location",
-                 self.headers.getheader("Uri", ""))
+        newurl = self.headers.get("Location",
+                 self.headers.get("Uri", ""))
         # make new url absolute and unicode
         newurl = urlparse.urljoin(redirected, unicode_safe(newurl))
         log.debug(LOG_CHECK, "Redirected to %r", newurl)
@@ -551,7 +551,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         self.url_connection.endheaders()
         response = self.url_connection.getresponse(True)
         self.timeout = headers.http_timeout(response)
-        self.headers = response.msg
+        self.headers = headers.decode_headers(response.msg)
         self.content_type = None
         self.persistent = not response.will_close
         if self.persistent and self.method == "HEAD":
@@ -564,7 +564,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # Note that for POST method the connection should also be closed,
         # but this method is never used.
         if self.persistent and (self.method == "GET" or
-          self.headers.getheader("Content-Length") != "0"):
+          self.headers.get("Content-Length") != "0"):
             # always read content from persistent connections
             self._read_content(response)
             assert not response.will_close
@@ -646,7 +646,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
             self.method = "GET"
             response = self._try_http_response()
             response = self.follow_redirections(response, set_result=False)[1]
-            self.headers = response.msg
+            self.headers = headers.decode_headers(response.msg)
             self.content_type = None
         # Re-read size info, since the GET request result could be different
         # than a former HEAD request.