diff --git a/ChangeLog b/ChangeLog index 85a6bff1..c16fae16 100644 --- a/ChangeLog +++ b/ChangeLog @@ -23,6 +23,11 @@ Type: bugfix Changed: linkcheck/checker/mailtourl.py + * Workaround for buggy servers that break protocol synchronization of + persistent HTTP connections. + Changed: linkcheck/checker/httpurl.py + Closes: SF bug #1913992 + 4.8 "Hallam Foe" (released 16.12.2007) * Fix message typo for not disclosing information. diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index a5f1c1a1..9e92ba21 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -502,9 +502,16 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.headers = response.msg self.persistent = not response.will_close if self.persistent and (self.method == "GET" or - self.headers.getheader("Content-Length", "")): + self.headers.getheader("Content-Length") != "0"): # always read content from persistent connections self._read_content(response) + if self.persistent and self.method == "HEAD": + # Some servers send page content after a HEAD request, + # but only after making the *next* request. This breaks + # protocol synchronisation. Workaround here is to close + # the connection after HEAD. + # Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk) + self.persistent = False # If possible, use official W3C HTTP response name if response.status in httpresponses: response.reason = httpresponses[response.status]