Use HTTPMessage() in all urllib handlers, properly fixing the bug noted in http://www.python.org/sf/1117588. The previous workaround has been removed.

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2603 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-05-18 17:53:39 +00:00
parent cb54822dac
commit 63b76ec642

View file

@ -54,48 +54,6 @@ debug = lambda txt: _msg("debug:", txt)
# PEP 8: a lambda bound to a name should be a def -- it gains a real
# __name__ for tracebacks and a place for a docstring.
def warn (txt):
    """Log txt as a warning message through the module _msg helper."""
    return _msg("warning:", txt)

def error (txt):
    """Log txt as an error message through the module _msg helper."""
    return _msg("error:", txt)
class MyHTTPRedirectHandler (urllib2.HTTPRedirectHandler):
    """
    Work around for bug http://www.python.org/sf/1117588

    Same redirect logic as urllib2.HTTPRedirectHandler, but reimplemented
    here so the headers object handling can be controlled by this module.
    """

    def http_error_302 (self, req, fp, code, msg, headers):
        """Handle a redirect response, following the Location/URI header.

        @param req: the original request object
        @param fp: file-like object with the response body
        @param code: HTTP status code (301/302/303/307)
        @param msg: HTTP status message
        @param headers: response headers
        @return: the response of opening the redirected request, or None
            if no redirect target was given or the redirect was refused
        @raise urllib2.HTTPError: if a redirect loop or too many
            redirections are detected
        """
        # Prefer the standard Location header; some servers send URI.
        if 'location' in headers:
            newurl = headers.get('location')
        elif 'uri' in headers:
            newurl = headers.get('uri')
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)
        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return
        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                # Bugfix: HTTPError is not imported into this module's
                # namespace; it must be referenced through urllib2.
                raise urllib2.HTTPError(req.get_full_url(), code,
                                        self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1
        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()
        return self.parent.open(new)

    # All redirect status codes share the same handling.
    http_error_301 = http_error_303 = http_error_307 = http_error_302
class PasswordManager (object):
"""
@ -205,7 +163,7 @@ class RobotFileParser (object):
urllib2.HTTPDigestAuthHandler(pwd_manager),
urllib2.ProxyDigestAuthHandler(pwd_manager),
urllib2.HTTPDefaultErrorHandler,
MyHTTPRedirectHandler,
urllib2.HTTPRedirectHandler,
]
if hasattr(linkcheck.httplib2, 'HTTPS'):
handlers.append(HttpsWithGzipHandler)
@ -514,7 +472,7 @@ def decode (page):
"%s, assuming non-compressed content" % str(msg))
fp = StringIO.StringIO(content)
# remove content-encoding header
headers = {}
headers = httplib.HTTPMessage(StringIO.StringIO(""))
ceheader = re.compile(r"(?i)content-encoding:")
for h in page.info().keys():
if not ceheader.match(h):