Fix gzip handling in the HTTP content decoder.

This commit is contained in:
Bastian Kleineidam 2012-09-30 14:00:49 +02:00
parent 169bdecb69
commit 27b61c3bfa
2 changed files with 13 additions and 6 deletions

View file

@ -32,10 +32,10 @@ import base64
## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
## SOFTWARE.
def decode (page):
"""Gunzip or deflate a compressed page."""
log.debug(LOG_CHECK,
"robots.txt page info %d %s", page.code, str(page.info()))
log.debug(LOG_CHECK, "page info %d %s", page.code, str(page.info()))
encoding = page.info().get("Content-Encoding")
if encoding in ('gzip', 'x-gzip', 'deflate'):
# cannot seek in socket descriptors, so must get content now
@ -58,6 +58,7 @@ def decode (page):
newpage = urllib.addinfourl(fp, headers, page.geturl())
newpage.code = page.code
newpage.msg = page.msg
return newpage
return page

View file

@ -79,11 +79,12 @@ class RobotFileParser (object):
def read (self):
"""Read the robots.txt URL and feeds it to the parser."""
self._reset()
data = None
headers = {
'User-Agent': configuration.UserAgent,
'Accept-Encoding': ACCEPT_ENCODING,
}
req = urllib2.Request(self.url, None, headers)
req = urllib2.Request(self.url, data, headers)
try:
self._read_content(req)
except urllib2.HTTPError, x:
@ -125,7 +126,12 @@ class RobotFileParser (object):
@raise: httplib.HTTPException, IOError on HTTP errors
@raise: ValueError on bad digest auth (a bug)
"""
f = urlutil.get_opener(self.user, self.password, self.proxy)
if log.is_debug(LOG_CHECK):
debuglevel = 1
else:
debuglevel = 0
f = urlutil.get_opener(user=self.user, password=self.password,
proxy=self.proxy, debuglevel=debuglevel)
res = None
try:
res = f.open(req)
@ -133,7 +139,7 @@ class RobotFileParser (object):
if ct and ct.lower().startswith("text/plain"):
self.parse([line.strip() for line in res])
else:
log.debug(LOG_CHECK, "%r allow all (not text content)", self.url)
log.debug(LOG_CHECK, "%r allow all (no text content)", self.url)
self.allow_all = True
finally:
if res is not None:
@ -157,7 +163,7 @@ class RobotFileParser (object):
@return: None
"""
log.debug(LOG_CHECK, "%r parse lines", self.url)
log.debug(LOG_CHECK, "%r parse %d lines", self.url, len(lines))
state = 0
linenumber = 0
entry = Entry()