mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-30 19:14:43 +00:00
Fix gzip handling in HTTP content decoder.
This commit is contained in:
parent
169bdecb69
commit
27b61c3bfa
2 changed files with 13 additions and 6 deletions
|
|
@ -32,10 +32,10 @@ import base64
|
|||
## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
|
||||
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
|
||||
## SOFTWARE.
|
||||
|
||||
def decode (page):
|
||||
"""Gunzip or deflate a compressed page."""
|
||||
log.debug(LOG_CHECK,
|
||||
"robots.txt page info %d %s", page.code, str(page.info()))
|
||||
log.debug(LOG_CHECK, "page info %d %s", page.code, str(page.info()))
|
||||
encoding = page.info().get("Content-Encoding")
|
||||
if encoding in ('gzip', 'x-gzip', 'deflate'):
|
||||
# cannot seek in socket descriptors, so must get content now
|
||||
|
|
@ -58,6 +58,7 @@ def decode (page):
|
|||
newpage = urllib.addinfourl(fp, headers, page.geturl())
|
||||
newpage.code = page.code
|
||||
newpage.msg = page.msg
|
||||
return newpage
|
||||
return page
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -79,11 +79,12 @@ class RobotFileParser (object):
|
|||
def read (self):
|
||||
"""Read the robots.txt URL and feeds it to the parser."""
|
||||
self._reset()
|
||||
data = None
|
||||
headers = {
|
||||
'User-Agent': configuration.UserAgent,
|
||||
'Accept-Encoding': ACCEPT_ENCODING,
|
||||
}
|
||||
req = urllib2.Request(self.url, None, headers)
|
||||
req = urllib2.Request(self.url, data, headers)
|
||||
try:
|
||||
self._read_content(req)
|
||||
except urllib2.HTTPError, x:
|
||||
|
|
@ -125,7 +126,12 @@ class RobotFileParser (object):
|
|||
@raise: httplib.HTTPException, IOError on HTTP errors
|
||||
@raise: ValueError on bad digest auth (a bug)
|
||||
"""
|
||||
f = urlutil.get_opener(self.user, self.password, self.proxy)
|
||||
if log.is_debug(LOG_CHECK):
|
||||
debuglevel = 1
|
||||
else:
|
||||
debuglevel = 0
|
||||
f = urlutil.get_opener(user=self.user, password=self.password,
|
||||
proxy=self.proxy, debuglevel=debuglevel)
|
||||
res = None
|
||||
try:
|
||||
res = f.open(req)
|
||||
|
|
@ -133,7 +139,7 @@ class RobotFileParser (object):
|
|||
if ct and ct.lower().startswith("text/plain"):
|
||||
self.parse([line.strip() for line in res])
|
||||
else:
|
||||
log.debug(LOG_CHECK, "%r allow all (not text content)", self.url)
|
||||
log.debug(LOG_CHECK, "%r allow all (no text content)", self.url)
|
||||
self.allow_all = True
|
||||
finally:
|
||||
if res is not None:
|
||||
|
|
@ -157,7 +163,7 @@ class RobotFileParser (object):
|
|||
|
||||
@return: None
|
||||
"""
|
||||
log.debug(LOG_CHECK, "%r parse lines", self.url)
|
||||
log.debug(LOG_CHECK, "%r parse %d lines", self.url, len(lines))
|
||||
state = 0
|
||||
linenumber = 0
|
||||
entry = Entry()
|
||||
|
|
|
|||
Loading…
Reference in a new issue