Fix gzip handling in http content decoder.

2026-04-30 19:14:43 +00:00 · 2012-09-30 14:00:49 +02:00 · 2012-09-30 14:00:49 +02:00 · 27b61c3bfa
commit 27b61c3bfa
parent 169bdecb69
2 changed files with 13 additions and 6 deletions
--- a/linkcheck/httputil.py
+++ b/linkcheck/httputil.py
@@ -32,10 +32,10 @@ import base64
 ##  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 ##  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 ##  SOFTWARE.
+
 def decode (page):
    """Gunzip or deflate a compressed page."""
-    log.debug(LOG_CHECK,
-      "robots.txt page info %d %s", page.code, str(page.info()))
+    log.debug(LOG_CHECK, "page info %d %s", page.code, str(page.info()))
    encoding = page.info().get("Content-Encoding")
    if encoding in ('gzip', 'x-gzip', 'deflate'):
        # cannot seek in socket descriptors, so must get content now
@@ -58,6 +58,7 @@ def decode (page):
        newpage = urllib.addinfourl(fp, headers, page.geturl())
        newpage.code = page.code
        newpage.msg = page.msg
+        return newpage
    return page


--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -79,11 +79,12 @@ class RobotFileParser (object):
    def read (self):
        """Read the robots.txt URL and feeds it to the parser."""
        self._reset()
+        data = None
        headers = {
            'User-Agent': configuration.UserAgent,
            'Accept-Encoding': ACCEPT_ENCODING,
        }
-        req = urllib2.Request(self.url, None, headers)
+        req = urllib2.Request(self.url, data, headers)
        try:
            self._read_content(req)
        except urllib2.HTTPError, x:
@@ -125,7 +126,12 @@ class RobotFileParser (object):
        @raise: httplib.HTTPException, IOError on HTTP errors
        @raise: ValueError on bad digest auth (a bug)
        """
-        f = urlutil.get_opener(self.user, self.password, self.proxy)
+        if log.is_debug(LOG_CHECK):
+            debuglevel = 1
+        else:
+            debuglevel = 0
+        f = urlutil.get_opener(user=self.user, password=self.password,
+            proxy=self.proxy, debuglevel=debuglevel)
        res = None
        try:
            res = f.open(req)
@@ -133,7 +139,7 @@ class RobotFileParser (object):
            if ct and ct.lower().startswith("text/plain"):
                self.parse([line.strip() for line in res])
            else:
-                log.debug(LOG_CHECK, "%r allow all (not text content)", self.url)
+                log.debug(LOG_CHECK, "%r allow all (no text content)", self.url)
                self.allow_all = True
        finally:
            if res is not None:
@@ -157,7 +163,7 @@ class RobotFileParser (object):

        @return: None
        """
-        log.debug(LOG_CHECK, "%r parse lines", self.url)
+        log.debug(LOG_CHECK, "%r parse %d lines", self.url, len(lines))
        state = 0
        linenumber = 0
        entry = Entry()