ignore decompression errors

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1062 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-04-23 15:44:44 +00:00 · 2003-10-21 10:11:46 +00:00 · 2003-10-21 10:11:46 +00:00 · 0de34a7675
commit 0de34a7675
parent 61324a5e68
3 changed files with 21 additions and 14 deletions
--- a/3
+++ b/3
@@ -4,6 +4,9 @@
    Changed: linkcheck/{File,Http,Ftp,}UrlData.py, linkcheck/linkparser.py
  * try to use psyco for the commandline linkchecker script
    Changed: linkchecker
+  * when decompression of compressed HTML pages fails, assume the page
+    is not compressed
+    Changed: linkcheck/{robotparser2,HttpUrlData}.py

 1.9.3 (released 16.10.2003)
  * re-added an updated robot parser which uses urllib2 and can decode
--- a/linkcheck/HttpUrlData.py
+++ b/linkcheck/HttpUrlData.py
@@ -16,8 +16,9 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

-import urlparse, sys, time, re, httplib, robotparser2
+import urlparse, sys, time, re, httplib, zlib, gzip, robotparser2
 from urllib import quote, unquote
+from cStringIO import StringIO
 import Config, i18n
 from debug import *
 # XXX not dynamic
@@ -343,13 +344,13 @@ class HttpUrlData (ProxyUrlData):
            self.data = response.read()
            encoding = self.headers.get("Content-Encoding")
            if encoding in _supported_encodings:
-                from cStringIO import StringIO
-                if encoding == 'deflate':
-                    import zlib
-                    f = StringIO(zlib.decompress(self.data))
-                else:
-                    import gzip
-                    f = gzip.GzipFile('', 'rb', 9, StringIO(self.data))
+                try:
+                    if encoding == 'deflate':
+                        f = StringIO(zlib.decompress(self.data))
+                    else:
+                        f = gzip.GzipFile('', 'rb', 9, StringIO(self.data))
+                except zlib.error:
+                    f = StringIO(self.data)
                self.data = f.read()
            self.downloadtime = time.time() - t
        return self.data
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -284,14 +284,17 @@ def decode (page):
    encoding = page.info().get("Content-Encoding") 
    if encoding in ('gzip', 'x-gzip', 'deflate'):
        from cStringIO import StringIO
+        import zlib, gzip
        # cannot seek in socket descriptors, so must get content now
        content = page.read()
-        if encoding == 'deflate':
-            import zlib
-            fp = StringIO(zlib.decompress(content))
-        else:
-            import gzip
-            fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
+        try:
+            if encoding == 'deflate':
+                fp = StringIO(zlib.decompress(content))
+            else:
+                fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
+        except zlib.error, msg:
+            # XXX warning
+            fp = StringIO(content)
        # remove content-encoding header
        headers = {}
        ceheader = re.compile(r"(?i)content-encoding:")