ignore decompression errors

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1062 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2003-10-21 10:11:46 +00:00
parent 61324a5e68
commit 0de34a7675
3 changed files with 21 additions and 14 deletions

View file

@ -4,6 +4,9 @@
Changed: linkcheck/{File,Http,Ftp,}UrlData.py, linkcheck/linkparser.py
* try to use psyco for the commandline linkchecker script
Changed: linkchecker
* when decompression of compressed HTML pages fails, assume the page
is not compressed
Changed: linkcheck/{robotparser2,HttpUrlData}.py
1.9.3 (released 16.10.2003)
* re-added an updated robot parser which uses urllib2 and can decode

View file

@ -16,8 +16,9 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import urlparse, sys, time, re, httplib, robotparser2
import urlparse, sys, time, re, httplib, zlib, gzip, robotparser2
from urllib import quote, unquote
from cStringIO import StringIO
import Config, i18n
from debug import *
# XXX not dynamic
@ -343,13 +344,13 @@ class HttpUrlData (ProxyUrlData):
self.data = response.read()
encoding = self.headers.get("Content-Encoding")
if encoding in _supported_encodings:
from cStringIO import StringIO
if encoding == 'deflate':
import zlib
f = StringIO(zlib.decompress(self.data))
else:
import gzip
f = gzip.GzipFile('', 'rb', 9, StringIO(self.data))
try:
if encoding == 'deflate':
f = StringIO(zlib.decompress(self.data))
else:
f = gzip.GzipFile('', 'rb', 9, StringIO(self.data))
except zlib.error:
f = StringIO(self.data)
self.data = f.read()
self.downloadtime = time.time() - t
return self.data

View file

@ -284,14 +284,17 @@ def decode (page):
encoding = page.info().get("Content-Encoding")
if encoding in ('gzip', 'x-gzip', 'deflate'):
from cStringIO import StringIO
import zlib, gzip
# cannot seek in socket descriptors, so must get content now
content = page.read()
if encoding == 'deflate':
import zlib
fp = StringIO(zlib.decompress(content))
else:
import gzip
fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
try:
if encoding == 'deflate':
fp = StringIO(zlib.decompress(content))
else:
fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
except zlib.error, msg:
# XXX should emit a warning here; fall back to treating the content as uncompressed
fp = StringIO(content)
# remove content-encoding header
headers = {}
ceheader = re.compile(r"(?i)content-encoding:")