mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-23 15:44:44 +00:00
ignore decompression errors
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1062 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
61324a5e68
commit
0de34a7675
3 changed files with 21 additions and 14 deletions
|
|
@ -4,6 +4,9 @@
|
|||
Changed: linkcheck/{File,Http,Ftp,}UrlData.py, linkcheck/linkparser.py
|
||||
* try to use psyco for the commandline linkchecker script
|
||||
Changed: linkchecker
|
||||
* when decompression of compressed HTML pages fails, assume the page
|
||||
is not compressed
|
||||
Changed: linkcheck/{robotparser2,HttpUrlData}.py
|
||||
|
||||
1.9.3 (released 16.10.2003)
|
||||
* re-added an updated robot parser which uses urllib2 and can decode
|
||||
|
|
|
|||
|
|
@ -16,8 +16,9 @@
|
|||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
import urlparse, sys, time, re, httplib, robotparser2
|
||||
import urlparse, sys, time, re, httplib, zlib, gzip, robotparser2
|
||||
from urllib import quote, unquote
|
||||
from cStringIO import StringIO
|
||||
import Config, i18n
|
||||
from debug import *
|
||||
# XXX not dynamic
|
||||
|
|
@ -343,13 +344,13 @@ class HttpUrlData (ProxyUrlData):
|
|||
self.data = response.read()
|
||||
encoding = self.headers.get("Content-Encoding")
|
||||
if encoding in _supported_encodings:
|
||||
from cStringIO import StringIO
|
||||
if encoding == 'deflate':
|
||||
import zlib
|
||||
f = StringIO(zlib.decompress(self.data))
|
||||
else:
|
||||
import gzip
|
||||
f = gzip.GzipFile('', 'rb', 9, StringIO(self.data))
|
||||
try:
|
||||
if encoding == 'deflate':
|
||||
f = StringIO(zlib.decompress(self.data))
|
||||
else:
|
||||
f = gzip.GzipFile('', 'rb', 9, StringIO(self.data))
|
||||
except zlib.error:
|
||||
f = StringIO(self.data)
|
||||
self.data = f.read()
|
||||
self.downloadtime = time.time() - t
|
||||
return self.data
|
||||
|
|
|
|||
|
|
@ -284,14 +284,17 @@ def decode (page):
|
|||
encoding = page.info().get("Content-Encoding")
|
||||
if encoding in ('gzip', 'x-gzip', 'deflate'):
|
||||
from cStringIO import StringIO
|
||||
import zlib, gzip
|
||||
# cannot seek in socket descriptors, so must get content now
|
||||
content = page.read()
|
||||
if encoding == 'deflate':
|
||||
import zlib
|
||||
fp = StringIO(zlib.decompress(content))
|
||||
else:
|
||||
import gzip
|
||||
fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
|
||||
try:
|
||||
if encoding == 'deflate':
|
||||
fp = StringIO(zlib.decompress(content))
|
||||
else:
|
||||
fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
|
||||
except zlib.error, msg:
|
||||
# XXX warning
|
||||
fp = StringIO(content)
|
||||
# remove content-encoding header
|
||||
headers = {}
|
||||
ceheader = re.compile(r"(?i)content-encoding:")
|
||||
|
|
|
|||
Loading…
Reference in a new issue