Move http util functions into a separate module.

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3747 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2008-05-08 10:36:08 +00:00
parent d5ec7c1ac3
commit bc9b9ee07e
2 changed files with 125 additions and 84 deletions

121
linkcheck/httputil.py Normal file
View file

@ -0,0 +1,121 @@
# -*- coding: iso-8859-1 -*-
# Various HTTP utils with a free license
from cStringIO import StringIO
from . import gzip2 as gzip
from . import httplib2 as httplib
from . import log, LOG_CHECK
import re
import mimetypes
import zlib
import urllib
import urllib2
###########################################################################
# urlutils.py - Simplified urllib handling
#
# Written by Chris Lawrence <lawrencc@debian.org>
# (C) 1999-2002 Chris Lawrence
#
# This program is freely distributable per the following license:
#
## Permission to use, copy, modify, and distribute this software and its
## documentation for any purpose and without fee is hereby granted,
## provided that the above copyright notice appears in all copies and that
## both that copyright notice and this permission notice appear in
## supporting documentation.
##
## I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL I
## BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
## DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
## SOFTWARE.
def decode (page):
    """Gunzip or deflate a compressed page.

    page is a urllib2/urllib response object (file-like, with info(),
    geturl() and usually code/msg attributes).  If the response carries a
    Content-Encoding of gzip, x-gzip or deflate, return a new page object
    whose read() yields the decompressed body and whose headers no longer
    contain Content-Encoding; otherwise return page unchanged.
    """
    log.debug(LOG_CHECK,
        "robots.txt page info %d %s", page.code, str(page.info()))
    encoding = page.info().get("Content-Encoding")
    if encoding in ('gzip', 'x-gzip', 'deflate'):
        # cannot seek in socket descriptors, so must get content now
        content = page.read()
        try:
            if encoding == 'deflate':
                fp = StringIO(zlib.decompress(content))
            else:
                # NOTE(review): GzipFile decompresses lazily on read(), so a
                # corrupt gzip body may raise later, outside this try — confirm.
                fp = gzip.GzipFile('', 'rb', 9, StringIO(content))
        except zlib.error, msg:
            # Some servers mislabel uncompressed data; fall back to the
            # raw bytes instead of failing.
            log.debug(LOG_CHECK, "uncompressing had error "
                "%s, assuming non-compressed content", str(msg))
            fp = StringIO(content)
        # remove content-encoding header: the body handed to the caller is
        # already decompressed, so the header would be a lie
        headers = httplib.HTTPMessage(StringIO(""))
        ceheader = re.compile(r"(?i)content-encoding:")
        for h in page.info().keys():
            if not ceheader.match(h):
                headers[h] = page.info()[h]
        # rewrap the decompressed stream with the filtered headers and the
        # original URL
        newpage = urllib.addinfourl(fp, headers, page.geturl())
        if hasattr(page, "code"):
            # python 2.4 compatibility
            newpage.code = page.code
        if hasattr(page, "msg"):
            # python 2.4 compatibility
            newpage.msg = page.msg
        page = newpage
    return page
class HttpWithGzipHandler (urllib2.HTTPHandler):
    """urllib2 HTTP handler that transparently decompresses
    gzip/deflate-encoded responses via decode()."""
    def http_open (self, req):
        """Send request and decode answer."""
        return decode(urllib2.HTTPHandler.http_open(self, req))
# NOTE(review): httplib here is the bundled httplib2 module; presumably it
# only defines HTTPS when Python has SSL support — confirm before relying
# on HttpsWithGzipHandler existing.
if hasattr(httplib, 'HTTPS'):
    class HttpsWithGzipHandler (urllib2.HTTPSHandler):
        """urllib2 HTTPS handler that transparently decompresses
        gzip/deflate-encoded responses via decode()."""
        def http_open (self, req):
            """Send request and decode answer."""
            return decode(urllib2.HTTPSHandler.http_open(self, req))
# end of urlutils.py routines
###########################################################################
def encode_multipart_formdata(fields, files=None):
    """
    From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306

    fields is a sequence of (name, value) elements for regular form fields.
    files is a sequence of (name, filename, value) elements for data to be
    uploaded as files.
    Return (content_type, body) ready for httplib.HTTP instance
    """
    BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
    CRLF = '\r\n'
    parts = []
    # Plain form fields: one part each, no filename or content type.
    for (name, value) in fields:
        parts.extend((
            '--' + BOUNDARY,
            'Content-Disposition: form-data; name="%s"' % name,
            '',
            value,
        ))
    # File uploads: also carry a filename and a guessed content type.
    for (name, filename, value) in (files if files is not None else ()):
        parts.extend((
            '--' + BOUNDARY,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (name, filename),
            'Content-Type: %s' % get_content_type(filename),
            '',
            value,
        ))
    # Closing boundary, plus a trailing empty element so the body ends in CRLF.
    parts.extend(('--' + BOUNDARY + '--', ''))
    content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
    return content_type, CRLF.join(parts)
def get_content_type(filename):
    """Guess the MIME type from filename, falling back to the generic
    application/octet-stream when it cannot be determined."""
    guessed = mimetypes.guess_type(filename)[0]
    return guessed or 'application/octet-stream'

View file

@ -20,21 +20,14 @@ Robots.txt parser.
The robots.txt Exclusion Protocol is implemented as specified in
http://www.robotstxt.org/wc/norobots-rfc.html
"""
import urlparse
import httplib
import urllib
import urllib2
import time
import socket
import re
import zlib
import sys
import cStringIO as StringIO
import linkcheck
from . import configuration
from . import log, LOG_CHECK
from . import gzip2 as gzip
from . import httplib2 as httplib
from . import log, LOG_CHECK, configuration, httputil
__all__ = ["RobotFileParser"]
@ -116,7 +109,7 @@ class RobotFileParser (object):
handlers = [
urllib2.ProxyHandler(urllib.getproxies()),
urllib2.UnknownHandler,
HttpWithGzipHandler,
httputil.HttpWithGzipHandler,
urllib2.HTTPBasicAuthHandler(pwd_manager),
urllib2.ProxyBasicAuthHandler(pwd_manager),
urllib2.HTTPDigestAuthHandler(pwd_manager),
@ -125,7 +118,7 @@ class RobotFileParser (object):
urllib2.HTTPRedirectHandler,
]
if hasattr(httplib, 'HTTPS'):
handlers.append(HttpsWithGzipHandler)
handlers.append(httputil.HttpsWithGzipHandler)
return urllib2.build_opener(*handlers)
def read (self):
@ -423,76 +416,3 @@ class Entry (object):
if line.applies_to(path):
return line.allowance
return True
###########################################################################
# urlutils.py - Simplified urllib handling
#
# Written by Chris Lawrence <lawrencc@debian.org>
# (C) 1999-2002 Chris Lawrence
#
# This program is freely distributable per the following license:
#
## Permission to use, copy, modify, and distribute this software and its
## documentation for any purpose and without fee is hereby granted,
## provided that the above copyright notice appears in all copies and that
## both that copyright notice and this permission notice appear in
## supporting documentation.
##
## I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL I
## BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
## DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
## SOFTWARE.
def decode (page):
    """Gunzip or deflate a compressed page.

    page is a urllib2/urllib response object.  If the response declares a
    Content-Encoding of gzip, x-gzip or deflate, return a replacement page
    whose read() yields the decompressed body and whose headers omit
    Content-Encoding; otherwise return page unchanged.
    """
    log.debug(LOG_CHECK,
        "robots.txt page info %d %s", page.code, str(page.info()))
    encoding = page.info().get("Content-Encoding")
    if encoding in ('gzip', 'x-gzip', 'deflate'):
        # cannot seek in socket descriptors, so must get content now
        content = page.read()
        try:
            if encoding == 'deflate':
                fp = StringIO.StringIO(zlib.decompress(content))
            else:
                # NOTE(review): GzipFile decompresses lazily on read(); a
                # corrupt body may raise outside this try — confirm.
                fp = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(content))
        except zlib.error, msg:
            # Some servers mislabel uncompressed data; use the raw bytes.
            log.debug(LOG_CHECK, "uncompressing had error "
                "%s, assuming non-compressed content", str(msg))
            fp = StringIO.StringIO(content)
        # remove content-encoding header: the returned body is already
        # decompressed, so keeping the header would mislead callers
        headers = httplib.HTTPMessage(StringIO.StringIO(""))
        ceheader = re.compile(r"(?i)content-encoding:")
        for h in page.info().keys():
            if not ceheader.match(h):
                headers[h] = page.info()[h]
        # rewrap the stream with filtered headers and the original URL
        newpage = urllib.addinfourl(fp, headers, page.geturl())
        if hasattr(page, "code"):
            # python 2.4 compatibility
            newpage.code = page.code
        if hasattr(page, "msg"):
            # python 2.4 compatibility
            newpage.msg = page.msg
        page = newpage
    return page
class HttpWithGzipHandler (urllib2.HTTPHandler):
    """urllib2 HTTP handler that transparently decompresses
    gzip/deflate-encoded responses via decode()."""
    def http_open (self, req):
        """Send request and decode answer."""
        return decode(urllib2.HTTPHandler.http_open(self, req))
# NOTE(review): httplib is the bundled httplib2 module; presumably HTTPS is
# only defined when SSL support is available — confirm before relying on
# HttpsWithGzipHandler existing.
if hasattr(httplib, 'HTTPS'):
    class HttpsWithGzipHandler (urllib2.HTTPSHandler):
        """urllib2 HTTPS handler that transparently decompresses
        gzip/deflate-encoded responses via decode()."""
        def http_open (self, req):
            """Send request and decode answer."""
            return decode(urllib2.HTTPSHandler.http_open(self, req))
# end of urlutils.py routines
###########################################################################