diff --git a/linkcheck/httputil.py b/linkcheck/httputil.py new file mode 100644 index 00000000..b84fae3e --- /dev/null +++ b/linkcheck/httputil.py @@ -0,0 +1,121 @@ +# -*- coding: iso-8859-1 -*- +# Various HTTP utils with a free license +from cStringIO import StringIO +from . import gzip2 as gzip +from . import httplib2 as httplib +from . import log, LOG_CHECK +import re +import mimetypes +import zlib +import urllib +import urllib2 + + +########################################################################### +# urlutils.py - Simplified urllib handling +# +# Written by Chris Lawrence +# (C) 1999-2002 Chris Lawrence +# +# This program is freely distributable per the following license: +# +## Permission to use, copy, modify, and distribute this software and its +## documentation for any purpose and without fee is hereby granted, +## provided that the above copyright notice appears in all copies and that +## both that copyright notice and this permission notice appear in +## supporting documentation. +## +## I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL +## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL I +## BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +## DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +## SOFTWARE. +def decode (page): + """Gunzip or deflate a compressed page.""" + log.debug(LOG_CHECK, + "robots.txt page info %d %s", page.code, str(page.info())) + encoding = page.info().get("Content-Encoding") + if encoding in ('gzip', 'x-gzip', 'deflate'): + # cannot seek in socket descriptors, so must get content now + content = page.read() + try: + if encoding == 'deflate': + fp = StringIO(zlib.decompress(content)) + else: + fp = gzip.GzipFile('', 'rb', 9, StringIO(content)) + except zlib.error, msg: + log.debug(LOG_CHECK, "uncompressing had error " + "%s, assuming non-compressed content", str(msg)) + fp = StringIO(content) + # remove content-encoding header + headers = httplib.HTTPMessage(StringIO("")) + ceheader = re.compile(r"(?i)content-encoding:") + for h in page.info().keys(): + if not ceheader.match(h): + headers[h] = page.info()[h] + newpage = urllib.addinfourl(fp, headers, page.geturl()) + if hasattr(page, "code"): + # python 2.4 compatibility + newpage.code = page.code + if hasattr(page, "msg"): + # python 2.4 compatibility + newpage.msg = page.msg + page = newpage + return page + + +class HttpWithGzipHandler (urllib2.HTTPHandler): + """Support gzip encoding.""" + def http_open (self, req): + """Send request and decode answer.""" + return decode(urllib2.HTTPHandler.http_open(self, req)) + + +if hasattr(httplib, 'HTTPS'): + class HttpsWithGzipHandler (urllib2.HTTPSHandler): + """Support gzip encoding.""" + + def http_open (self, req): + """Send request and decode answer.""" + return decode(urllib2.HTTPSHandler.http_open(self, req)) + +# end of urlutils.py routines +########################################################################### + + +def encode_multipart_formdata(fields, files=None): + """ + From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306 + + fields is a sequence of (name, value) elements for regular form fields. + files is a sequence of (name, filename, value) elements for data to be + uploaded as files. + Return (content_type, body) ready for httplib.HTTP instance + """ + BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' + CRLF = '\r\n' + L = [] + for (key, value) in fields: + L.append('--' + BOUNDARY) + L.append('Content-Disposition: form-data; name="%s"' % key) + L.append('') + L.append(value) + if files is not None: + for (key, filename, value) in files: + L.append('--' + BOUNDARY) + L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) + L.append('Content-Type: %s' % get_content_type(filename)) + L.append('') + L.append(value) + L.append('--' + BOUNDARY + '--') + L.append('') + body = CRLF.join(L) + content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + return content_type, body + + +def get_content_type(filename): + return mimetypes.guess_type(filename)[0] or 'application/octet-stream' + diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py index 154692d0..2994f77d 100755 --- a/linkcheck/robotparser2.py +++ b/linkcheck/robotparser2.py @@ -20,21 +20,14 @@ Robots.txt parser. The robots.txt Exclusion Protocol is implemented as specified in http://www.robotstxt.org/wc/norobots-rfc.html """ - import urlparse -import httplib import urllib import urllib2 import time import socket -import re -import zlib import sys -import cStringIO as StringIO -import linkcheck -from . import configuration -from . import log, LOG_CHECK -from . import gzip2 as gzip +from . import httplib2 as httplib +from . import log, LOG_CHECK, configuration, httputil __all__ = ["RobotFileParser"] @@ -116,7 +109,7 @@ class RobotFileParser (object): handlers = [ urllib2.ProxyHandler(urllib.getproxies()), urllib2.UnknownHandler, - HttpWithGzipHandler, + httputil.HttpWithGzipHandler, urllib2.HTTPBasicAuthHandler(pwd_manager), urllib2.ProxyBasicAuthHandler(pwd_manager), urllib2.HTTPDigestAuthHandler(pwd_manager), @@ -125,7 +118,7 @@ class RobotFileParser (object): urllib2.HTTPRedirectHandler, ] if hasattr(httplib, 'HTTPS'): - handlers.append(HttpsWithGzipHandler) + handlers.append(httputil.HttpsWithGzipHandler) return urllib2.build_opener(*handlers) def read (self): @@ -423,76 +416,3 @@ class Entry (object): if line.applies_to(path): return line.allowance return True - -########################################################################### -# urlutils.py - Simplified urllib handling -# -# Written by Chris Lawrence -# (C) 1999-2002 Chris Lawrence -# -# This program is freely distributable per the following license: -# -## Permission to use, copy, modify, and distribute this software and its -## documentation for any purpose and without fee is hereby granted, -## provided that the above copyright notice appears in all copies and that -## both that copyright notice and this permission notice appear in -## supporting documentation. -## -## I DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL -## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL I -## BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY -## DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -## WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, -## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS -## SOFTWARE. -def decode (page): - """Gunzip or deflate a compressed page.""" - log.debug(LOG_CHECK, - "robots.txt page info %d %s", page.code, str(page.info())) - encoding = page.info().get("Content-Encoding") - if encoding in ('gzip', 'x-gzip', 'deflate'): - # cannot seek in socket descriptors, so must get content now - content = page.read() - try: - if encoding == 'deflate': - fp = StringIO.StringIO(zlib.decompress(content)) - else: - fp = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(content)) - except zlib.error, msg: - log.debug(LOG_CHECK, "uncompressing had error " - "%s, assuming non-compressed content", str(msg)) - fp = StringIO.StringIO(content) - # remove content-encoding header - headers = httplib.HTTPMessage(StringIO.StringIO("")) - ceheader = re.compile(r"(?i)content-encoding:") - for h in page.info().keys(): - if not ceheader.match(h): - headers[h] = page.info()[h] - newpage = urllib.addinfourl(fp, headers, page.geturl()) - if hasattr(page, "code"): - # python 2.4 compatibility - newpage.code = page.code - if hasattr(page, "msg"): - # python 2.4 compatibility - newpage.msg = page.msg - page = newpage - return page - - -class HttpWithGzipHandler (urllib2.HTTPHandler): - """Support gzip encoding.""" - def http_open (self, req): - """Send request and decode answer.""" - return decode(urllib2.HTTPHandler.http_open(self, req)) - - -if hasattr(httplib, 'HTTPS'): - class HttpsWithGzipHandler (urllib2.HTTPSHandler): - """Support gzip encoding.""" - - def http_open (self, req): - """Send request and decode answer.""" - return decode(urllib2.HTTPSHandler.http_open(self, req)) - -# end of urlutils.py routines -###########################################################################