diff --git a/linkcheck/log/ColoredLogger.py b/linkcheck/log/ColoredLogger.py index 3c458a3d..2f36732a 100644 --- a/linkcheck/log/ColoredLogger.py +++ b/linkcheck/log/ColoredLogger.py @@ -17,6 +17,7 @@ from StandardLogger import StandardLogger from linkcheck import StringUtil, i18n, AnsiColor +from linkcheck.url import url_quote class ColoredLogger (StandardLogger): @@ -88,7 +89,8 @@ class ColoredLogger (StandardLogger): if self.prefix: self.fd.write("| ") self.fd.write(self.field("realurl")+self.spaces("realurl")+ - self.colorreal+urlData.url+self.colorreset+"\n") + self.colorreal+url_quote(urlData.url)+ + self.colorreset+"\n") if urlData.dltime>=0 and self.has_field("dltime"): if self.prefix: self.fd.write("| ") diff --git a/linkcheck/log/GMLLogger.py b/linkcheck/log/GMLLogger.py index 877b5240..7613a9cb 100644 --- a/linkcheck/log/GMLLogger.py +++ b/linkcheck/log/GMLLogger.py @@ -18,6 +18,7 @@ import time from linkcheck import Config, i18n from linkcheck.log import strtime, strduration +from linkcheck.url import url_quote from StandardLogger import StandardLogger from Logger import Logger @@ -56,7 +57,7 @@ class GMLLogger (StandardLogger): self.fd.write(" node [\n") self.fd.write(" id %d\n" % node.id) if self.has_field("realurl"): - self.fd.write(' label "%s"\n' % node.url) + self.fd.write(' label "%s"\n' % url_quote(node.url)) if node.dltime>=0 and self.has_field("dltime"): self.fd.write(" dltime %d\n" % node.dltime) if node.dlsize>=0 and self.has_field("dlsize"): diff --git a/linkcheck/log/HtmlLogger.py b/linkcheck/log/HtmlLogger.py index a4a011aa..f8e2ceb5 100644 --- a/linkcheck/log/HtmlLogger.py +++ b/linkcheck/log/HtmlLogger.py @@ -18,6 +18,7 @@ from StandardLogger import StandardLogger from Logger import Logger from linkcheck.log import strtime, strduration +from linkcheck.url import url_quote from linkcheck import StringUtil, i18n, Config import time @@ -96,8 +97,8 @@ class HtmlLogger (StandardLogger): urlData.baseRef+"\n\n") if urlData.url and 
self.has_field("realurl"): self.fd.write("\n"+self.field("realurl")+"\n"+ - ''+urlData.url+"\n\n") + ''+url_quote(urlData.url)+"\n\n") if urlData.dltime>=0 and self.has_field("dltime"): self.fd.write("\n"+self.field("dltime")+"\n"+ (i18n._("%.3f seconds") % urlData.dltime)+ diff --git a/linkcheck/log/SQLLogger.py b/linkcheck/log/SQLLogger.py index 727ba856..09820623 100644 --- a/linkcheck/log/SQLLogger.py +++ b/linkcheck/log/SQLLogger.py @@ -19,6 +19,7 @@ from StandardLogger import StandardLogger from Logger import Logger import time from linkcheck.log import strtime, strduration +from linkcheck.url import url_quote from linkcheck import StringUtil, i18n, Config class SQLLogger (StandardLogger): @@ -59,7 +60,7 @@ class SQLLogger (StandardLogger): StringUtil.sqlify(urlData.warningString), StringUtil.sqlify(urlData.infoString), urlData.valid, - StringUtil.sqlify(urlData.url), + StringUtil.sqlify(url_quote(urlData.url)), urlData.line, urlData.column, StringUtil.sqlify(urlData.name), diff --git a/linkcheck/log/StandardLogger.py b/linkcheck/log/StandardLogger.py index 9525b825..a5f76259 100644 --- a/linkcheck/log/StandardLogger.py +++ b/linkcheck/log/StandardLogger.py @@ -17,6 +17,7 @@ import sys, time from linkcheck import Config, i18n +from linkcheck.url import url_quote from Logger import Logger from linkcheck.log import strtime, strduration from linkcheck import StringUtil @@ -101,7 +102,7 @@ __init__(self, **args) urlData.baseRef+"\n") if urlData.url and self.has_field('realurl'): self.fd.write(self.field("realurl")+self.spaces("realurl")+ - urlData.url+"\n") + url_quote(urlData.url)+"\n") if urlData.dltime>=0 and self.has_field('dltime'): self.fd.write(self.field("dltime")+self.spaces("dltime")+ i18n._("%.3f seconds\n") % urlData.dltime) diff --git a/linkcheck/log/XMLLogger.py b/linkcheck/log/XMLLogger.py index 569a8a57..dd99ba8b 100644 --- a/linkcheck/log/XMLLogger.py +++ b/linkcheck/log/XMLLogger.py @@ -19,6 +19,7 @@ import time from linkcheck import Config, 
i18n from linkcheck.StringUtil import xmlify from linkcheck.log import strtime, strduration +from linkcheck.url import url_quote from StandardLogger import StandardLogger from Logger import Logger @@ -59,7 +60,8 @@ class XMLLogger (StandardLogger): self.fd.write(' \n") if self.has_field("realurl"): - self.fd.write(" \n" % xmlify(node.url)) + self.fd.write(" \n" %\ + xmlify(url_quote(node.url))) self.fd.write(" \n") if node.dltime>=0 and self.has_field("dltime"): self.fd.write(" %f\n" % node.dltime) diff --git a/linkcheck/url.py b/linkcheck/url.py new file mode 100644 index 00000000..508c34bf --- /dev/null +++ b/linkcheck/url.py @@ -0,0 +1,112 @@ +# -*- coding: iso-8859-1 -*- +"""url utils""" +# Copyright (C) 2000-2004 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 

__version__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]

import os
import posixpath
import re

try:
    # Python 2 module layout
    import urlparse
    from urllib import splittype, splithost, splitnport, splitquery, \
         quote, unquote
except ImportError:
    # Python 3 moved everything into urllib.parse
    from urllib import parse as urlparse
    from urllib.parse import splittype, splithost, splitnport, splitquery, \
         quote, unquote

# adapted from David Wheeler's "Secure Programming for Linux and Unix HOWTO"
_az09 = r"a-z0-9"
_path = r"\-\_\.\!\~\*\'\(\)"
_hex_safe = r"2-9a-f"
_hex_full = r"0-9a-f"
# explicit substitution table for the pattern templates below
_pattern_vars = {
    '_az09': _az09,
    '_path': _path,
    '_hex_safe': _hex_safe,
    '_hex_full': _hex_full,
}
_safe_scheme_pattern = r"(https?|ftp)"
_safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)" % _pattern_vars
_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _pattern_vars
_safe_fragment_pattern = r"(\#([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)?" % _pattern_vars
# the pattern without the inline flag, so it can be anchored safely
_safe_url_body = _safe_scheme_pattern+"://"+_safe_host_pattern+\
                 _safe_path_pattern+_safe_fragment_pattern
# public pattern string; carries its own case-insensitive flag
safe_url_pattern = "(?i)"+_safe_url_body

# Matcher for complete "safe" urls. The flag is passed to re.compile()
# instead of embedding "(?i)" after the "^" anchor: a global inline flag
# that is not at the very start of the pattern is rejected by newer
# versions of the re module.
is_valid_url = re.compile("^%s$" % _safe_url_body, re.IGNORECASE).match


def safe_host_pattern (host):
    """return a safe url pattern string with the generic host part
    replaced by the given host pattern; the result carries no
    case-insensitive flag of its own"""
    return _safe_scheme_pattern+"://"+host+ \
           _safe_path_pattern+_safe_fragment_pattern


# XXX better name/implementation for this function
def stripsite (url):
    """remove scheme and host from url. return host, newurl"""
    parts = urlparse.urlsplit(url)
    # empty strings (not integers) for the dropped scheme and netloc
    return parts[1], urlparse.urlunsplit(('', '', parts[2], parts[3], parts[4]))


def url_norm (url):
    """unquote and normalize url which must be quoted

    Scheme, netloc, path and fragment are unquoted; the query part is
    left untouched. Backslashes in the path are turned into slashes,
    redundant path segments are collapsed, an empty path becomes '/'
    and a trailing slash is preserved."""
    urlparts = list(urlparse.urlsplit(url))
    # index 3 (the query) is skipped
    for index in (0, 1, 2, 4):
        urlparts[index] = unquote(urlparts[index])
    path = urlparts[2].replace('\\', '/')
    if not path or path == '/':
        urlparts[2] = '/'
    else:
        # url paths always follow posix rules, whatever the host platform
        normed = posixpath.normpath(path)
        # normpath() drops a trailing slash; restore it, but never
        # double it when the path collapsed to a bare '/'
        if path.endswith('/') and not normed.endswith('/'):
            normed += '/'
        urlparts[2] = normed
    return urlparse.urlunsplit(urlparts)


def url_quote (url):
    """quote given url

    Scheme, netloc, path and fragment are quoted; the query part is
    left untouched."""
    urlparts = list(urlparse.urlsplit(url))
    urlparts[0] = quote(urlparts[0])
    # '@' (userinfo), ':' (port) and '[' ']' (IPv6 literal) are
    # structural delimiters of the netloc and must stay unquoted
    urlparts[1] = quote(urlparts[1], '@:[]')
    urlparts[2] = quote(urlparts[2], '/')
    urlparts[4] = quote(urlparts[4])
    return urlparse.urlunsplit(urlparts)


def document_quote (document):
    """quote given document

    Only the part before the query is quoted; a non-empty query is
    appended unchanged, an empty one is dropped together with its '?'."""
    doc, query = splitquery(document)
    doc = quote(doc, '/')
    if query:
        return "%s?%s" % (doc, query)
    return doc


# default port numbers by url scheme
default_ports = {
    'http' : 80,
    'https' : 443,
    'nntps' : 563,
}

def spliturl (url):
    """split url in a tuple (scheme, hostname, port, document) where
    hostname is always lowercased
    precondition: url is syntactically correct URI (eg has no whitespace)

    The port is taken from default_ports for the scheme (80 for any
    scheme not listed there) unless the host part names one."""
    scheme, netloc = splittype(url)
    host, document = splithost(netloc)
    port = default_ports.get(scheme, 80)
    if host:
        host = host.lower()
        host, port = splitnport(host, port)
    return scheme, host, port, document


# constants defining url part indexes
SCHEME = 0
HOSTNAME = DOMAIN = 1
PORT = 2
DOCUMENT = 3