diff --git a/linkcheck/log/ColoredLogger.py b/linkcheck/log/ColoredLogger.py
index 3c458a3d..2f36732a 100644
--- a/linkcheck/log/ColoredLogger.py
+++ b/linkcheck/log/ColoredLogger.py
@@ -17,6 +17,7 @@
from StandardLogger import StandardLogger
from linkcheck import StringUtil, i18n, AnsiColor
+from linkcheck.url import url_quote
class ColoredLogger (StandardLogger):
@@ -88,7 +89,8 @@ class ColoredLogger (StandardLogger):
if self.prefix:
self.fd.write("| ")
self.fd.write(self.field("realurl")+self.spaces("realurl")+
- self.colorreal+urlData.url+self.colorreset+"\n")
+ self.colorreal+url_quote(urlData.url)+
+ self.colorreset+"\n")
if urlData.dltime>=0 and self.has_field("dltime"):
if self.prefix:
self.fd.write("| ")
diff --git a/linkcheck/log/GMLLogger.py b/linkcheck/log/GMLLogger.py
index 877b5240..7613a9cb 100644
--- a/linkcheck/log/GMLLogger.py
+++ b/linkcheck/log/GMLLogger.py
@@ -18,6 +18,7 @@
import time
from linkcheck import Config, i18n
from linkcheck.log import strtime, strduration
+from linkcheck.url import url_quote
from StandardLogger import StandardLogger
from Logger import Logger
@@ -56,7 +57,7 @@ class GMLLogger (StandardLogger):
self.fd.write(" node [\n")
self.fd.write(" id %d\n" % node.id)
if self.has_field("realurl"):
- self.fd.write(' label "%s"\n' % node.url)
+ self.fd.write(' label "%s"\n' % url_quote(node.url))
if node.dltime>=0 and self.has_field("dltime"):
self.fd.write(" dltime %d\n" % node.dltime)
if node.dlsize>=0 and self.has_field("dlsize"):
diff --git a/linkcheck/log/HtmlLogger.py b/linkcheck/log/HtmlLogger.py
index a4a011aa..f8e2ceb5 100644
--- a/linkcheck/log/HtmlLogger.py
+++ b/linkcheck/log/HtmlLogger.py
@@ -18,6 +18,7 @@
from StandardLogger import StandardLogger
from Logger import Logger
from linkcheck.log import strtime, strduration
+from linkcheck.url import url_quote
from linkcheck import StringUtil, i18n, Config
import time
@@ -96,8 +97,8 @@ class HtmlLogger (StandardLogger):
urlData.baseRef+"\n\n")
if urlData.url and self.has_field("realurl"):
self.fd.write("
\n| "+self.field("dltime")+" | \n"+
(i18n._("%.3f seconds") % urlData.dltime)+
diff --git a/linkcheck/log/SQLLogger.py b/linkcheck/log/SQLLogger.py
index 727ba856..09820623 100644
--- a/linkcheck/log/SQLLogger.py
+++ b/linkcheck/log/SQLLogger.py
@@ -19,6 +19,7 @@ from StandardLogger import StandardLogger
from Logger import Logger
import time
from linkcheck.log import strtime, strduration
+from linkcheck.url import url_quote
from linkcheck import StringUtil, i18n, Config
class SQLLogger (StandardLogger):
@@ -59,7 +60,7 @@ class SQLLogger (StandardLogger):
StringUtil.sqlify(urlData.warningString),
StringUtil.sqlify(urlData.infoString),
urlData.valid,
- StringUtil.sqlify(urlData.url),
+ StringUtil.sqlify(url_quote(urlData.url)),
urlData.line,
urlData.column,
StringUtil.sqlify(urlData.name),
diff --git a/linkcheck/log/StandardLogger.py b/linkcheck/log/StandardLogger.py
index 9525b825..a5f76259 100644
--- a/linkcheck/log/StandardLogger.py
+++ b/linkcheck/log/StandardLogger.py
@@ -17,6 +17,7 @@
import sys, time
from linkcheck import Config, i18n
+from linkcheck.url import url_quote
from Logger import Logger
from linkcheck.log import strtime, strduration
from linkcheck import StringUtil
@@ -101,7 +102,7 @@ __init__(self, **args)
urlData.baseRef+"\n")
if urlData.url and self.has_field('realurl'):
self.fd.write(self.field("realurl")+self.spaces("realurl")+
- urlData.url+"\n")
+ url_quote(urlData.url)+"\n")
if urlData.dltime>=0 and self.has_field('dltime'):
self.fd.write(self.field("dltime")+self.spaces("dltime")+
i18n._("%.3f seconds\n") % urlData.dltime)
diff --git a/linkcheck/log/XMLLogger.py b/linkcheck/log/XMLLogger.py
index 569a8a57..dd99ba8b 100644
--- a/linkcheck/log/XMLLogger.py
+++ b/linkcheck/log/XMLLogger.py
@@ -19,6 +19,7 @@ import time
from linkcheck import Config, i18n
from linkcheck.StringUtil import xmlify
from linkcheck.log import strtime, strduration
+from linkcheck.url import url_quote
from StandardLogger import StandardLogger
from Logger import Logger
@@ -59,7 +60,8 @@ class XMLLogger (StandardLogger):
self.fd.write(' \n")
if self.has_field("realurl"):
- self.fd.write(" \n" % xmlify(node.url))
+ self.fd.write(" \n" %\
+ xmlify(url_quote(node.url)))
self.fd.write(" \n")
if node.dltime>=0 and self.has_field("dltime"):
self.fd.write(" %f\n" % node.dltime)
diff --git a/linkcheck/url.py b/linkcheck/url.py
new file mode 100644
index 00000000..508c34bf
--- /dev/null
+++ b/linkcheck/url.py
@@ -0,0 +1,112 @@
+# -*- coding: iso-8859-1 -*-
+"""url utils"""
+# Copyright (C) 2000-2004 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+__version__ = "$Revision$"[11:-2]
+__date__ = "$Date$"[7:-2]
+
+import re, urlparse, os
+from urllib import splittype, splithost, splitnport, splitquery, quote, unquote
+
# adapted from David Wheelers "Secure Programming for Linux and Unix HOWTO"
_az09 = r"a-z0-9"
_path = r"\-\_\.\!\~\*\'\(\)"
_hex_safe = r"2-9a-f"
_hex_full = r"0-9a-f"
_safe_scheme_pattern = r"(https?|ftp)"
_safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)"%locals()
_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)"%locals()
_safe_fragment_pattern = r"(\#([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)?"%locals()
# scheme://host/path#fragment, without the case-insensitivity flag, so it
# can be anchored or embedded in larger expressions
_safe_url_body = _safe_scheme_pattern+"://"+_safe_host_pattern+\
                 _safe_path_pattern+_safe_fragment_pattern
# public pattern keeps its historical leading "(?i)" for backward compatibility
safe_url_pattern = "(?i)"+_safe_url_body

# NOTE: compile with re.IGNORECASE instead of embedding "(?i)" after "^";
# a global inline flag that is not at the very start of the pattern is
# deprecated and rejected by modern re implementations.
is_valid_url = re.compile("^%s$"%_safe_url_body, re.IGNORECASE).match
+
def safe_host_pattern (host):
    """construct a safe URL regex pattern around the given host pattern;
    scheme, path and fragment come from the module-wide safe patterns"""
    return "%s://%s%s%s" % (_safe_scheme_pattern, host,
                            _safe_path_pattern, _safe_fragment_pattern)
+
+
# XXX better name/implementation for this function
def stripsite (url):
    """remove scheme and host from url. return host, newurl"""
    parts = urlparse.urlsplit(url)
    # rebuild the URL from path, query and fragment only
    rest = urlparse.urlunsplit(("", "", parts[2], parts[3], parts[4]))
    return parts[1], rest
+
+
def url_norm (url):
    """unquote and normalize url which must be quoted

    Scheme, host, path and fragment are unquoted; the query string is
    left untouched (mirroring url_quote, which skips it as well).
    Backslashes in the path are converted to slashes and redundant
    path segments ("." and "..") are collapsed.
    """
    urlparts = list(urlparse.urlsplit(url))
    # unquote scheme, netloc, path and fragment (not the query, index 3)
    for index in (0, 1, 2, 4):
        urlparts[index] = unquote(urlparts[index])
    path = urlparts[2].replace('\\', '/')
    if not path or path == '/':
        urlparts[2] = '/'
    else:
        # collapse redundant path segments; normpath can emit
        # backslashes on windows, convert them back to slashes
        normed = os.path.normpath(path).replace('\\', '/')
        # normpath strips a trailing slash: restore it, but do not
        # double it when the path collapsed to "/" (e.g. "/a/../")
        if path.endswith('/') and not normed.endswith('/'):
            normed += '/'
        urlparts[2] = normed
    return urlparse.urlunsplit(urlparts)
+
+
def url_quote (url):
    """quote given url"""
    parts = list(urlparse.urlsplit(url))
    # quote scheme, netloc, path and fragment with their respective safe
    # characters; the query string (index 3) is left untouched, which
    # matches url_norm skipping it during unquoting
    for index, safe in ((0, '/'), (1, ':'), (2, '/'), (4, '/')):
        parts[index] = quote(parts[index], safe)
    return urlparse.urlunsplit(parts)
+
+
def document_quote (document):
    """quote given document, leaving any existing query string untouched

    Splits at the last '?' (same semantics as the old undocumented
    urllib.splitquery helper, which is removed in modern Python) and
    percent-quotes only the document part, keeping '/' unescaped.
    An empty query ("doc?") is dropped, as before.
    """
    if '?' in document:
        doc, query = document.rsplit('?', 1)
    else:
        doc, query = document, None
    doc = quote(doc, '/')
    if query:
        return "%s?%s" % (doc, query)
    return doc
+
+
default_ports = {
    'http' : 80,
    'https' : 443,
    'nntps' : 563,
}

# these regular expressions replicate the behavior of the undocumented
# urllib helpers splittype(), splithost() and splitnport(), which are
# deprecated/removed in modern Python versions
_scheme_prog = re.compile('^([^/:]+):')
_host_prog = re.compile('^//([^/?]*)(.*)$')
_nport_prog = re.compile('^(.*):(.*)$')

def spliturl (url):
    """split url in a tuple (scheme, hostname, port, document) where
    hostname is always lowercased
    precondition: url is syntactically correct URI (eg has no whitespace)

    The port defaults per scheme (falling back to 80); an explicit but
    empty or non-numeric port yields None.
    """
    # split "scheme:rest" (scheme is lowercased, as splittype did)
    match = _scheme_prog.match(url)
    if match:
        scheme = match.group(1).lower()
        netloc = url[len(scheme)+1:]
    else:
        scheme, netloc = None, url
    # split "//host/document"
    match = _host_prog.match(netloc)
    if match:
        host, document = match.group(1, 2)
    else:
        host, document = None, netloc
    port = default_ports.get(scheme, 80)
    if host:
        host = host.lower()
        # split "host:port"; splits at the last colon, like splitnport
        match = _nport_prog.match(host)
        if match:
            host, portstr = match.group(1, 2)
            try:
                if not portstr:
                    raise ValueError("empty port")
                port = int(portstr)
            except ValueError:
                # explicit but unusable port -> None (splitnport behavior)
                port = None
    return scheme, host, port, document
+
+
# constants defining url part indexes
# (indexes into the 4-tuple returned by spliturl() above:
#  scheme, host, port, document)
SCHEME = 0
# HOSTNAME and DOMAIN are two aliases for the same tuple slot
HOSTNAME = DOMAIN = 1
PORT = 2
DOCUMENT = 3
|