quote url in output

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1255 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-02-20 14:13:42 +00:00
parent 7d5ba1da34
commit 5187dbc4c2
7 changed files with 127 additions and 7 deletions

View file

@ -17,6 +17,7 @@
from StandardLogger import StandardLogger
from linkcheck import StringUtil, i18n, AnsiColor
from linkcheck.url import url_quote
class ColoredLogger (StandardLogger):
@ -88,7 +89,8 @@ class ColoredLogger (StandardLogger):
if self.prefix:
self.fd.write("| ")
self.fd.write(self.field("realurl")+self.spaces("realurl")+
self.colorreal+urlData.url+self.colorreset+"\n")
self.colorreal+url_quote(urlData.url)+
self.colorreset+"\n")
if urlData.dltime>=0 and self.has_field("dltime"):
if self.prefix:
self.fd.write("| ")

View file

@ -18,6 +18,7 @@
import time
from linkcheck import Config, i18n
from linkcheck.log import strtime, strduration
from linkcheck.url import url_quote
from StandardLogger import StandardLogger
from Logger import Logger
@ -56,7 +57,7 @@ class GMLLogger (StandardLogger):
self.fd.write(" node [\n")
self.fd.write(" id %d\n" % node.id)
if self.has_field("realurl"):
self.fd.write(' label "%s"\n' % node.url)
self.fd.write(' label "%s"\n' % url_quote(node.url))
if node.dltime>=0 and self.has_field("dltime"):
self.fd.write(" dltime %d\n" % node.dltime)
if node.dlsize>=0 and self.has_field("dlsize"):

View file

@ -18,6 +18,7 @@
from StandardLogger import StandardLogger
from Logger import Logger
from linkcheck.log import strtime, strduration
from linkcheck.url import url_quote
from linkcheck import StringUtil, i18n, Config
import time
@ -96,8 +97,8 @@ class HtmlLogger (StandardLogger):
urlData.baseRef+"</td>\n</tr>\n")
if urlData.url and self.has_field("realurl"):
self.fd.write("<tr>\n<td>"+self.field("realurl")+"</td>\n<td>"+
'<a target="top" href="'+urlData.url+
'">'+urlData.url+"</a></td>\n</tr>\n")
'<a target="top" href="'+url_quote(urlData.url)+
'">'+url_quote(urlData.url)+"</a></td>\n</tr>\n")
if urlData.dltime>=0 and self.has_field("dltime"):
self.fd.write("<tr>\n<td>"+self.field("dltime")+"</td>\n<td>"+
(i18n._("%.3f seconds") % urlData.dltime)+

View file

@ -19,6 +19,7 @@ from StandardLogger import StandardLogger
from Logger import Logger
import time
from linkcheck.log import strtime, strduration
from linkcheck.url import url_quote
from linkcheck import StringUtil, i18n, Config
class SQLLogger (StandardLogger):
@ -59,7 +60,7 @@ class SQLLogger (StandardLogger):
StringUtil.sqlify(urlData.warningString),
StringUtil.sqlify(urlData.infoString),
urlData.valid,
StringUtil.sqlify(urlData.url),
StringUtil.sqlify(url_quote(urlData.url)),
urlData.line,
urlData.column,
StringUtil.sqlify(urlData.name),

View file

@ -17,6 +17,7 @@
import sys, time
from linkcheck import Config, i18n
from linkcheck.url import url_quote
from Logger import Logger
from linkcheck.log import strtime, strduration
from linkcheck import StringUtil
@ -101,7 +102,7 @@ __init__(self, **args)
urlData.baseRef+"\n")
if urlData.url and self.has_field('realurl'):
self.fd.write(self.field("realurl")+self.spaces("realurl")+
urlData.url+"\n")
url_quote(urlData.url)+"\n")
if urlData.dltime>=0 and self.has_field('dltime'):
self.fd.write(self.field("dltime")+self.spaces("dltime")+
i18n._("%.3f seconds\n") % urlData.dltime)

View file

@ -19,6 +19,7 @@ import time
from linkcheck import Config, i18n
from linkcheck.StringUtil import xmlify
from linkcheck.log import strtime, strduration
from linkcheck.url import url_quote
from StandardLogger import StandardLogger
from Logger import Logger
@ -59,7 +60,8 @@ class XMLLogger (StandardLogger):
self.fd.write(' <node name="%d" ' % node.id)
self.fd.write(">\n")
if self.has_field("realurl"):
self.fd.write(" <label>%s</label>\n" % xmlify(node.url))
self.fd.write(" <label>%s</label>\n" %\
xmlify(url_quote(node.url)))
self.fd.write(" <data>\n")
if node.dltime>=0 and self.has_field("dltime"):
self.fd.write(" <dltime>%f</dltime>\n" % node.dltime)

112
linkcheck/url.py Normal file
View file

@ -0,0 +1,112 @@
# -*- coding: iso-8859-1 -*-
"""url utils"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
__version__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]
import re, urlparse, os
from urllib import splittype, splithost, splitnport, splitquery, quote, unquote
# adapted from David Wheelers "Secure Programming for Linux and Unix HOWTO"
# Character classes used to assemble the "safe URL" regular expression below.
_az09 = r"a-z0-9"                # lowercase alphanumerics (the final pattern is case-insensitive)
_path = r"\-\_\.\!\~\*\'\(\)"    # unreserved punctuation allowed in path segments
_hex_safe = r"2-9a-f"            # first hex digit of a %-escape; excludes %0x and %1x escapes
_hex_full = r"0-9a-f"            # any hex digit (second digit of a %-escape)
# Only http, https and ftp URLs are considered "safe".
_safe_scheme_pattern = r"(https?|ftp)"
# Dot-separated labels of alphanumerics/hyphens, with an optional trailing dot.
_safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)"%locals()
# Zero or more /-separated segments of safe chars or %-escapes, optional trailing slash.
_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)"%locals()
# Optional #fragment of safe chars, '+', or %-escapes.
# NOTE(review): no query-string component appears in the safe pattern -- presumably
# URLs with a '?' are never "safe" here; confirm against callers of is_valid_url.
_safe_fragment_pattern = r"(\#([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)?"%locals()
# Complete safe-URL pattern, matched case-insensitively via the (?i) flag.
safe_url_pattern = "(?i)"+_safe_scheme_pattern+"://"+_safe_host_pattern+\
    _safe_path_pattern+_safe_fragment_pattern
# Predicate: matches only when the *entire* string is a safe URL.
is_valid_url = re.compile("^%s$"%safe_url_pattern).match
def safe_host_pattern (host):
    """Return the safe-URL pattern with the host part replaced by the
    given host expression."""
    # Same scheme/path/fragment building blocks as safe_url_pattern,
    # but with a caller-supplied host pattern instead of the generic one.
    return "%s://%s%s%s" % (_safe_scheme_pattern, host,
                            _safe_path_pattern, _safe_fragment_pattern)
# XXX better name/implementation for this function
def stripsite (url):
    """Remove scheme and host from url; return (host, remaining url)."""
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    # Rebuild the URL with empty scheme and netloc so that only the
    # site-relative part (path, query, fragment) remains.
    stripped = urlparse.urlunsplit(("", "", path, query, fragment))
    return netloc, stripped
def url_norm (url):
    """Unquote and normalize the given url, which must be quoted."""
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    # Unquote every component except the query string.
    scheme = unquote(scheme)
    netloc = unquote(netloc)
    fragment = unquote(fragment)
    # Treat backslashes as path separators before normalizing.
    path = unquote(path).replace('\\', '/')
    if not path or path == '/':
        # Empty or root path normalizes to a single slash.
        norm_path = '/'
    else:
        # XXX this works only under windows and posix??
        # Collapse redundant path segments ('.', '..', doubled slashes).
        norm_path = os.path.normpath(path).replace('\\', '/')
        # normpath drops a trailing slash; restore it when present.
        if path.endswith('/'):
            norm_path += '/'
    return urlparse.urlunsplit((scheme, netloc, norm_path, query, fragment))
def url_quote (url):
    """Quote the given url component-wise."""
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    # NOTE(review): the query string is left unquoted -- presumably on
    # purpose, since its '&'/'=' separators must survive; confirm.
    quoted = (quote(scheme),
              quote(netloc, ':'),  # keep ':' so an explicit port survives
              quote(path, '/'),    # keep '/' path separators
              query,
              quote(fragment))
    return urlparse.urlunsplit(quoted)
def document_quote (document):
    """Quote the given document, leaving any query string untouched."""
    # Split off the query string; only the document part is quoted,
    # with '/' separators kept intact.
    doc, query = splitquery(document)
    quoted_doc = quote(doc, '/')
    if not query:
        return quoted_doc
    return "%s?%s" % (quoted_doc, query)
# Default port numbers keyed by URL scheme; used by spliturl() as the
# fallback when the URL does not name an explicit port.
default_ports = {
    'http' : 80,
    'https' : 443,
    'nntps' : 563,
}
def spliturl (url):
    """Split url into a tuple (scheme, hostname, port, document).

    The hostname is always lowercased.
    Precondition: url is a syntactically correct URI (eg has no whitespace).
    """
    scheme, rest = splittype(url)
    host, document = splithost(rest)
    # Fall back to the scheme's well-known port (80 when unknown).
    port = default_ports.get(scheme, 80)
    if not host:
        return scheme, host, port, document
    # Lowercase the host, then split off an explicit :port if present.
    host, port = splitnport(host.lower(), port)
    return scheme, host, port, document
# constants defining url part indexes
# (indexes into the 4-tuple returned by spliturl() above)
SCHEME = 0
HOSTNAME = DOMAIN = 1
PORT = 2
DOCUMENT = 3