From 58fab5a44fdf7009ecff0932791aaa2952ed84a3 Mon Sep 17 00:00:00 2001
From: calvin <calvin@e7d03fd6-7b0d-0410-9947-9c21f3af8025>
Date: Thu, 27 May 2004 08:29:43 +0000
Subject: [PATCH] updated from webcleaner

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1330 e7d03fd6-7b0d-0410-9947-9c21f3af8025
---
 linkcheck/url.py | 126 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 95 insertions(+), 31 deletions(-)

diff --git a/linkcheck/url.py b/linkcheck/url.py
index 2696cbcd..a9e5a16a 100644
--- a/linkcheck/url.py
+++ b/linkcheck/url.py
@@ -1,5 +1,5 @@
 # -*- coding: iso-8859-1 -*-
-"""url utils"""
+"""url utils, can be used as a standalone module"""
 # Copyright (C) 2000-2004  Bastian Kleineidam
 #
 # This program is free software; you can redistribute it and/or modify
@@ -19,27 +19,58 @@
 __version__ = "$Revision$"[11:-2]
 __date__    = "$Date$"[7:-2]
 
-import re, urlparse, os
-from urllib import splittype, splithost, splitnport, splitquery, quote, unquote
+import re
+import urlparse
+import os
+import urllib
+import cgi
+
 
 # adapted from David Wheelers "Secure Programming for Linux and Unix HOWTO"
 # http://www.dwheeler.com/secure-programs/Secure-Programs-HOWTO/filter-html.html#VALIDATING-URIS
-_az09 = r"a-z0-9"
-_path = r"\-\_\.\!\~\*\'\(\)"
-_hex_safe = r"2-9a-f"
-_hex_full = r"0-9a-f"
+_basic = {
+    "_az09": r"a-z0-9",
+    "_path": r"\-\_\.\!\~\*\'\(\),",
+    "_hex_safe": r"2-9a-f",
+    "_hex_full": r"0-9a-f",
+}
+_safe_char = r"([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))"%_basic
 _safe_scheme_pattern = r"(https?|ftp)"
-_safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)"%locals()
-_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)"%locals()
-_safe_fragment_pattern = r"(\#([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)?"%locals()
-safe_url_pattern = "(?i)"+_safe_scheme_pattern+"://"+_safe_host_pattern+\
-                    _safe_path_pattern+_safe_fragment_pattern
+_safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)(:(80|8080|8000))?"%_basic
+_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)"%_basic
+_safe_fragment_pattern = r"%s*"%_safe_char
+_safe_cgi = r"%s+(=%s+)?" % (_safe_char, _safe_char)
+_safe_query_pattern = r"(%s(&%s)*)?"%(_safe_cgi, _safe_cgi)
+safe_url_pattern = r"%s://%s%s(#%s)?" % \
+    (_safe_scheme_pattern, _safe_host_pattern,
+     _safe_path_pattern, _safe_fragment_pattern)
+
+is_valid_url = re.compile("(?i)^%s$"%safe_url_pattern).match
+is_valid_host = re.compile("(?i)^%s$"%_safe_host_pattern).match
+is_valid_path = re.compile("(?i)^%s$"%_safe_path_pattern).match
+is_valid_query = re.compile("(?i)^%s$"%_safe_query_pattern).match
+is_valid_fragment = re.compile("(?i)^%s$"%_safe_fragment_pattern).match
+
+def is_valid_js_url (urlstr):
+    """test javascript urls: True only for http urls with valid host, path, query and fragment"""
+    url = urlparse.urlsplit(urlstr)
+    if url[0].lower()!='http':
+        return False
+    if not is_valid_host(url[1]):
+        return False
+    if not is_valid_path(url[2]):
+        return False
+    if not is_valid_query(url[3]):
+        return False
+    if not is_valid_fragment(url[4]):
+        return False
+    return True
 
-is_valid_url = re.compile("^%s$"%safe_url_pattern).match
 
 def safe_host_pattern (host):
-    return _safe_scheme_pattern+"://"+host+ \
-           _safe_path_pattern+_safe_fragment_pattern
+    """return regular expression pattern with given host for url testing"""
+    return "(?i)%s://%s%s(#%s)?" % \
+     (_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern)
 
 
 # XXX better name/implementation for this function
@@ -52,11 +83,24 @@ def stripsite (url):
 def url_norm (url):
     """unquote and normalize url which must be quoted"""
     urlparts = list(urlparse.urlsplit(url))
-    urlparts[0] = unquote(urlparts[0])
-    urlparts[1] = unquote(urlparts[1])
-    urlparts[2] = unquote(urlparts[2])
-    urlparts[4] = unquote(urlparts[4])
-    path = urlparts[2].replace('\\', '/')
+    urlparts[0] = urllib.unquote(urlparts[0]) # scheme
+    urlparts[1] = urllib.unquote(urlparts[1]) # host
+    # a backslash right after the host makes urlsplit() treat the path
+    # components up to the first forward slash as part of the host;
+    # try to find this case...
+    i = urlparts[1].find("\\")
+    if i != -1:
+        # ...and fix it by prepending the misplaced components to the path
+        comps = urlparts[1][i:] # note: still has leading backslash
+        if not urlparts[2] or urlparts[2]=='/':
+            urlparts[2] = comps
+        else:
+            urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2]))
+        urlparts[1] = urlparts[1][:i]
+    else:
+        urlparts[2] = urllib.unquote(urlparts[2]) # path
+    urlparts[4] = urllib.unquote(urlparts[4]) # anchor
+    path = urlparts[2].replace('\\', '/').replace('//', '/')
     if not path or path=='/':
         urlparts[2] = '/'
     else:
@@ -70,25 +114,45 @@ def url_norm (url):
 
 def url_quote (url):
     """quote given url"""
-    if not url:
-        return ""
     urlparts = list(urlparse.urlsplit(url))
-    urlparts[0] = quote(urlparts[0])
-    urlparts[1] = quote(urlparts[1], ':')
-    urlparts[2] = quote(urlparts[2], '/')
-    urlparts[4] = quote(urlparts[4])
+    urlparts[0] = urllib.quote(urlparts[0]) # scheme
+    urlparts[1] = urllib.quote(urlparts[1], ':') # host
+    urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
+    l = []
+    for k,v in cgi.parse_qsl(urlparts[3], True): # query
+        l.append("%s=%s" % (urllib.quote(k, '/-'), urllib.quote(v, '/-')))
+    urlparts[3] = '&'.join(l)
+    urlparts[4] = urllib.quote(urlparts[4]) # anchor
     return urlparse.urlunsplit(urlparts)
 
 
 def document_quote (document):
     """quote given document"""
-    doc, query = splitquery(document)
-    doc = quote(doc, '/')
+    doc, query = urllib.splitquery(document)
+    doc = urllib.quote(doc, '/=,')
     if query:
         return "%s?%s" % (doc, query)
     return doc
 
 
+def match_url (url, domainlist):
+    """return True if the host part of url ends with an entry of the
+       given domain list; an empty url never matches"""
+    if not url:
+        return False
+    return match_host(spliturl(url)[1], domainlist)
+
+
+def match_host (host, domainlist):
+    """return True if host ends with an entry of the given domain list"""
+    if not host:
+        return False
+    for domain in domainlist:
+        if host.endswith(domain):
+            return True
+    return False
+
+
 default_ports = {
     'http' : 80,
     'https' : 443,
@@ -99,12 +163,12 @@ def spliturl (url):
     """split url in a tuple (scheme, hostname, port, document) where
     hostname is always lowercased
     precondition: url is syntactically correct URI (eg has no whitespace)"""
-    scheme, netloc = splittype(url)
-    host, document = splithost(netloc)
+    scheme, netloc = urllib.splittype(url)
+    host, document = urllib.splithost(netloc)
     port = default_ports.get(scheme, 80)
     if host:
         host = host.lower()
-        host, port = splitnport(host, port)
+        host, port = urllib.splitnport(host, port)
     return scheme, host, port, document