From 58fab5a44fdf7009ecff0932791aaa2952ed84a3 Mon Sep 17 00:00:00 2001 From: calvin Date: Thu, 27 May 2004 08:29:43 +0000 Subject: [PATCH] updated from webcleaner git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1330 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/url.py | 126 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 31 deletions(-) diff --git a/linkcheck/url.py b/linkcheck/url.py index 2696cbcd..a9e5a16a 100644 --- a/linkcheck/url.py +++ b/linkcheck/url.py @@ -1,5 +1,5 @@ # -*- coding: iso-8859-1 -*- -"""url utils""" +"""url utils, can be used as a standalone module""" # Copyright (C) 2000-2004 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify @@ -19,27 +19,58 @@ __version__ = "$Revision$"[11:-2] __date__ = "$Date$"[7:-2] -import re, urlparse, os -from urllib import splittype, splithost, splitnport, splitquery, quote, unquote +import re +import urlparse +import os +import urllib +import cgi + # adapted from David Wheelers "Secure Programming for Linux and Unix HOWTO" # http://www.dwheeler.com/secure-programs/Secure-Programs-HOWTO/filter-html.html#VALIDATING-URIS -_az09 = r"a-z0-9" -_path = r"\-\_\.\!\~\*\'\(\)" -_hex_safe = r"2-9a-f" -_hex_full = r"0-9a-f" +_basic = { + "_az09": r"a-z0-9", + "_path": r"\-\_\.\!\~\*\'\(\),", + "_hex_safe": r"2-9a-f", + "_hex_full": r"0-9a-f", +} +_safe_char = r"([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))"%_basic _safe_scheme_pattern = r"(https?|ftp)" -_safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)"%locals() -_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)"%locals() -_safe_fragment_pattern = r"(\#([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)?"%locals() -safe_url_pattern = "(?i)"+_safe_scheme_pattern+"://"+_safe_host_pattern+\ - _safe_path_pattern+_safe_fragment_pattern +_safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)(:(80|8080|8000))?"%_basic +_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)"%_basic +_safe_fragment_pattern = r"%s*"%_safe_char +_safe_cgi = r"%s+(=%s+)?" % (_safe_char, _safe_char) +_safe_query_pattern = r"(%s(&%s)*)?"%(_safe_cgi, _safe_cgi) +safe_url_pattern = r"%s://%s%s(#%s)?" % \ + (_safe_scheme_pattern, _safe_host_pattern, + _safe_path_pattern, _safe_fragment_pattern) + +is_valid_url = re.compile("(?i)^%s$"%safe_url_pattern).match +is_valid_host = re.compile("(?i)^%s$"%_safe_host_pattern).match +is_valid_path = re.compile("(?i)^%s$"%_safe_path_pattern).match +is_valid_query = re.compile("(?i)^%s$"%_safe_query_pattern).match +is_valid_fragment = re.compile("(?i)^%s$"%_safe_fragment_pattern).match + +def is_valid_js_url (urlstr): + """test javascript urls""" + url = urlparse.urlsplit(urlstr) + if url[0].lower()!='http': + return False + if not is_valid_host(url[1]): + return False + if not is_valid_path(url[2]): + return False + if not is_valid_query(url[3]): + return False + if not is_valid_fragment(url[4]): + return False + return True -is_valid_url = re.compile("^%s$"%safe_url_pattern).match def safe_host_pattern (host): - return _safe_scheme_pattern+"://"+host+ \ - _safe_path_pattern+_safe_fragment_pattern + """return regular expression pattern with given host for url testing""" + return "(?i)%s://%s%s(#%s)?" % \ + (_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern) # XXX better name/implementation for this function @@ -52,11 +83,24 @@ def stripsite (url): def url_norm (url): """unquote and normalize url which must be quoted""" urlparts = list(urlparse.urlsplit(url)) - urlparts[0] = unquote(urlparts[0]) - urlparts[1] = unquote(urlparts[1]) - urlparts[2] = unquote(urlparts[2]) - urlparts[4] = unquote(urlparts[4]) - path = urlparts[2].replace('\\', '/') + urlparts[0] = urllib.unquote(urlparts[0]) # scheme + urlparts[1] = urllib.unquote(urlparts[1]) # host + # a leading backslash in path causes urlsplit() to add the + # path components up to the first slash to host + # try to find this case... + i = urlparts[1].find("\\") + if i != -1: + # ...and fix it by prepending the misplaced components to the path + comps = urlparts[1][i:] # note: still has leading backslash + if not urlparts[2] or urlparts[2]=='/': + urlparts[2] = comps + else: + urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2])) + urlparts[1] = urlparts[1][:i] + else: + urlparts[2] = urllib.unquote(urlparts[2]) # path + urlparts[4] = urllib.unquote(urlparts[4]) # anchor + path = urlparts[2].replace('\\', '/').replace('//', '/') if not path or path=='/': urlparts[2] = '/' else: @@ -70,25 +114,45 @@ def url_norm (url): def url_quote (url): """quote given url""" - if not url: - return "" urlparts = list(urlparse.urlsplit(url)) - urlparts[0] = quote(urlparts[0]) - urlparts[1] = quote(urlparts[1], ':') - urlparts[2] = quote(urlparts[2], '/') - urlparts[4] = quote(urlparts[4]) + urlparts[0] = urllib.quote(urlparts[0]) # scheme + urlparts[1] = urllib.quote(urlparts[1], ':') # host + urlparts[2] = urllib.quote(urlparts[2], '/=,') # path + l = [] + for k,v in cgi.parse_qsl(urlparts[3], True): # query + l.append("%s=%s" % (urllib.quote(k, '/-'), urllib.quote(v, '/-'))) + urlparts[3] = '&'.join(l) + urlparts[4] = urllib.quote(urlparts[4]) # anchor return urlparse.urlunsplit(urlparts) def document_quote (document): """quote given document""" - doc, query = splitquery(document) - doc = quote(doc, '/') + doc, query = urllib.splitquery(document) + doc = urllib.quote(doc, '/=,') if query: return "%s?%s" % (doc, query) return doc +def match_url (url, domainlist): + """return True if host part of url matches an entry in given domain + list""" + if not url: + return False + return match_host(spliturl(url)[1], domainlist) + + +def match_host (host, domainlist): + """return True if host matches an entry in given domain list""" + if not host: + return False + for domain in domainlist: + if host.endswith(domain): + return True + return False + + default_ports = { 'http' : 80, 'https' : 443, @@ -99,12 +163,12 @@ def spliturl (url): """split url in a tuple (scheme, hostname, port, document) where hostname is always lowercased precondition: url is syntactically correct URI (eg has no whitespace)""" - scheme, netloc = splittype(url) - host, document = splithost(netloc) + scheme, netloc = urllib.splittype(url) + host, document = urllib.splithost(netloc) port = default_ports.get(scheme, 80) if host: host = host.lower() - host, port = splitnport(host, port) + host, port = urllib.splitnport(host, port) return scheme, host, port, document