# -*- coding: iso-8859-1 -*- """url utils, can be used as a standalone module""" # Copyright (C) 2000-2004 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. __version__ = "$Revision$"[11:-2] __date__ = "$Date$"[7:-2] import re import urlparse import os import urllib import cgi # adapted from David Wheelers "Secure Programming for Linux and Unix HOWTO" # http://www.dwheeler.com/secure-programs/Secure-Programs-HOWTO/filter-html.html#VALIDATING-URIS _basic = { "_az09": r"a-z0-9", "_path": r"\-\_\.\!\~\*\'\(\),", "_hex_safe": r"2-9a-f", "_hex_full": r"0-9a-f", } _safe_char = r"([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))"%_basic _safe_scheme_pattern = r"(https?|ftp)" _safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)(:(80|8080|8000))?"%_basic _safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)"%_basic _safe_fragment_pattern = r"%s*"%_safe_char _safe_cgi = r"%s+(=%s+)?" % (_safe_char, _safe_char) _safe_query_pattern = r"(%s(&%s)*)?"%(_safe_cgi, _safe_cgi) safe_url_pattern = r"%s://%s%s(#%s)?" % \ (_safe_scheme_pattern, _safe_host_pattern, _safe_path_pattern, _safe_fragment_pattern) is_valid_url = re.compile("(?i)^%s$"%safe_url_pattern).match is_valid_host = re.compile("(?i)^%s$"%_safe_host_pattern).match is_valid_path = re.compile("(?i)^%s$"%_safe_path_pattern).match is_valid_query = re.compile("(?i)^%s$"%_safe_query_pattern).match is_valid_fragment = re.compile("(?i)^%s$"%_safe_fragment_pattern).match def is_valid_js_url (urlstr): """test javascript urls""" url = urlparse.urlsplit(urlstr) if url[0].lower()!='http': return False if not is_valid_host(url[1]): return False if not is_valid_path(url[2]): return False if not is_valid_query(url[3]): return False if not is_valid_fragment(url[4]): return False return True def safe_host_pattern (host): """return regular expression pattern with given host for url testing""" return "(?i)%s://%s%s(#%s)?" % \ (_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern) # XXX better name/implementation for this function def stripsite (url): """remove scheme and host from url. return host, newurl""" url = urlparse.urlsplit(url) return url[1], urlparse.urlunsplit( (0,0,url[2],url[3],url[4]) ) def url_norm (url): """unquote and normalize url which must be quoted""" urlparts = list(urlparse.urlsplit(url)) urlparts[0] = urllib.unquote(urlparts[0]) # scheme urlparts[1] = urllib.unquote(urlparts[1]) # host # a leading backslash in path causes urlsplit() to add the # path components up to the first slash to host # try to find this case... i = urlparts[1].find("\\") if i != -1: # ...and fix it by prepending the misplaced components to the path comps = urlparts[1][i:] # note: still has leading backslash if not urlparts[2] or urlparts[2]=='/': urlparts[2] = comps else: urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2])) urlparts[1] = urlparts[1][:i] else: urlparts[2] = urllib.unquote(urlparts[2]) # path urlparts[4] = urllib.unquote(urlparts[4]) # anchor path = urlparts[2].replace('\\', '/').replace('//', '/') if not path or path=='/': urlparts[2] = '/' else: # XXX this works only under windows and posix?? # collapse redundant path segments urlparts[2] = os.path.normpath(path).replace('\\', '/') if path.endswith('/'): urlparts[2] += '/' return urlparse.urlunsplit(urlparts) def url_quote (url): """quote given url""" urlparts = list(urlparse.urlsplit(url)) urlparts[0] = urllib.quote(urlparts[0]) # scheme urlparts[1] = urllib.quote(urlparts[1], ':') # host urlparts[2] = urllib.quote(urlparts[2], '/=,') # path l = [] for k,v in cgi.parse_qsl(urlparts[3], True): # query l.append("%s=%s" % (urllib.quote(k, '/-'), urllib.quote(v, '/-'))) urlparts[3] = '&'.join(l) urlparts[4] = urllib.quote(urlparts[4]) # anchor return urlparse.urlunsplit(urlparts) def document_quote (document): """quote given document""" doc, query = urllib.splitquery(document) doc = urllib.quote(doc, '/=,') if query: return "%s?%s" % (doc, query) return doc def match_url (url, domainlist): """return True if host part of url matches an entry in given domain list""" if not url: return False return match_host(spliturl(url)[1], domainlist) def match_host (host, domainlist): """return True if host matches an entry in given domain list""" if not host: return False for domain in domainlist: if host.endswith(domain): return True return False default_ports = { 'http' : 80, 'https' : 443, 'nntps' : 563, } def spliturl (url): """split url in a tuple (scheme, hostname, port, document) where hostname is always lowercased precondition: url is syntactically correct URI (eg has no whitespace)""" scheme, netloc = urllib.splittype(url) host, document = urllib.splithost(netloc) port = default_ports.get(scheme, 80) if host: host = host.lower() host, port = urllib.splitnport(host, port) return scheme, host, port, document # constants defining url part indexes SCHEME = 0 HOSTNAME = DOMAIN = 1 PORT = 2 DOCUMENT = 3