mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-21 08:20:25 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1611 e7d03fd6-7b0d-0410-9947-9c21f3af8025
313 lines
11 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
"""url utils"""
|
|
# Copyright (C) 2000-2004 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
import re
|
|
import urlparse
|
|
import urllib
|
|
|
|
# constants defining url part indexes
# (indexes into the (scheme, hostname, port, document) tuple
# returned by spliturl() below)
SCHEME = 0
HOSTNAME = DOMAIN = 1
PORT = 2
DOCUMENT = 3

# default port numbers per scheme; used to strip redundant ports
# from URLs during normalization
default_ports = {
    'http' : 80,
    'https' : 443,
    'nntps' : 563,
}
|
|
|
|
# adapted from David Wheelers "Secure Programming for Linux and Unix HOWTO"
# http://www.dwheeler.com/secure-programs/Secure-Programs-HOWTO/\
# filter-html.html#VALIDATING-URIS
# building blocks for the safe-URL regular expressions below
_basic = {
    "_az09": r"a-z0-9",
    "_path": r"\-\_\.\!\~\*\'\(\),",
    "_hex_safe": r"2-9a-f",
    "_hex_full": r"0-9a-f",
}
# one safe character: unreserved char, '+', or a %xy escape
_safe_char = r"([%(_az09)s%(_path)s\+]|"\
             r"(%%[%(_hex_safe)s][%(_hex_full)s]))" % _basic
_safe_scheme_pattern = r"(https?|ftp)"
# dot-separated labels, each starting with a letter or digit
_safe_domain_pattern = r"([%(_az09)s][%(_az09)s\-]*"\
                       r"(\.[%(_az09)s][%(_az09)s\-]*)*\.?)" % _basic
# host: domain plus an optional well-known HTTP port
# (the former "% _basic" applied here was a no-op: no placeholders)
_safe_host_pattern = _safe_domain_pattern+r"(:(80|8080|8000))?"
_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|"\
                     r"(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _basic
_safe_fragment_pattern = r"%s*" % _safe_char
# one CGI parameter, optionally with a value ("name" or "name=value")
_safe_cgi = r"%s+(=%s+)?" % (_safe_char, _safe_char)
_safe_query_pattern = r"(%s(&%s)*)?" % (_safe_cgi, _safe_cgi)
safe_url_pattern = r"%s://%s%s(#%s)?" % \
    (_safe_scheme_pattern, _safe_host_pattern,
     _safe_path_pattern, _safe_fragment_pattern)

# case-insensitive whole-string matchers; each returns a match object
# for safe input, None otherwise
is_safe_url = re.compile("(?i)^%s$"%safe_url_pattern).match
is_safe_domain = re.compile("(?i)^%s$"%_safe_domain_pattern).match
is_safe_host = re.compile("(?i)^%s$"%_safe_host_pattern).match
is_safe_path = re.compile("(?i)^%s$"%_safe_path_pattern).match
is_safe_query = re.compile("(?i)^%s$"%_safe_query_pattern).match
is_safe_fragment = re.compile("(?i)^%s$"%_safe_fragment_pattern).match
|
|
|
def is_safe_js_url (urlstr):
    """test javascript URLs

    Accepts only plain http URLs whose host, path, query and fragment
    parts each match the conservative safe patterns of this module.
    """
    parts = urlparse.urlsplit(urlstr)
    if parts[0].lower() != 'http':
        return False
    # validate each remaining URL part with its dedicated matcher
    checks = (
        (is_safe_host, parts[1]),
        (is_safe_path, parts[2]),
        (is_safe_query, parts[3]),
        (is_safe_fragment, parts[4]),
    )
    for matcher, part in checks:
        if not matcher(part):
            return False
    return True
|
|
|
|
|
|
def is_numeric_port (portstr):
    """return True iff portstr is a valid port number"""
    if not portstr.isdigit():
        return False
    # valid TCP/UDP port numbers are 1..65535 (65536 == 2**16)
    return 0 < int(portstr) < 65536
|
|
|
|
|
|
def safe_host_pattern (host):
    """Build a case-insensitive regular expression pattern matching
    safe URLs restricted to the given host."""
    template = "(?i)%s://%s%s(#%s)?"
    return template % (_safe_scheme_pattern, host,
                       _safe_path_pattern, _safe_fragment_pattern)
|
|
|
|
|
|
# XXX better name/implementation for this function
|
|
def stripsite (url):
|
|
"""remove scheme and host from URL. return host, newurl"""
|
|
url = urlparse.urlsplit(url)
|
|
return url[1], urlparse.urlunsplit((0, 0, url[2], url[3], url[4]))
|
|
|
|
|
|
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
|
|
"""Parse a query given as a string argument.
|
|
|
|
Arguments:
|
|
|
|
qs: URL-encoded query string to be parsed
|
|
|
|
keep_blank_values: flag indicating whether blank values in
|
|
URL encoded queries should be treated as blank strings. A
|
|
true value indicates that blanks should be retained as blank
|
|
strings. The default false value indicates that blank values
|
|
are to be ignored and treated as if they were not included.
|
|
|
|
strict_parsing: flag indicating what to do with parsing errors. If
|
|
false (the default), errors are silently ignored. If true,
|
|
errors raise a ValueError exception.
|
|
|
|
Returns a list, as G-d intended.
|
|
"""
|
|
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
|
|
r = []
|
|
for name_value in pairs:
|
|
nv = name_value.split('=', 1)
|
|
if len(nv) != 2:
|
|
if strict_parsing:
|
|
raise ValueError, "bad query field: %s" % `name_value`
|
|
elif len(nv) == 1:
|
|
nv = (nv[0], "")
|
|
else:
|
|
continue
|
|
if len(nv[1]) or keep_blank_values:
|
|
name = urllib.unquote(nv[0].replace('+', ' '))
|
|
value = urllib.unquote(nv[1].replace('+', ' '))
|
|
r.append((name, value))
|
|
return r
|
|
|
|
|
|
def url_norm (url):
    """fix and normalize URL which must be quoted

    Lowercases scheme and host, repairs paths that urlsplit()
    mis-parsed because of backslashes, strips a trailing host dot and
    the scheme's default port, re-encodes the query string and
    collapses redundant path segments.  Returns the re-quoted URL.
    """
    urlparts = list(urlparse.urlsplit(url))
    urlparts[0] = urllib.unquote(urlparts[0]).lower() # scheme
    urlparts[1] = urllib.unquote(urlparts[1]).lower() # host
    # a leading backslash in path causes urlsplit() to add the
    # path components up to the first slash to host
    # try to find this case...
    i = urlparts[1].find("\\")
    if i != -1:
        # ...and fix it by prepending the misplaced components to the path
        comps = urlparts[1][i:] # note: still has leading backslash
        if not urlparts[2] or urlparts[2] == '/':
            urlparts[2] = comps
        else:
            urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2]))
        urlparts[1] = urlparts[1][:i]
    else:
        urlparts[2] = urllib.unquote(urlparts[2]) # path
    if urlparts[1]:
        # split optional "user:password@" credentials from the host
        userpass, host = urllib.splituser(urlparts[1])
        if userpass:
            # append AT for easy concatenation
            userpass += "@"
        else:
            userpass = ""
        # splitnport() yields port -1 for an invalid numeric port
        host, port = urllib.splitnport(host)
        # remove trailing dot
        if host.endswith("."):
            host = host[:-1]
        # remove a default (or invalid) port
        if port in [-1, None, default_ports.get(urlparts[0])]:
            urlparts[1] = userpass+host
        else:
            urlparts[1] = "%s%s:%d" % (userpass, host, port)
    # re-encode the query: unquote each name/value via parse_qsl(),
    # then re-quote with a fixed set of safe characters
    l = []
    for k, v in parse_qsl(urlparts[3], True): # query
        k = urllib.quote(k, '/-:,')
        if v:
            v = urllib.quote(v, '/-:,')
            l.append("%s=%s" % (k, v))
        else:
            l.append(k)
    urlparts[3] = '&'.join(l)
    if not urlparts[2]:
        # empty path should be a slash, but not in certain schemes
        # note that in relative links, urlparts[0] might be empty
        # in this case, do not make any assumptions
        if urlparts[0] and urlparts[0] not in urlparse.non_hierarchical:
            urlparts[2] = '/'
    else:
        # fix redundant path parts
        urlparts[2] = collapse_segments(urlparts[2])
    # quote parts again
    urlparts[0] = urllib.quote(urlparts[0]) # scheme
    urlparts[1] = urllib.quote(urlparts[1], ':@') # host
    nopathquote = ';/=,~*-+()@'
    urlparts[2] = urllib.quote(urlparts[2], nopathquote) # path
    res = urlparse.urlunsplit(urlparts)
    # urlunsplit() drops an empty fragment, so restore "...#" by hand
    if url.endswith('#') and not urlparts[4]:
        # re-append trailing empty fragment
        res += '#'
    return res
|
|
|
|
|
|
# one or more consecutive slashes
_slashes_ro = re.compile(r"/+")
# a "." segment in the middle or at the end of the path
_samedir_ro = re.compile(r"/\./|/\.$")
# ".." segments at the path start, or one segment followed by ".."
_parentdir_ro = re.compile(r"^/(\.\./)+|/[^/]+/\.\.(/|$)")
def collapse_segments (path):
    """Remove all redundant segments from the given URL path.
    Precondition: path is an unquoted url path
    """
    # replace backslashes
    # note: this is _against_ the specification (which would require
    # backslashes to be left alone, and finally quoted with '%5C')
    # But replacing has several positive effects:
    # - Prevents path attacks on Windows systems (using \.. parent refs)
    # - Fixes bad urls where users used backslashes instead of slashes.
    #   This is a far more probable case than users having an intentional
    #   backslash in the path name.
    collapsed = path.replace('\\', '/')
    # shrink slash runs to a single slash and drop "." segments
    collapsed = _samedir_ro.sub("/", _slashes_ro.sub("/", collapsed))
    # resolve ".." segments; the substitutions happen left to right
    # (see _parentdir_ro above), so repeat until a fixed point
    while True:
        reduced = _parentdir_ro.sub("/", collapsed)
        if reduced == collapsed:
            return collapsed
        collapsed = reduced
|
|
|
|
|
|
# matcher telling whether a URL starts with a scheme (e.g. "http:");
# returns a match object for absolute URLs, None for relative ones
url_is_absolute = re.compile("^[a-z]+:", re.I).match
|
|
|
|
|
|
def url_quote (url):
    """quote given URL

    Relative URLs are delegated to document_quote().  Absolute URLs
    are split into parts, each part percent-quoted with its own set of
    safe characters, and reassembled.
    """
    if not url_is_absolute(url):
        return document_quote(url)
    urlparts = list(urlparse.urlsplit(url))
    urlparts[0] = urllib.quote(urlparts[0]) # scheme
    urlparts[1] = urllib.quote(urlparts[1], ':') # host
    urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
    # NOTE(review): the query is quoted here and then parsed (and
    # unquoted) again by parse_qsl() below; a literal '+' therefore
    # ends up as %2B instead of a space -- confirm this is intended.
    urlparts[3] = urllib.quote(urlparts[3], '&=,') # query
    l = []
    for k, v in parse_qsl(urlparts[3], True): # query
        k = urllib.quote(k, '/-:,')
        if v:
            v = urllib.quote(v, '/-:,')
            l.append("%s=%s" % (k, v))
        else:
            l.append(k)
    urlparts[3] = '&'.join(l)
    urlparts[4] = urllib.quote(urlparts[4]) # anchor
    return urlparse.urlunsplit(urlparts)
|
|
|
|
|
|
def document_quote (document):
|
|
"""quote given document"""
|
|
doc, query = urllib.splitquery(document)
|
|
doc = urllib.quote(doc, '/=,')
|
|
if query:
|
|
return "%s?%s" % (doc, query)
|
|
return doc
|
|
|
|
|
|
def match_url (url, domainlist):
    """return True if host part of url matches an entry in given domain
    list"""
    if not url:
        return False
    # extract the hostname part and delegate to match_host()
    hostname = spliturl(url)[1]
    return match_host(hostname, domainlist)
|
|
|
|
|
|
def match_host (host, domainlist):
    """return True if host matches an entry in given domain list"""
    if not host:
        return False
    # NOTE(review): a plain suffix test also matches e.g.
    # "notexample.com" against "example.com" -- confirm intended.
    for candidate in domainlist:
        if not host.endswith(candidate):
            continue
        return True
    return False
|
|
|
|
|
|
# every character that may appear in an already-quoted URL
_safe_url_chars = re.compile(r"^[-a-zA-Z0-9_:/\.,~;=&#%()@]*$")
def url_needs_quoting (url):
    """Check if url needs percent quoting. Note that the method does
    only check basic character sets, and not any other syntax.
    The URL might still be syntactically incorrect even when
    it is properly quoted..
    """
    # trailing whitespace is a special case: '$' in the regex would
    # match just before a trailing newline
    if url != url.rstrip():
        return True
    return _safe_url_chars.match(url) is None
|
|
|
|
|
|
def spliturl (url):
    """Split url in a tuple (scheme, hostname, port, document) where
    hostname is always lowercased.
    Precondition: url is syntactically correct URI (eg has no whitespace)
    """
    scheme, rest = urllib.splittype(url)
    host, document = urllib.splithost(rest)
    # fall back to port 80 for schemes not listed in default_ports
    port = default_ports.get(scheme, 80)
    if host:
        host, port = urllib.splitnport(host.lower(), port)
    return scheme, host, port, document
|