mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-21 08:20:25 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1611 e7d03fd6-7b0d-0410-9947-9c21f3af8025
313 lines
11 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
"""url utils"""
|
|
# Copyright (C) 2000-2004 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
import re
|
|
import urlparse
|
|
import urllib
|
|
|
|
# constants defining url part indexes
# (indexes into the (scheme, hostname, port, document) tuple
# returned by spliturl() below)
SCHEME = 0
HOSTNAME = DOMAIN = 1
PORT = 2
DOCUMENT = 3

# default port numbers per scheme; used to strip redundant ports
# from URLs during normalization
default_ports = {
    'http' : 80,
    'https' : 443,
    'nntps' : 563,
}
|
|
|
|
# adapted from David Wheelers "Secure Programming for Linux and Unix HOWTO"
# http://www.dwheeler.com/secure-programs/Secure-Programs-HOWTO/\
# filter-html.html#VALIDATING-URIS
# building blocks for the safe-URL regular expressions below
_basic = {
    "_az09": r"a-z0-9",
    "_path": r"\-\_\.\!\~\*\'\(\),",
    "_hex_safe": r"2-9a-f",
    "_hex_full": r"0-9a-f",
}
# one safe character: unreserved char, '+', or a %xy escape
_safe_char = r"([%(_az09)s%(_path)s\+]|"\
             r"(%%[%(_hex_safe)s][%(_hex_full)s]))" % _basic
_safe_scheme_pattern = r"(https?|ftp)"
# dot-separated labels, each starting with a letter or digit
_safe_domain_pattern = r"([%(_az09)s][%(_az09)s\-]*"\
                       r"(\.[%(_az09)s][%(_az09)s\-]*)*\.?)" % _basic
# host: domain plus an optional well-known HTTP port
# (the former "% _basic" applied here was a no-op: no placeholders)
_safe_host_pattern = _safe_domain_pattern+r"(:(80|8080|8000))?"
_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|"\
                     r"(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _basic
_safe_fragment_pattern = r"%s*" % _safe_char
# one CGI parameter, optionally with a value ("name" or "name=value")
_safe_cgi = r"%s+(=%s+)?" % (_safe_char, _safe_char)
_safe_query_pattern = r"(%s(&%s)*)?" % (_safe_cgi, _safe_cgi)
safe_url_pattern = r"%s://%s%s(#%s)?" % \
    (_safe_scheme_pattern, _safe_host_pattern,
     _safe_path_pattern, _safe_fragment_pattern)

# case-insensitive whole-string matchers; each returns a match object
# for safe input, None otherwise
is_safe_url = re.compile("(?i)^%s$"%safe_url_pattern).match
is_safe_domain = re.compile("(?i)^%s$"%_safe_domain_pattern).match
is_safe_host = re.compile("(?i)^%s$"%_safe_host_pattern).match
is_safe_path = re.compile("(?i)^%s$"%_safe_path_pattern).match
is_safe_query = re.compile("(?i)^%s$"%_safe_query_pattern).match
is_safe_fragment = re.compile("(?i)^%s$"%_safe_fragment_pattern).match
|
|
|
def is_safe_js_url (urlstr):
    """test javascript URLs

    Accepts only plain http URLs whose host, path, query and fragment
    parts each match the conservative safe patterns of this module.
    """
    parts = urlparse.urlsplit(urlstr)
    if parts[0].lower() != 'http':
        return False
    # validate each remaining URL part with its dedicated matcher
    checks = (
        (is_safe_host, parts[1]),
        (is_safe_path, parts[2]),
        (is_safe_query, parts[3]),
        (is_safe_fragment, parts[4]),
    )
    for matcher, part in checks:
        if not matcher(part):
            return False
    return True
|
|
|
|
|
|
def is_numeric_port (portstr):
    """return True iff portstr is a valid port number"""
    if not portstr.isdigit():
        return False
    # valid TCP/UDP port numbers are 1..65535 (65536 == 2**16)
    return 0 < int(portstr) < 65536
|
|
|
|
|
|
def safe_host_pattern (host):
    """Build a case-insensitive regular expression pattern matching
    safe URLs restricted to the given host."""
    template = "(?i)%s://%s%s(#%s)?"
    return template % (_safe_scheme_pattern, host,
                       _safe_path_pattern, _safe_fragment_pattern)
|
|
|
|
|
|
# XXX better name/implementation for this function
|
|
def stripsite (url):
|
|
"""remove scheme and host from URL. return host, newurl"""
|
|
url = urlparse.urlsplit(url)
|
|
return url[1], urlparse.urlunsplit((0, 0, url[2], url[3], url[4]))
|
|
|
|
|
|
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
|
|
"""Parse a query given as a string argument.
|
|
|
|
Arguments:
|
|
|
|
qs: URL-encoded query string to be parsed
|
|
|
|
keep_blank_values: flag indicating whether blank values in
|
|
URL encoded queries should be treated as blank strings. A
|
|
true value indicates that blanks should be retained as blank
|
|
strings. The default false value indicates that blank values
|
|
are to be ignored and treated as if they were not included.
|
|
|
|
strict_parsing: flag indicating what to do with parsing errors. If
|
|
false (the default), errors are silently ignored. If true,
|
|
errors raise a ValueError exception.
|
|
|
|
Returns a list, as G-d intended.
|
|
"""
|
|
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
|
|
r = []
|
|
for name_value in pairs:
|
|
nv = name_value.split('=', 1)
|
|
if len(nv) != 2:
|
|
if strict_parsing:
|
|
raise ValueError, "bad query field: %s" % `name_value`
|
|
elif len(nv) == 1:
|
|
nv = (nv[0], "")
|
|
else:
|
|
continue
|
|
if len(nv[1]) or keep_blank_values:
|
|
name = urllib.unquote(nv[0].replace('+', ' '))
|
|
value = urllib.unquote(nv[1].replace('+', ' '))
|
|
r.append((name, value))
|
|
return r
|
|
|
|
|
|
def url_norm (url):
    """fix and normalize URL which must be quoted

    Lowercases scheme and host, repairs paths that urlsplit()
    mis-parsed because of backslashes, strips a trailing host dot and
    the scheme's default port, re-encodes the query string and
    collapses redundant path segments.  Returns the re-quoted URL.
    """
    urlparts = list(urlparse.urlsplit(url))
    urlparts[0] = urllib.unquote(urlparts[0]).lower() # scheme
    urlparts[1] = urllib.unquote(urlparts[1]).lower() # host
    # a leading backslash in path causes urlsplit() to add the
    # path components up to the first slash to host
    # try to find this case...
    i = urlparts[1].find("\\")
    if i != -1:
        # ...and fix it by prepending the misplaced components to the path
        comps = urlparts[1][i:] # note: still has leading backslash
        if not urlparts[2] or urlparts[2] == '/':
            urlparts[2] = comps
        else:
            urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2]))
        urlparts[1] = urlparts[1][:i]
    else:
        urlparts[2] = urllib.unquote(urlparts[2]) # path
    if urlparts[1]:
        # split optional "user:password@" credentials from the host
        userpass, host = urllib.splituser(urlparts[1])
        if userpass:
            # append AT for easy concatenation
            userpass += "@"
        else:
            userpass = ""
        # splitnport() yields port -1 for an invalid numeric port
        host, port = urllib.splitnport(host)
        # remove trailing dot
        if host.endswith("."):
            host = host[:-1]
        # remove a default (or invalid) port
        if port in [-1, None, default_ports.get(urlparts[0])]:
            urlparts[1] = userpass+host
        else:
            urlparts[1] = "%s%s:%d" % (userpass, host, port)
    # re-encode the query: unquote each name/value via parse_qsl(),
    # then re-quote with a fixed set of safe characters
    l = []
    for k, v in parse_qsl(urlparts[3], True): # query
        k = urllib.quote(k, '/-:,')
        if v:
            v = urllib.quote(v, '/-:,')
            l.append("%s=%s" % (k, v))
        else:
            l.append(k)
    urlparts[3] = '&'.join(l)
    if not urlparts[2]:
        # empty path should be a slash, but not in certain schemes
        # note that in relative links, urlparts[0] might be empty
        # in this case, do not make any assumptions
        if urlparts[0] and urlparts[0] not in urlparse.non_hierarchical:
            urlparts[2] = '/'
    else:
        # fix redundant path parts
        urlparts[2] = collapse_segments(urlparts[2])
    # quote parts again
    urlparts[0] = urllib.quote(urlparts[0]) # scheme
    urlparts[1] = urllib.quote(urlparts[1], ':@') # host
    nopathquote = ';/=,~*-+()@'
    urlparts[2] = urllib.quote(urlparts[2], nopathquote) # path
    res = urlparse.urlunsplit(urlparts)
    # urlunsplit() drops an empty fragment, so restore "...#" by hand
    if url.endswith('#') and not urlparts[4]:
        # re-append trailing empty fragment
        res += '#'
    return res
|
|
|
|
|
|
# one or more consecutive slashes
_slashes_ro = re.compile(r"/+")
# a "." segment in the middle or at the end of the path
_samedir_ro = re.compile(r"/\./|/\.$")
# ".." segments at the path start, or one segment followed by ".."
_parentdir_ro = re.compile(r"^/(\.\./)+|/[^/]+/\.\.(/|$)")
def collapse_segments (path):
    """Remove all redundant segments from the given URL path.
    Precondition: path is an unquoted url path
    """
    # replace backslashes
    # note: this is _against_ the specification (which would require
    # backslashes to be left alone, and finally quoted with '%5C')
    # But replacing has several positive effects:
    # - Prevents path attacks on Windows systems (using \.. parent refs)
    # - Fixes bad urls where users used backslashes instead of slashes.
    #   This is a far more probable case than users having an intentional
    #   backslash in the path name.
    collapsed = path.replace('\\', '/')
    # shrink slash runs to a single slash and drop "." segments
    collapsed = _samedir_ro.sub("/", _slashes_ro.sub("/", collapsed))
    # resolve ".." segments; the substitutions happen left to right
    # (see _parentdir_ro above), so repeat until a fixed point
    while True:
        reduced = _parentdir_ro.sub("/", collapsed)
        if reduced == collapsed:
            return collapsed
        collapsed = reduced
|
|
|
|
|
|
# matcher telling whether a URL starts with a scheme (e.g. "http:");
# returns a match object for absolute URLs, None for relative ones
url_is_absolute = re.compile("^[a-z]+:", re.I).match
|
|
|
|
|
|
def url_quote (url):
    """quote given URL

    Relative URLs are delegated to document_quote().  Absolute URLs
    are split into parts, each part percent-quoted with its own set of
    safe characters, and reassembled.
    """
    if not url_is_absolute(url):
        return document_quote(url)
    urlparts = list(urlparse.urlsplit(url))
    urlparts[0] = urllib.quote(urlparts[0]) # scheme
    urlparts[1] = urllib.quote(urlparts[1], ':') # host
    urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
    # NOTE(review): the query is quoted here and then parsed (and
    # unquoted) again by parse_qsl() below; a literal '+' therefore
    # ends up as %2B instead of a space -- confirm this is intended.
    urlparts[3] = urllib.quote(urlparts[3], '&=,') # query
    l = []
    for k, v in parse_qsl(urlparts[3], True): # query
        k = urllib.quote(k, '/-:,')
        if v:
            v = urllib.quote(v, '/-:,')
            l.append("%s=%s" % (k, v))
        else:
            l.append(k)
    urlparts[3] = '&'.join(l)
    urlparts[4] = urllib.quote(urlparts[4]) # anchor
    return urlparse.urlunsplit(urlparts)
|
|
|
|
|
|
def document_quote (document):
|
|
"""quote given document"""
|
|
doc, query = urllib.splitquery(document)
|
|
doc = urllib.quote(doc, '/=,')
|
|
if query:
|
|
return "%s?%s" % (doc, query)
|
|
return doc
|
|
|
|
|
|
def match_url (url, domainlist):
    """return True if host part of url matches an entry in given domain
    list"""
    if not url:
        return False
    # extract the hostname part and delegate to match_host()
    hostname = spliturl(url)[1]
    return match_host(hostname, domainlist)
|
|
|
|
|
|
def match_host (host, domainlist):
    """return True if host matches an entry in given domain list"""
    if not host:
        return False
    # NOTE(review): a plain suffix test also matches e.g.
    # "notexample.com" against "example.com" -- confirm intended.
    for candidate in domainlist:
        if not host.endswith(candidate):
            continue
        return True
    return False
|
|
|
|
|
|
# every character that may appear in an already-quoted URL
_safe_url_chars = re.compile(r"^[-a-zA-Z0-9_:/\.,~;=&#%()@]*$")
def url_needs_quoting (url):
    """Check if url needs percent quoting. Note that the method does
    only check basic character sets, and not any other syntax.
    The URL might still be syntactically incorrect even when
    it is properly quoted..
    """
    # trailing whitespace is a special case: '$' in the regex would
    # match just before a trailing newline
    if url != url.rstrip():
        return True
    return _safe_url_chars.match(url) is None
|
|
|
|
|
|
def spliturl (url):
    """Split url in a tuple (scheme, hostname, port, document) where
    hostname is always lowercased.
    Precondition: url is syntactically correct URI (eg has no whitespace)
    """
    scheme, rest = urllib.splittype(url)
    host, document = urllib.splithost(rest)
    # fall back to port 80 for schemes not listed in default_ports
    port = default_ports.get(scheme, 80)
    if host:
        host, port = urllib.splitnport(host.lower(), port)
    return scheme, host, port, document
|