mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-07 16:10:58 +00:00
updated from webcleaner
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1330 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
7fd052db44
commit
58fab5a44f
1 changed file with 95 additions and 31 deletions
126
linkcheck/url.py
126
linkcheck/url.py
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""url utils"""
|
||||
"""url utils, can be used as a standalone module"""
|
||||
# Copyright (C) 2000-2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -19,27 +19,58 @@
|
|||
__version__ = "$Revision$"[11:-2]
|
||||
__date__ = "$Date$"[7:-2]
|
||||
|
||||
import re, urlparse, os
|
||||
from urllib import splittype, splithost, splitnport, splitquery, quote, unquote
|
||||
import re
|
||||
import urlparse
|
||||
import os
|
||||
import urllib
|
||||
import cgi
|
||||
|
||||
|
||||
# adapted from David Wheelers "Secure Programming for Linux and Unix HOWTO"
# http://www.dwheeler.com/secure-programs/Secure-Programs-HOWTO/filter-html.html#VALIDATING-URIS
# character classes used to build the safe-url regular expressions below
_basic = {
    "_az09": r"a-z0-9",
    "_path": r"\-\_\.\!\~\*\'\(\),",
    "_hex_safe": r"2-9a-f",
    "_hex_full": r"0-9a-f",
}
# one safe character: alphanumeric, safe punctuation, '+' or a %-escape
_safe_char = r"([%(_az09)s%(_path)s\+]|(%%[%(_hex_safe)s][%(_hex_full)s]))"%_basic
_safe_scheme_pattern = r"(https?|ftp)"
# host name, optionally followed by one of the well-known web ports
_safe_host_pattern = r"([%(_az09)s][%(_az09)s\-]*(\.[%(_az09)s][%(_az09)s\-]*)*\.?)(:(80|8080|8000))?"%_basic
_safe_path_pattern = r"((/([%(_az09)s%(_path)s]|(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)"%_basic
_safe_fragment_pattern = r"%s*"%_safe_char
# one key(=value) CGI query parameter
_safe_cgi = r"%s+(=%s+)?" % (_safe_char, _safe_char)
_safe_query_pattern = r"(%s(&%s)*)?"%(_safe_cgi, _safe_cgi)
safe_url_pattern = r"%s://%s%s(#%s)?" % \
                   (_safe_scheme_pattern, _safe_host_pattern,
                    _safe_path_pattern, _safe_fragment_pattern)

# case-insensitive full-string matchers for the patterns above;
# each returns a match object (truthy) or None
is_valid_url = re.compile("(?i)^%s$"%safe_url_pattern).match
is_valid_host = re.compile("(?i)^%s$"%_safe_host_pattern).match
is_valid_path = re.compile("(?i)^%s$"%_safe_path_pattern).match
is_valid_query = re.compile("(?i)^%s$"%_safe_query_pattern).match
is_valid_fragment = re.compile("(?i)^%s$"%_safe_fragment_pattern).match
|
||||
|
||||
def is_valid_js_url (urlstr):
    """test javascript urls

    return True only when the url parses as a plain http url whose
    host, path, query and fragment all match the safe patterns above
    """
    url = urlparse.urlsplit(urlstr)
    # only the http scheme is considered valid here
    if url[0].lower() != 'http':
        return False
    if not is_valid_host(url[1]):
        return False
    if not is_valid_path(url[2]):
        return False
    if not is_valid_query(url[3]):
        return False
    if not is_valid_fragment(url[4]):
        return False
    return True
|
||||
|
||||
def safe_host_pattern (host):
    """return regular expression pattern with given host for url testing"""
    # same shape as safe_url_pattern, but with the host part fixed to
    # the caller-supplied (regex) host
    return "(?i)%s://%s%s(#%s)?" % \
       (_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern)
|
||||
|
||||
|
||||
# XXX better name/implementation for this function
|
||||
|
|
@ -52,11 +83,24 @@ def stripsite (url):
|
|||
def url_norm (url):
|
||||
"""unquote and normalize url which must be quoted"""
|
||||
urlparts = list(urlparse.urlsplit(url))
|
||||
urlparts[0] = unquote(urlparts[0])
|
||||
urlparts[1] = unquote(urlparts[1])
|
||||
urlparts[2] = unquote(urlparts[2])
|
||||
urlparts[4] = unquote(urlparts[4])
|
||||
path = urlparts[2].replace('\\', '/')
|
||||
urlparts[0] = urllib.unquote(urlparts[0]) # scheme
|
||||
urlparts[1] = urllib.unquote(urlparts[1]) # host
|
||||
# a leading backslash in path causes urlsplit() to add the
|
||||
# path components up to the first slash to host
|
||||
# try to find this case...
|
||||
i = urlparts[1].find("\\")
|
||||
if i != -1:
|
||||
# ...and fix it by prepending the misplaced components to the path
|
||||
comps = urlparts[1][i:] # note: still has leading backslash
|
||||
if not urlparts[2] or urlparts[2]=='/':
|
||||
urlparts[2] = comps
|
||||
else:
|
||||
urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2]))
|
||||
urlparts[1] = urlparts[1][:i]
|
||||
else:
|
||||
urlparts[2] = urllib.unquote(urlparts[2]) # path
|
||||
urlparts[4] = urllib.unquote(urlparts[4]) # anchor
|
||||
path = urlparts[2].replace('\\', '/').replace('//', '/')
|
||||
if not path or path=='/':
|
||||
urlparts[2] = '/'
|
||||
else:
|
||||
|
|
@ -70,25 +114,45 @@ def url_norm (url):
|
|||
|
||||
def url_quote (url):
    """quote given url

    scheme, host, path and anchor are quoted directly; the query string
    is parsed and re-quoted parameter by parameter
    """
    if not url:
        return ""
    urlparts = list(urlparse.urlsplit(url))
    urlparts[0] = urllib.quote(urlparts[0]) # scheme
    urlparts[1] = urllib.quote(urlparts[1], ':') # host
    urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
    l = []
    # keep_blank_values=True so "a=&b" style parameters survive
    for k, v in cgi.parse_qsl(urlparts[3], True): # query
        l.append("%s=%s" % (urllib.quote(k, '/-'), urllib.quote(v, '/-')))
    urlparts[3] = '&'.join(l)
    urlparts[4] = urllib.quote(urlparts[4]) # anchor
    return urlparse.urlunsplit(urlparts)
|
||||
|
||||
|
||||
def document_quote (document):
    """quote given document

    the document part before any '?' is quoted; an existing query
    string is appended unchanged
    """
    doc, query = urllib.splitquery(document)
    doc = urllib.quote(doc, '/=,')
    if query:
        return "%s?%s" % (doc, query)
    return doc
|
||||
|
||||
|
||||
def match_url (url, domainlist):
    """return True if host part of url matches an entry in given domain
       list"""
    # empty/None urls never match; otherwise compare the host part
    return bool(url) and match_host(spliturl(url)[1], domainlist)
|
||||
|
||||
|
||||
def match_host (host, domainlist):
    """return True if host matches an entry in given domain list"""
    if not host:
        return False
    # a suffix match against any listed domain counts as a hit
    return any(host.endswith(entry) for entry in domainlist)
|
||||
|
||||
|
||||
default_ports = {
|
||||
'http' : 80,
|
||||
'https' : 443,
|
||||
|
|
@ -99,12 +163,12 @@ def spliturl (url):
|
|||
"""split url in a tuple (scheme, hostname, port, document) where
|
||||
hostname is always lowercased
|
||||
precondition: url is syntactically correct URI (eg has no whitespace)"""
|
||||
scheme, netloc = splittype(url)
|
||||
host, document = splithost(netloc)
|
||||
scheme, netloc = urllib.splittype(url)
|
||||
host, document = urllib.splithost(netloc)
|
||||
port = default_ports.get(scheme, 80)
|
||||
if host:
|
||||
host = host.lower()
|
||||
host, port = splitnport(host, port)
|
||||
host, port = urllib.splitnport(host, port)
|
||||
return scheme, host, port, document
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue