mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-12 16:43:11 +00:00
test for hierarchical URLs, and retain non-numeric port parts
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2739 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
907facaf16
commit
b559c4f82e
1 changed file with 17 additions and 13 deletions
|
|
@@ -23,6 +23,8 @@ import os
|
|||
import urlparse
|
||||
import urllib
|
||||
|
||||
urlparse.uses_netloc.append('ldap')
|
||||
|
||||
# constants defining url part indexes
|
||||
SCHEME = 0
|
||||
HOSTNAME = DOMAIN = 1
|
||||
|
|
@@ -229,7 +231,9 @@ def url_fix_host (urlparts):
|
|||
userpass += "@"
|
||||
else:
|
||||
userpass = ""
|
||||
host, port = urllib.splitnport(host)
|
||||
newhost, port = urllib.splitnport(host)
|
||||
if port is not None:
|
||||
host = newhost
|
||||
# remove trailing dot
|
||||
if host.endswith("."):
|
||||
host = host[:-1]
|
||||
|
|
@@ -301,18 +305,18 @@ def url_norm (url):
|
|||
is_idn = url_fix_host(urlparts)
|
||||
# query
|
||||
urlparts[3] = url_parse_query(urlparts[3])
|
||||
if not urlparts[2]:
|
||||
# empty path is allowed if url is non-hierarchical, or if both
|
||||
# query and fragment are also empty
|
||||
# note that in relative links, urlparts[0] might be empty
|
||||
# in this case, do not make any assumptions
|
||||
if urlparts[0] and \
|
||||
urlparts[0] not in urlparse.non_hierarchical and \
|
||||
(urlparts[3] or urlparts[4]):
|
||||
urlparts[2] = '/'
|
||||
else:
|
||||
# fix redundant path parts
|
||||
urlparts[2] = collapse_segments(urlparts[2])
|
||||
is_hierarchical = urlparts[0] not in urlparse.non_hierarchical
|
||||
if is_hierarchical:
|
||||
# URL has a hierarchical path we should norm
|
||||
if not urlparts[2]:
|
||||
# Empty path is allowed if both query and fragment are also empty.
|
||||
# Note that in relative links, urlparts[0] might be empty.
|
||||
# In this case, do not make any assumptions.
|
||||
if urlparts[0] and (urlparts[3] or urlparts[4]):
|
||||
urlparts[2] = '/'
|
||||
else:
|
||||
# fix redundant path parts
|
||||
urlparts[2] = collapse_segments(urlparts[2])
|
||||
# quote parts again
|
||||
urlparts[0] = urllib.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = urllib.quote(urlparts[1], '@:') # host
|
||||
|
|
|
|||
Loading…
Reference in a new issue