mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-05 13:14:46 +00:00
Allow wayback-format urls without affecting atom 'feed' urls
This commit is contained in:
parent
19a5f19282
commit
233e7dcf68
3 changed files with 16 additions and 0 deletions
|
|
@ -49,6 +49,7 @@ from .const import (WARN_URL_EFFECTIVE_URL,
|
|||
WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE,
|
||||
WARN_URL_WHITESPACE, URL_MAX_LENGTH, WARN_URL_TOO_LONG,
|
||||
ExcList, ExcSyntaxList, ExcNoCacheList)
|
||||
from ..url import url_fix_wayback_query
|
||||
|
||||
# helper alias
|
||||
unicode_safe = strformat.unicode_safe
|
||||
|
|
@ -377,6 +378,8 @@ class UrlBase (object):
|
|||
urlparts = list(urlparse.urlsplit(self.url))
|
||||
if urlparts[2]:
|
||||
urlparts[2] = urlutil.collapse_segments(urlparts[2])
|
||||
if not urlparts[0].startswith("feed"):
|
||||
urlparts[2] = url_fix_wayback_query(urlparts[2]) # restore second / in http[s]:// in wayback path
|
||||
self.url = urlutil.urlunsplit(urlparts)
|
||||
# split into (modifiable) list
|
||||
self.urlparts = strformat.url_unicode_split(self.url)
|
||||
|
|
|
|||
|
|
@ -246,6 +246,13 @@ def url_fix_mailto_urlsplit (urlparts):
|
|||
if "?" in urlparts[2]:
|
||||
urlparts[2], urlparts[3] = urlparts[2].split('?', 1)
|
||||
|
||||
# Wayback-style URLs embed a second URL, including its http[s]:// scheme,
# inside the path. The tidying mechanism in linkchecker percent-encodes that
# inner ':' and deletes the second slash after it. The helper below reverses
# both corrections; it expects only the path section of the URL as input.
wayback_regex = re.compile(r'(https?)(?:%3[Aa]|:)/+')
def url_fix_wayback_query(path):
    """Restore the '://' separator of an http[s] URL embedded in a path.

    Every occurrence of 'http' or 'https' followed by ':' or an encoded
    colon ('%3A' / '%3a') and one or more slashes is rewritten to
    '<scheme>://'. Paths without such an occurrence are returned unchanged.
    """
    # '/+' swallows every slash after the colon, so a path that already
    # holds 'http://' stays as it is instead of growing to 'http:///'.
    return wayback_regex.sub(r'\1://', path)
|
||||
|
||||
def url_parse_query (query, encoding=None):
|
||||
"""Parse and re-join the given CGI query."""
|
||||
|
|
@ -329,6 +336,8 @@ def url_norm (url, encoding=None):
|
|||
urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme
|
||||
urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host
|
||||
urlparts[2] = url_quote_part(urlparts[2], safechars=_nopathquote_chars, encoding=encoding) # path
|
||||
if not urlparts[0].startswith("feed"):
|
||||
urlparts[2] = url_fix_wayback_query(urlparts[2]) # unencode colon in http[s]:// in wayback path
|
||||
urlparts[4] = url_quote_part(urlparts[4], encoding=encoding) # anchor
|
||||
res = urlunsplit(urlparts)
|
||||
if url.endswith('#') and not urlparts[4]:
|
||||
|
|
|
|||
|
|
@ -52,6 +52,10 @@ class TestUrl (unittest.TestCase):
|
|||
"Normed URL %r needs quoting" % nurl)
|
||||
self.assertEqual(nurl1, nurl)
|
||||
|
||||
def test_wayback(self):
    # A wayback-style URL carries a second URL inside its path. After
    # normalization the embedded scheme separator must read '://' and must
    # not appear in its percent-encoded, single-slash form.
    nurl = url_norm("https://a.b.c/*/http://x.y.z")
    # assertIn/assertNotIn report both operands when the check fails.
    self.assertNotIn("http%3A/x", nurl)
    self.assertIn("http://x", nurl)
|
||||
|
||||
def test_pathattack (self):
|
||||
# Windows winamp path attack prevention.
|
||||
url = "http://server/..%5c..%5c..%5c..%5c..%5c..%5c..%5c.."\
|
||||
|
|
|
|||
Loading…
Reference in a new issue