Allow wayback-format urls without affecting atom 'feed' urls

This commit is contained in:
Graham Seaman 2017-02-02 11:17:27 +00:00
parent 19a5f19282
commit 233e7dcf68
3 changed files with 16 additions and 0 deletions

View file

@ -49,6 +49,7 @@ from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE,
WARN_URL_WHITESPACE, URL_MAX_LENGTH, WARN_URL_TOO_LONG,
ExcList, ExcSyntaxList, ExcNoCacheList)
from ..url import url_fix_wayback_query
# helper alias
unicode_safe = strformat.unicode_safe
@ -377,6 +378,8 @@ class UrlBase (object):
urlparts = list(urlparse.urlsplit(self.url))
if urlparts[2]:
urlparts[2] = urlutil.collapse_segments(urlparts[2])
if not urlparts[0].startswith("feed"):
urlparts[2] = url_fix_wayback_query(urlparts[2]) # restore second / in http[s]:// in wayback path
self.url = urlutil.urlunsplit(urlparts)
# split into (modifiable) list
self.urlparts = strformat.url_unicode_split(self.url)

View file

@ -246,6 +246,13 @@ def url_fix_mailto_urlsplit (urlparts):
if "?" in urlparts[2]:
urlparts[2], urlparts[3] = urlparts[2].split('?', 1)
# Wayback urls embed a complete http[s]:// URL inside their path. By default
# the tidying mechanism in linkchecker encodes the ":" and deletes the second
# slash; url_fix_wayback_query() reverses both corrections.
# The encoded colon is accepted in either hex case, and an optional second
# slash is consumed so that an already intact "http://" is left unchanged
# instead of growing a third slash.
wayback_regex = re.compile(r'(https?)(?:%3[Aa]|:)//?')


def url_fix_wayback_query(path):
    """Restore the embedded ``http[s]://`` prefix in a wayback-style path.

    Every occurrence of ``http`` or ``https`` followed by a literal or
    percent-encoded colon and one or two slashes is rewritten to the
    scheme followed by ``://``. Applying the function to its own output
    returns the same string.

    @param path: only the path section of a URL
    @return: the path with embedded ``http[s]://`` prefixes restored
    """
    return wayback_regex.sub(r'\1://', path)
def url_parse_query (query, encoding=None):
"""Parse and re-join the given CGI query."""
@ -329,6 +336,8 @@ def url_norm (url, encoding=None):
urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme
urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host
urlparts[2] = url_quote_part(urlparts[2], safechars=_nopathquote_chars, encoding=encoding) # path
if not urlparts[0].startswith("feed"):
urlparts[2] = url_fix_wayback_query(urlparts[2]) # unencode colon in http[s]:// in wayback path
urlparts[4] = url_quote_part(urlparts[4], encoding=encoding) # anchor
res = urlunsplit(urlparts)
if url.endswith('#') and not urlparts[4]:

View file

@ -52,6 +52,10 @@ class TestUrl (unittest.TestCase):
"Normed URL %r needs quoting" % nurl)
self.assertEqual(nurl1, nurl)
def test_wayback(self):
    """Check that an http:// URL embedded in a path survives url_norm."""
    normed = url_norm("https://a.b.c/*/http://x.y.z")
    # The embedded colon must not be left percent-encoded ...
    self.assertNotIn("http%3A/x", normed)
    # ... and the double slash after the embedded scheme must be present.
    self.assertIn("http://x", normed)
def test_pathattack (self):
# Windows winamp path attack prevention.
url = "http://server/..%5c..%5c..%5c..%5c..%5c..%5c..%5c.."\