Merge branch 'takeflight-bug/url-check'

2026-05-02 04:24:46 +00:00 · 2014-10-23 12:40:38 +01:00 · 2014-10-23 12:40:38 +01:00 · 8bb1a8a459
commit 8bb1a8a459
parent bdcef9e05f 4b466ae041
2 changed files with 27 additions and 6 deletions
--- a/wagtail/wagtailcore/tests/test_whitelist.py
+++ b/wagtail/wagtailcore/tests/test_whitelist.py
@@ -17,6 +17,13 @@ class TestCheckUrl(TestCase):
    def test_disallowed_url_scheme(self):
        self.assertFalse(bool(check_url("invalid://url")))

+    def test_crafty_disallowed_url_scheme(self):
+        """
+        Browsers, unlike some URL parsers, treat 'jav\tascript:' as a scheme.
+        """
+        crafty_url = "jav\tascript:alert('XSS')"
+        self.assertFalse(bool(check_url(crafty_url)))
+

 class TestAttributeRule(TestCase):
    def setUp(self):
--- a/wagtail/wagtailcore/whitelist.py
+++ b/wagtail/wagtailcore/whitelist.py
@@ -2,19 +2,33 @@
 A generic HTML whitelisting engine, designed to accommodate subclassing to override
 specific rules.
 """
-from six.moves.urllib.parse import urlparse
+import re
+

 from bs4 import BeautifulSoup, NavigableString, Tag


-ALLOWED_URL_SCHEMES = ['', 'http', 'https', 'ftp', 'mailto', 'tel']
+ALLOWED_URL_SCHEMES = ['http', 'https', 'ftp', 'mailto', 'tel']
+# Matches a leading "scheme:" prefix (lower-case letters, digits, "+-.").
+PROTOCOL_RE = re.compile("^[a-z0-9][-+.a-z0-9]*:")


 def check_url(url_string):
-    # TODO: more paranoid checks (urlparse doesn't catch
-    # "jav\tascript:alert('XSS')")
-    url = urlparse(url_string)
-    return (url_string if url.scheme in ALLOWED_URL_SCHEMES else None)
+    """Return url_string if its scheme is allowed or absent, else None."""
+    # Browsers may ignore whitespace and control characters in a scheme, so
+    # 'jav\tascript:alert("XSS")' is treated as a javascript: link. Test a
+    # lower-cased copy with those characters removed; return the original.
+    unescaped = url_string.lower()
+    unescaped = unescaped.replace("&lt;", "<")
+    unescaped = unescaped.replace("&gt;", ">")
+    unescaped = unescaped.replace("&amp;", "&")
+    unescaped = re.sub(r"[`\000-\040\177-\240\s]+", '', unescaped)
+    unescaped = unescaped.replace("\ufffd", "")
+    if PROTOCOL_RE.match(unescaped):
+        protocol = unescaped.split(':', 1)[0]
+        if protocol not in ALLOWED_URL_SCHEMES:
+            return None
+    return url_string


 def attribute_rule(allowed_attrs):