From 0cd53a766e3d68ecb5936fc0e6c2a910e4e5dc46 Mon Sep 17 00:00:00 2001 From: Matt Westcott Date: Wed, 27 Sep 2017 17:23:48 +0100 Subject: [PATCH] Make whitelister consistently use double-quotes on attributes --- wagtail/core/tests/test_whitelist.py | 5 +++++ wagtail/core/whitelist.py | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/wagtail/core/tests/test_whitelist.py b/wagtail/core/tests/test_whitelist.py index eb7c60d43..1eba4893b 100644 --- a/wagtail/core/tests/test_whitelist.py +++ b/wagtail/core/tests/test_whitelist.py @@ -143,3 +143,8 @@ class TestWhitelister(TestCase): string = 'snowman Yorkshire' cleaned_string = Whitelister.clean(string) self.assertEqual(cleaned_string, 'snowman Yorkshire') + + def test_quoting(self): + string = 'Arthur "two sheds" Jackson' + cleaned_string = Whitelister.clean(string) + self.assertEqual(cleaned_string, 'Arthur "two sheds" Jackson') diff --git a/wagtail/core/whitelist.py b/wagtail/core/whitelist.py index e93715d05..a2a4041e6 100644 --- a/wagtail/core/whitelist.py +++ b/wagtail/core/whitelist.py @@ -5,6 +5,7 @@ specific rules. import re from bs4 import BeautifulSoup, Comment, NavigableString, Tag +from django.utils.html import escape ALLOWED_URL_SCHEMES = ['http', 'https', 'ftp', 'mailto', 'tel'] @@ -96,7 +97,13 @@ class Whitelister: attributes""" doc = BeautifulSoup(html, 'html5lib') cls.clean_node(doc, doc) - return doc.decode() + + # Pass strings through django.utils.html.escape when generating the final HTML. + # This differs from BeautifulSoup's default EntitySubstitution.substitute_html formatter + # in that it escapes " to &quot; as well as escaping < > & - if we don't do this, then + # BeautifulSoup will try to be clever and use single-quotes to wrap attribute values, + # which confuses our regexp-based db-HTML-to-real-HTML conversion. + return doc.decode(formatter=escape) @classmethod def clean_node(cls, doc, node):