Make whitelister consistently use double-quotes on attributes

This commit is contained in:
Matt Westcott 2017-09-27 17:23:48 +01:00 committed by Matt Westcott
parent 6866fa0d8d
commit 0cd53a766e
2 changed files with 13 additions and 1 deletions

View file

@ -143,3 +143,8 @@ class TestWhitelister(TestCase):
string = '<b>snowman Yorkshire<!--[if gte mso 10]>MS word junk<![endif]--></b>'
cleaned_string = Whitelister.clean(string)
self.assertEqual(cleaned_string, '<b>snowman Yorkshire</b>')
def test_quoting(self):
    # Input carries an alt text with embedded (entity-encoded) double quotes
    # plus a `sheds` attribute that the expected output below does not keep.
    markup = '<img alt="Arthur &quot;two sheds&quot; Jackson" sheds="2">'
    # The surviving attribute must stay wrapped in double quotes, with the
    # inner quotes still written as &quot; entities.
    expected = '<img alt="Arthur &quot;two sheds&quot; Jackson"/>'
    self.assertEqual(Whitelister.clean(markup), expected)

View file

@ -5,6 +5,7 @@ specific rules.
import re
from bs4 import BeautifulSoup, Comment, NavigableString, Tag
from django.utils.html import escape
ALLOWED_URL_SCHEMES = ['http', 'https', 'ftp', 'mailto', 'tel']
@ -96,7 +97,13 @@ class Whitelister:
attributes"""
doc = BeautifulSoup(html, 'html5lib')
cls.clean_node(doc, doc)
return doc.decode()
# Pass strings through django.utils.html.escape when generating the final HTML.
# Unlike BeautifulSoup's default formatter (EntitySubstitution.substitute_html),
# escape() converts " to &quot; in addition to escaping <, > and &.
# Without this, BeautifulSoup tries to be clever and wraps attribute values that
# contain a double-quote in single-quotes instead, which confuses our
# regexp-based db-HTML-to-real-HTML conversion.
return doc.decode(formatter=escape)
@classmethod
def clean_node(cls, doc, node):