Add horrible-but-necessary logic for normalising whitespace

This commit is contained in:
Matt Westcott 2017-12-06 00:13:45 +00:00 committed by Thibaud Colas
parent aa3b588cf4
commit 450edd24aa
2 changed files with 181 additions and 9 deletions

View file

@ -1,4 +1,5 @@
from html.parser import HTMLParser
import re
from wagtail.admin.rich_text.converters.contentstate_models import (
Block, ContentState, Entity, EntityRange, InlineStyleRange
@ -10,6 +11,8 @@ class HandlerState(object):
self.current_block = None
self.current_inline_styles = []
self.current_entity_ranges = []
# what to do with leading whitespace on the next text node we encounter: strip, keep or force
self.leading_whitespace = 'strip'
self.list_depth = 0
self.list_item_type = None
self.pushed_states = []
@ -19,6 +22,7 @@ class HandlerState(object):
'current_block': self.current_block,
'current_inline_styles': self.current_inline_styles,
'current_entity_ranges': self.current_entity_ranges,
'leading_whitespace': self.leading_whitespace,
'list_depth': self.list_depth,
'list_item_type': self.list_item_type
})
@ -28,6 +32,7 @@ class HandlerState(object):
self.current_block = last_state['current_block']
self.current_inline_styles = last_state['current_inline_styles']
self.current_entity_ranges = last_state['current_entity_ranges']
self.leading_whitespace = last_state['leading_whitespace']
self.list_depth = last_state['list_depth']
self.list_item_type = last_state['list_item_type']
@ -64,6 +69,7 @@ class BlockElementHandler(object):
block = self.create_block(name, dict(attrs), state, contentstate)
contentstate.blocks.append(block)
state.current_block = block
state.leading_whitespace = 'strip'
def handle_endtag(self, name, state, contentState):
assert not state.current_inline_styles, "End of block reached without closing inline style elements"
@ -88,6 +94,13 @@ class InlineStyleElementHandler(object):
def handle_starttag(self, name, attrs, state, contentstate):
assert state.current_block is not None, "%s element found at the top level" % name
if state.leading_whitespace == 'force':
# any pending whitespace should be output before handling this tag,
# and subsequent whitespace should be collapsed into it (= stripped)
state.current_block.text += ' '
state.leading_whitespace = 'strip'
inline_style_range = InlineStyleRange(self.style)
inline_style_range.offset = len(state.current_block.text)
state.current_block.inline_style_ranges.append(inline_style_range)
@ -105,6 +118,13 @@ class LinkElementHandler(object):
def handle_starttag(self, name, attrs, state, contentstate):
assert state.current_block is not None, "%s element found at the top level" % name
if state.leading_whitespace == 'force':
# any pending whitespace should be output before handling this tag,
# and subsequent whitespace should be collapsed into it (= stripped)
state.current_block.text += ' '
state.leading_whitespace = 'strip'
attrs = dict(attrs)
entity = Entity(self.entity_type, 'MUTABLE', {'url': attrs['href']})
@ -212,7 +232,8 @@ class HtmlToContentStateHandler(HTMLParser):
def add_block(self, block):
    """Append ``block`` to the content state and make it the current block.

    The block is added to ``self.contentstate.blocks`` and recorded on the
    handler state (``self.state.current_block``), which is where the
    text-handling code looks for the block to write into.

    :param block: the block object to add
    """
    self.contentstate.blocks.append(block)
    self.state.current_block = block
    # nothing has been written to the new block yet, so whitespace at the
    # start of its first text node is leading whitespace and must be dropped
    self.state.leading_whitespace = 'strip'
def handle_starttag(self, name, attrs):
self.element_depth += 1
@ -246,15 +267,41 @@ class HtmlToContentStateHandler(HTMLParser):
self.element_depth -= 1
def handle_data(self, content):
    """Append a text node to the current block, normalising whitespace.

    Runs of collapsible whitespace are reduced to a single space. Whitespace
    at either end of a text node is never written straight away; instead
    ``self.state.leading_whitespace`` records what must happen before the
    next piece of output:

    * ``'strip'`` - leading whitespace on the next text node is dropped;
    * ``'keep'``  - leading whitespace on the next text node is kept as-is;
    * ``'force'`` - a space is pending and must precede the next output.

    A pending space that is never followed by further output in the same
    block is therefore discarded.

    Text found while there is no current block opens a new ``'unstyled'``
    block at the current list depth; whitespace-only text in that position
    is ignored.

    :param content: the raw text passed in by the HTML parser
    """
    # Normalise whitespace sequences to a single space. Only ASCII whitespace
    # is collapsible in HTML; ``\s`` would also match characters such as
    # U+00A0 (non-breaking space), which must survive untouched.
    content = re.sub(r'[ \t\n\f\r]+', ' ', content)
    if not content:
        # nothing to output, and nothing that should alter the whitespace state
        return

    state = self.state

    if state.current_block is None:
        if content == ' ':
            # ignore top-level whitespace
            return
        # create a new paragraph block for this content
        self.add_block(Block('unstyled', depth=state.list_depth))

    if content == ' ':
        # if leading_whitespace = 'strip', this whitespace node is not significant
        # and should be skipped.
        # For other cases, _don't_ output the whitespace yet, but set leading_whitespace = 'force'
        # so that a space is forced before the next text node or inline element. If no such node
        # appears (= we reach the end of the block), the whitespace can rightfully be dropped.
        if state.leading_whitespace != 'strip':
            state.leading_whitespace = 'force'
        return

    # strip or add leading whitespace according to the leading_whitespace flag
    # (strip only the normalised space character, never non-breaking spaces)
    if state.leading_whitespace == 'strip':
        content = content.lstrip(' ')
    elif state.leading_whitespace == 'force' and not content.startswith(' '):
        content = ' ' + content

    if content.endswith(' '):
        # don't output trailing whitespace yet, because we want to discard it if the end
        # of the block follows. Instead, we'll set leading_whitespace = 'force' so that
        # any following text or inline element will be prefixed by a space
        content = content.rstrip(' ')
        state.leading_whitespace = 'force'
    else:
        # no trailing whitespace here - any leading whitespace at the start of the
        # next text node should be respected
        state.leading_whitespace = 'keep'

    state.current_block.text += content

View file

@ -87,3 +87,128 @@ class TestHtmlToContentState(TestCase):
{'inlineStyleRanges': [], 'text': 'after', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': []},
]
})
def test_ignore_unrecognised_tags_in_blocks(self):
    """An unknown inline tag inside a paragraph is dropped but its text is kept."""
    html = '''
<p>Hello <foo>frabjuous</foo> world!</p>
'''
    contentstate_json = ContentstateConverter(features=[]).from_database_format(html)

    expected_block = {
        'key': '00000',
        'type': 'unstyled',
        'depth': 0,
        'text': 'Hello frabjuous world!',
        'inlineStyleRanges': [],
        'entityRanges': [],
    }
    self.assertContentStateEqual(
        json.loads(contentstate_json),
        {'entityMap': {}, 'blocks': [expected_block]},
    )
def test_inline_styles(self):
    """Nested bold/italic tags in a paragraph become overlapping inline style ranges."""
    html = '''
<p>You <b>do <em>not</em> talk</b> about Fight Club.</p>
'''
    contentstate_json = ContentstateConverter(
        features=['bold', 'italic']
    ).from_database_format(html)

    bold_range = {'style': 'BOLD', 'offset': 4, 'length': 11}
    italic_range = {'style': 'ITALIC', 'offset': 7, 'length': 3}
    expected_block = {
        'key': '00000',
        'type': 'unstyled',
        'depth': 0,
        'text': 'You do not talk about Fight Club.',
        'inlineStyleRanges': [bold_range, italic_range],
        'entityRanges': [],
    }
    self.assertContentStateEqual(
        json.loads(contentstate_json),
        {'entityMap': {}, 'blocks': [expected_block]},
    )
def test_inline_styles_at_top_level(self):
    """Styled text outside any block element ends up in a single unstyled block."""
    html = '''
You <b>do <em>not</em> talk</b> about Fight Club.
'''
    contentstate_json = ContentstateConverter(
        features=['bold', 'italic']
    ).from_database_format(html)

    bold_range = {'style': 'BOLD', 'offset': 4, 'length': 11}
    italic_range = {'style': 'ITALIC', 'offset': 7, 'length': 3}
    expected_block = {
        'key': '00000',
        'type': 'unstyled',
        'depth': 0,
        'text': 'You do not talk about Fight Club.',
        'inlineStyleRanges': [bold_range, italic_range],
        'entityRanges': [],
    }
    self.assertContentStateEqual(
        json.loads(contentstate_json),
        {'entityMap': {}, 'blocks': [expected_block]},
    )
def test_inline_styles_depend_on_features(self):
    """Only styles named in ``features`` produce ranges; here bold is not enabled."""
    html = '''
<p>You <b>do <em>not</em> talk</b> about Fight Club.</p>
'''
    contentstate_json = ContentstateConverter(
        features=['italic', 'just-made-it-up']
    ).from_database_format(html)

    italic_range = {'style': 'ITALIC', 'offset': 7, 'length': 3}
    expected_block = {
        'key': '00000',
        'type': 'unstyled',
        'depth': 0,
        'text': 'You do not talk about Fight Club.',
        'inlineStyleRanges': [italic_range],
        'entityRanges': [],
    }
    self.assertContentStateEqual(
        json.loads(contentstate_json),
        {'entityMap': {}, 'blocks': [expected_block]},
    )
def test_ordered_list(self):
    """A heading followed by an <ol> yields one header block and one block per <li>."""
    html = '''
<h1>The rules of Fight Club</h1>
<ol>
<li>You do not talk about Fight Club.</li>
<li>You <b>do <em>not</em> talk</b> about Fight Club.</li>
</ol>
'''
    contentstate_json = ContentstateConverter(
        features=['h1', 'ol', 'bold', 'italic']
    ).from_database_format(html)

    def make_block(block_type, text, style_ranges=()):
        # every expected block in this test sits at depth 0 with no entities
        return {
            'key': '00000',
            'type': block_type,
            'depth': 0,
            'text': text,
            'inlineStyleRanges': list(style_ranges),
            'entityRanges': [],
        }

    rule_text = 'You do not talk about Fight Club.'
    emphasised_ranges = [
        {'style': 'BOLD', 'offset': 4, 'length': 11},
        {'style': 'ITALIC', 'offset': 7, 'length': 3},
    ]
    expected_blocks = [
        make_block('header-one', 'The rules of Fight Club'),
        make_block('ordered-list-item', rule_text),
        make_block('ordered-list-item', rule_text, emphasised_ranges),
    ]
    self.assertContentStateEqual(
        json.loads(contentstate_json),
        {'entityMap': {}, 'blocks': expected_blocks},
    )
def test_nested_list(self):
    """Items of a nested <ul> come out as flat list-item blocks one depth level deeper."""
    html = '''
<h1>Shopping list</h1>
<ul>
<li>Milk</li>
<li>
Flour
<ul>
<li>Plain</li>
<li>Self-raising</li>
</ul>
</li>
<li>Eggs</li>
</ul>
'''
    contentstate_json = ContentstateConverter(
        features=['h1', 'ul']
    ).from_database_format(html)

    def make_block(block_type, text, depth):
        # none of the expected blocks carry inline styles or entities
        return {
            'key': '00000',
            'type': block_type,
            'depth': depth,
            'text': text,
            'inlineStyleRanges': [],
            'entityRanges': [],
        }

    expected_items = [
        ('Milk', 0),
        ('Flour', 0),
        ('Plain', 1),
        ('Self-raising', 1),
        ('Eggs', 0),
    ]
    expected_blocks = [make_block('header-one', 'Shopping list', 0)]
    expected_blocks.extend(
        make_block('unordered-list-item', text, depth)
        for text, depth in expected_items
    )
    self.assertContentStateEqual(
        json.loads(contentstate_json),
        {'entityMap': {}, 'blocks': expected_blocks},
    )