diff --git a/wagtail/admin/rich_text/converters/html_to_contentstate.py b/wagtail/admin/rich_text/converters/html_to_contentstate.py index cd3627a1e..6e45db25b 100644 --- a/wagtail/admin/rich_text/converters/html_to_contentstate.py +++ b/wagtail/admin/rich_text/converters/html_to_contentstate.py @@ -1,4 +1,5 @@ from html.parser import HTMLParser +import re from wagtail.admin.rich_text.converters.contentstate_models import ( Block, ContentState, Entity, EntityRange, InlineStyleRange @@ -10,6 +11,8 @@ class HandlerState(object): self.current_block = None self.current_inline_styles = [] self.current_entity_ranges = [] + # what to do with leading whitespace on the next text node we encounter: strip, keep or force + self.leading_whitespace = 'strip' self.list_depth = 0 self.list_item_type = None self.pushed_states = [] @@ -19,6 +22,7 @@ class HandlerState(object): 'current_block': self.current_block, 'current_inline_styles': self.current_inline_styles, 'current_entity_ranges': self.current_entity_ranges, + 'leading_whitespace': self.leading_whitespace, 'list_depth': self.list_depth, 'list_item_type': self.list_item_type }) @@ -28,6 +32,7 @@ class HandlerState(object): self.current_block = last_state['current_block'] self.current_inline_styles = last_state['current_inline_styles'] self.current_entity_ranges = last_state['current_entity_ranges'] + self.leading_whitespace = last_state['leading_whitespace'] self.list_depth = last_state['list_depth'] self.list_item_type = last_state['list_item_type'] @@ -64,6 +69,7 @@ class BlockElementHandler(object): block = self.create_block(name, dict(attrs), state, contentstate) contentstate.blocks.append(block) state.current_block = block + state.leading_whitespace = 'strip' def handle_endtag(self, name, state, contentState): assert not state.current_inline_styles, "End of block reached without closing inline style elements" @@ -88,6 +94,13 @@ class InlineStyleElementHandler(object): def handle_starttag(self, name, attrs, state, contentstate): assert state.current_block is not None, "%s element found at the top level" % name + + if state.leading_whitespace == 'force': + # any pending whitespace should be output before handling this tag, + # and subsequent whitespace should be collapsed into it (= stripped) + state.current_block.text += ' ' + state.leading_whitespace = 'strip' + inline_style_range = InlineStyleRange(self.style) inline_style_range.offset = len(state.current_block.text) state.current_block.inline_style_ranges.append(inline_style_range) @@ -105,6 +118,13 @@ class LinkElementHandler(object): def handle_starttag(self, name, attrs, state, contentstate): assert state.current_block is not None, "%s element found at the top level" % name + + if state.leading_whitespace == 'force': + # any pending whitespace should be output before handling this tag, + # and subsequent whitespace should be collapsed into it (= stripped) + state.current_block.text += ' ' + state.leading_whitespace = 'strip' + attrs = dict(attrs) entity = Entity(self.entity_type, 'MUTABLE', {'url': attrs['href']}) @@ -212,7 +232,8 @@ class HtmlToContentStateHandler(HTMLParser): def add_block(self, block): self.contentstate.blocks.append(block) - self.current_block = block + self.state.current_block = block + self.state.leading_whitespace = 'strip' def handle_starttag(self, name, attrs): self.element_depth += 1 @@ -246,15 +267,41 @@ class HtmlToContentStateHandler(HTMLParser): self.element_depth -= 1 def handle_data(self, content): + # normalise whitespace sequences to a single space + content = re.sub(r'\s+', ' ', content) + if self.state.current_block is None: - content = content.strip() - if content: - # create a new paragraph block for this content - block = Block('unstyled', depth=self.state.list_depth) - self.contentstate.blocks.append(block) - self.state.current_block = block - else: + if content == ' ': # ignore top-level whitespace return + else: + # create a new paragraph block for this content + self.add_block(Block('unstyled', depth=self.state.list_depth)) - self.state.current_block.text += content + if content == ' ': + # if leading_whitespace = 'strip', this whitespace node is not significant + # and should be skipped. + # For other cases, _don't_ output the whitespace yet, but set leading_whitespace = 'force' + # so that a space is forced before the next text node or inline element. If no such node + # appears (= we reach the end of the block), the whitespace can rightfully be dropped. + if self.state.leading_whitespace != 'strip': + self.state.leading_whitespace = 'force' + else: + # strip or add leading whitespace according to the leading_whitespace flag + if self.state.leading_whitespace == 'strip': + content = content.lstrip() + elif self.state.leading_whitespace == 'force' and not content.startswith(' '): + content = ' ' + content + + if content.endswith(' '): + # don't output trailing whitespace yet, because we want to discard it if the end + # of the block follows. Instead, we'll set leading_whitespace = 'force' so that + # any following text or inline element will be prefixed by a space + content = content.rstrip() + self.state.leading_whitespace = 'force' + else: + # no trailing whitespace here - any leading whitespace at the start of the + # next text node should be respected + self.state.leading_whitespace = 'keep' + + self.state.current_block.text += content diff --git a/wagtail/admin/tests/test_contentstate.py b/wagtail/admin/tests/test_contentstate.py index 88d230182..69e20a21a 100644 --- a/wagtail/admin/tests/test_contentstate.py +++ b/wagtail/admin/tests/test_contentstate.py @@ -87,3 +87,128 @@ class TestHtmlToContentState(TestCase): {'inlineStyleRanges': [], 'text': 'after', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': []}, ] }) + + def test_ignore_unrecognised_tags_in_blocks(self): + converter = ContentstateConverter(features=[]) + result = json.loads(converter.from_database_format( + ''' +
Hello
You do not talk about Fight Club.
+ ''' + )) + self.assertContentStateEqual(result, { + 'entityMap': {}, + 'blocks': [ + { + 'inlineStyleRanges': [ + {'offset': 4, 'length': 11, 'style': 'BOLD'}, {'offset': 7, 'length': 3, 'style': 'ITALIC'} + ], + 'text': 'You do not talk about Fight Club.', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': [] + }, + ] + }) + + def test_inline_styles_at_top_level(self): + converter = ContentstateConverter(features=['bold', 'italic']) + result = json.loads(converter.from_database_format( + ''' + You do not talk about Fight Club. + ''' + )) + self.assertContentStateEqual(result, { + 'entityMap': {}, + 'blocks': [ + { + 'inlineStyleRanges': [ + {'offset': 4, 'length': 11, 'style': 'BOLD'}, {'offset': 7, 'length': 3, 'style': 'ITALIC'} + ], + 'text': 'You do not talk about Fight Club.', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': [] + }, + ] + }) + def test_inline_styles_depend_on_features(self): + converter = ContentstateConverter(features=['italic', 'just-made-it-up']) + result = json.loads(converter.from_database_format( + ''' +You do not talk about Fight Club.
+ ''' + )) + self.assertContentStateEqual(result, { + 'entityMap': {}, + 'blocks': [ + { + 'inlineStyleRanges': [ + {'offset': 7, 'length': 3, 'style': 'ITALIC'} + ], + 'text': 'You do not talk about Fight Club.', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': [] + }, + ] + }) + + def test_ordered_list(self): + converter = ContentstateConverter(features=['h1', 'ol', 'bold', 'italic']) + result = json.loads(converter.from_database_format( + ''' +