mirror of
https://github.com/Hopiu/wagtail.git
synced 2026-04-03 14:50:41 +00:00
Add horrible-but-necessary logic for normalising whitespace
This commit is contained in:
parent
aa3b588cf4
commit
450edd24aa
2 changed files with 181 additions and 9 deletions
|
|
@ -1,4 +1,5 @@
|
|||
from html.parser import HTMLParser
|
||||
import re
|
||||
|
||||
from wagtail.admin.rich_text.converters.contentstate_models import (
|
||||
Block, ContentState, Entity, EntityRange, InlineStyleRange
|
||||
|
|
@ -10,6 +11,8 @@ class HandlerState(object):
|
|||
self.current_block = None
|
||||
self.current_inline_styles = []
|
||||
self.current_entity_ranges = []
|
||||
# what to do with leading whitespace on the next text node we encounter: strip, keep or force
|
||||
self.leading_whitespace = 'strip'
|
||||
self.list_depth = 0
|
||||
self.list_item_type = None
|
||||
self.pushed_states = []
|
||||
|
|
@ -19,6 +22,7 @@ class HandlerState(object):
|
|||
'current_block': self.current_block,
|
||||
'current_inline_styles': self.current_inline_styles,
|
||||
'current_entity_ranges': self.current_entity_ranges,
|
||||
'leading_whitespace': self.leading_whitespace,
|
||||
'list_depth': self.list_depth,
|
||||
'list_item_type': self.list_item_type
|
||||
})
|
||||
|
|
@ -28,6 +32,7 @@ class HandlerState(object):
|
|||
self.current_block = last_state['current_block']
|
||||
self.current_inline_styles = last_state['current_inline_styles']
|
||||
self.current_entity_ranges = last_state['current_entity_ranges']
|
||||
self.leading_whitespace = last_state['leading_whitespace']
|
||||
self.list_depth = last_state['list_depth']
|
||||
self.list_item_type = last_state['list_item_type']
|
||||
|
||||
|
|
@ -64,6 +69,7 @@ class BlockElementHandler(object):
|
|||
block = self.create_block(name, dict(attrs), state, contentstate)
|
||||
contentstate.blocks.append(block)
|
||||
state.current_block = block
|
||||
state.leading_whitespace = 'strip'
|
||||
|
||||
def handle_endtag(self, name, state, contentState):
|
||||
assert not state.current_inline_styles, "End of block reached without closing inline style elements"
|
||||
|
|
@ -88,6 +94,13 @@ class InlineStyleElementHandler(object):
|
|||
|
||||
def handle_starttag(self, name, attrs, state, contentstate):
|
||||
assert state.current_block is not None, "%s element found at the top level" % name
|
||||
|
||||
if state.leading_whitespace == 'force':
|
||||
# any pending whitespace should be output before handling this tag,
|
||||
# and subsequent whitespace should be collapsed into it (= stripped)
|
||||
state.current_block.text += ' '
|
||||
state.leading_whitespace = 'strip'
|
||||
|
||||
inline_style_range = InlineStyleRange(self.style)
|
||||
inline_style_range.offset = len(state.current_block.text)
|
||||
state.current_block.inline_style_ranges.append(inline_style_range)
|
||||
|
|
@ -105,6 +118,13 @@ class LinkElementHandler(object):
|
|||
|
||||
def handle_starttag(self, name, attrs, state, contentstate):
|
||||
assert state.current_block is not None, "%s element found at the top level" % name
|
||||
|
||||
if state.leading_whitespace == 'force':
|
||||
# any pending whitespace should be output before handling this tag,
|
||||
# and subsequent whitespace should be collapsed into it (= stripped)
|
||||
state.current_block.text += ' '
|
||||
state.leading_whitespace = 'strip'
|
||||
|
||||
attrs = dict(attrs)
|
||||
|
||||
entity = Entity(self.entity_type, 'MUTABLE', {'url': attrs['href']})
|
||||
|
|
@ -212,7 +232,8 @@ class HtmlToContentStateHandler(HTMLParser):
|
|||
|
||||
def add_block(self, block):
|
||||
self.contentstate.blocks.append(block)
|
||||
self.current_block = block
|
||||
self.state.current_block = block
|
||||
self.state.leading_whitespace = 'strip'
|
||||
|
||||
def handle_starttag(self, name, attrs):
|
||||
self.element_depth += 1
|
||||
|
|
@ -246,15 +267,41 @@ class HtmlToContentStateHandler(HTMLParser):
|
|||
self.element_depth -= 1
|
||||
|
||||
def handle_data(self, content):
|
||||
# normalise whitespace sequences to a single space
|
||||
content = re.sub(r'\s+', ' ', content)
|
||||
|
||||
if self.state.current_block is None:
|
||||
content = content.strip()
|
||||
if content:
|
||||
# create a new paragraph block for this content
|
||||
block = Block('unstyled', depth=self.state.list_depth)
|
||||
self.contentstate.blocks.append(block)
|
||||
self.state.current_block = block
|
||||
else:
|
||||
if content == ' ':
|
||||
# ignore top-level whitespace
|
||||
return
|
||||
else:
|
||||
# create a new paragraph block for this content
|
||||
self.add_block(Block('unstyled', depth=self.state.list_depth))
|
||||
|
||||
self.state.current_block.text += content
|
||||
if content == ' ':
|
||||
# if leading_whitespace = 'strip', this whitespace node is not significant
|
||||
# and should be skipped.
|
||||
# For other cases, _don't_ output the whitespace yet, but set leading_whitespace = 'force'
|
||||
# so that a space is forced before the next text node or inline element. If no such node
|
||||
# appears (= we reach the end of the block), the whitespace can rightfully be dropped.
|
||||
if self.state.leading_whitespace != 'strip':
|
||||
self.state.leading_whitespace = 'force'
|
||||
else:
|
||||
# strip or add leading whitespace according to the leading_whitespace flag
|
||||
if self.state.leading_whitespace == 'strip':
|
||||
content = content.lstrip()
|
||||
elif self.state.leading_whitespace == 'force' and not content.startswith(' '):
|
||||
content = ' ' + content
|
||||
|
||||
if content.endswith(' '):
|
||||
# don't output trailing whitespace yet, because we want to discard it if the end
|
||||
# of the block follows. Instead, we'll set leading_whitespace = 'force' so that
|
||||
# any following text or inline element will be prefixed by a space
|
||||
content = content.rstrip()
|
||||
self.state.leading_whitespace = 'force'
|
||||
else:
|
||||
# no trailing whitespace here - any leading whitespace at the start of the
|
||||
# next text node should be respected
|
||||
self.state.leading_whitespace = 'keep'
|
||||
|
||||
self.state.current_block.text += content
|
||||
|
|
|
|||
|
|
@ -87,3 +87,128 @@ class TestHtmlToContentState(TestCase):
|
|||
{'inlineStyleRanges': [], 'text': 'after', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': []},
|
||||
]
|
||||
})
|
||||
|
||||
def test_ignore_unrecognised_tags_in_blocks(self):
|
||||
converter = ContentstateConverter(features=[])
|
||||
result = json.loads(converter.from_database_format(
|
||||
'''
|
||||
<p>Hello <foo>frabjuous</foo> world!</p>
|
||||
'''
|
||||
))
|
||||
self.assertContentStateEqual(result, {
|
||||
'entityMap': {},
|
||||
'blocks': [
|
||||
{'inlineStyleRanges': [], 'text': 'Hello frabjuous world!', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': []},
|
||||
]
|
||||
})
|
||||
|
||||
def test_inline_styles(self):
|
||||
converter = ContentstateConverter(features=['bold', 'italic'])
|
||||
result = json.loads(converter.from_database_format(
|
||||
'''
|
||||
<p>You <b>do <em>not</em> talk</b> about Fight Club.</p>
|
||||
'''
|
||||
))
|
||||
self.assertContentStateEqual(result, {
|
||||
'entityMap': {},
|
||||
'blocks': [
|
||||
{
|
||||
'inlineStyleRanges': [
|
||||
{'offset': 4, 'length': 11, 'style': 'BOLD'}, {'offset': 7, 'length': 3, 'style': 'ITALIC'}
|
||||
],
|
||||
'text': 'You do not talk about Fight Club.', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': []
|
||||
},
|
||||
]
|
||||
})
|
||||
|
||||
def test_inline_styles_at_top_level(self):
|
||||
converter = ContentstateConverter(features=['bold', 'italic'])
|
||||
result = json.loads(converter.from_database_format(
|
||||
'''
|
||||
You <b>do <em>not</em> talk</b> about Fight Club.
|
||||
'''
|
||||
))
|
||||
self.assertContentStateEqual(result, {
|
||||
'entityMap': {},
|
||||
'blocks': [
|
||||
{
|
||||
'inlineStyleRanges': [
|
||||
{'offset': 4, 'length': 11, 'style': 'BOLD'}, {'offset': 7, 'length': 3, 'style': 'ITALIC'}
|
||||
],
|
||||
'text': 'You do not talk about Fight Club.', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': []
|
||||
},
|
||||
]
|
||||
})
|
||||
def test_inline_styles_depend_on_features(self):
|
||||
converter = ContentstateConverter(features=['italic', 'just-made-it-up'])
|
||||
result = json.loads(converter.from_database_format(
|
||||
'''
|
||||
<p>You <b>do <em>not</em> talk</b> about Fight Club.</p>
|
||||
'''
|
||||
))
|
||||
self.assertContentStateEqual(result, {
|
||||
'entityMap': {},
|
||||
'blocks': [
|
||||
{
|
||||
'inlineStyleRanges': [
|
||||
{'offset': 7, 'length': 3, 'style': 'ITALIC'}
|
||||
],
|
||||
'text': 'You do not talk about Fight Club.', 'depth': 0, 'type': 'unstyled', 'key': '00000', 'entityRanges': []
|
||||
},
|
||||
]
|
||||
})
|
||||
|
||||
def test_ordered_list(self):
|
||||
converter = ContentstateConverter(features=['h1', 'ol', 'bold', 'italic'])
|
||||
result = json.loads(converter.from_database_format(
|
||||
'''
|
||||
<h1>The rules of Fight Club</h1>
|
||||
<ol>
|
||||
<li>You do not talk about Fight Club.</li>
|
||||
<li>You <b>do <em>not</em> talk</b> about Fight Club.</li>
|
||||
</ol>
|
||||
'''
|
||||
))
|
||||
self.assertContentStateEqual(result, {
|
||||
'entityMap': {},
|
||||
'blocks': [
|
||||
{'inlineStyleRanges': [], 'text': 'The rules of Fight Club', 'depth': 0, 'type': 'header-one', 'key': '00000', 'entityRanges': []},
|
||||
{'inlineStyleRanges': [], 'text': 'You do not talk about Fight Club.', 'depth': 0, 'type': 'ordered-list-item', 'key': '00000', 'entityRanges': []},
|
||||
{
|
||||
'inlineStyleRanges': [
|
||||
{'offset': 4, 'length': 11, 'style': 'BOLD'}, {'offset': 7, 'length': 3, 'style': 'ITALIC'}
|
||||
],
|
||||
'text': 'You do not talk about Fight Club.', 'depth': 0, 'type': 'ordered-list-item', 'key': '00000', 'entityRanges': []
|
||||
},
|
||||
]
|
||||
})
|
||||
|
||||
def test_nested_list(self):
|
||||
converter = ContentstateConverter(features=['h1', 'ul'])
|
||||
result = json.loads(converter.from_database_format(
|
||||
'''
|
||||
<h1>Shopping list</h1>
|
||||
<ul>
|
||||
<li>Milk</li>
|
||||
<li>
|
||||
Flour
|
||||
<ul>
|
||||
<li>Plain</li>
|
||||
<li>Self-raising</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Eggs</li>
|
||||
</ul>
|
||||
'''
|
||||
))
|
||||
self.assertContentStateEqual(result, {
|
||||
'entityMap': {},
|
||||
'blocks': [
|
||||
{'inlineStyleRanges': [], 'text': 'Shopping list', 'depth': 0, 'type': 'header-one', 'key': '00000', 'entityRanges': []},
|
||||
{'inlineStyleRanges': [], 'text': 'Milk', 'depth': 0, 'type': 'unordered-list-item', 'key': '00000', 'entityRanges': []},
|
||||
{'inlineStyleRanges': [], 'text': 'Flour', 'depth': 0, 'type': 'unordered-list-item', 'key': '00000', 'entityRanges': []},
|
||||
{'inlineStyleRanges': [], 'text': 'Plain', 'depth': 1, 'type': 'unordered-list-item', 'key': '00000', 'entityRanges': []},
|
||||
{'inlineStyleRanges': [], 'text': 'Self-raising', 'depth': 1, 'type': 'unordered-list-item', 'key': '00000', 'entityRanges': []},
|
||||
{'inlineStyleRanges': [], 'text': 'Eggs', 'depth': 0, 'type': 'unordered-list-item', 'key': '00000', 'entityRanges': []},
|
||||
]
|
||||
})
|
||||
|
|
|
|||
Loading…
Reference in a new issue