gracefully handle unrecognised elements at the top level

This commit is contained in:
Matt Westcott 2017-12-05 16:19:40 +00:00 committed by Thibaud Colas
parent 1378b766ee
commit 808ad56e0f
2 changed files with 92 additions and 11 deletions

View file

@ -10,7 +10,7 @@ class HandlerState(object):
self.current_block = None
self.current_inline_styles = []
self.current_entity_ranges = []
self.depth = 0
self.list_depth = 0
self.list_item_type = None
self.pushed_states = []
@ -19,7 +19,7 @@ class HandlerState(object):
'current_block': self.current_block,
'current_inline_styles': self.current_inline_styles,
'current_entity_ranges': self.current_entity_ranges,
'depth': self.depth,
'list_depth': self.list_depth,
'list_item_type': self.list_item_type
})
@ -28,7 +28,7 @@ class HandlerState(object):
self.current_block = last_state['current_block']
self.current_inline_styles = last_state['current_inline_styles']
self.current_entity_ranges = last_state['current_entity_ranges']
self.depth = last_state['depth']
self.list_depth = last_state['list_depth']
self.list_item_type = last_state['list_item_type']
@ -45,7 +45,7 @@ class ListElementHandler(object):
pass
else:
# start the next nesting level
state.depth += 1
state.list_depth += 1
state.list_item_type = self.list_item_type
@ -59,7 +59,7 @@ class BlockElementHandler(object):
def create_block(self, name, attrs, state, contentstate):
assert state.current_block is None, "%s element found nested inside another block" % name
return Block(self.block_type, depth=state.depth)
return Block(self.block_type, depth=state.list_depth)
def handle_starttag(self, name, attrs, state, contentstate):
block = self.create_block(name, dict(attrs), state, contentstate)
@ -80,7 +80,7 @@ class ListItemElementHandler(BlockElementHandler):
def create_block(self, name, attrs, state, contentstate):
assert state.list_item_type is not None, "%s element found outside of an enclosing list element" % name
return Block(state.list_item_type, depth=state.depth)
return Block(state.list_item_type, depth=state.list_depth)
class InlineStyleElementHandler(object):
@ -131,7 +131,7 @@ class AtomicBlockEntityElementHandler(object):
entity = self.create_entity(name, dict(attrs), state, contentstate)
key = contentstate.add_entity(entity)
block = Block('atomic', depth=state.depth)
block = Block('atomic', depth=state.list_depth)
contentstate.blocks.append(block)
block.text = ' '
entity_range = EntityRange(key)
@ -208,6 +208,7 @@ class HtmlToContentStateHandler(HTMLParser):
def reset(self):
self.state = HandlerState()
self.contentstate = ContentState()
self.element_depth = 0 # number of unclosed start tags encountered, including the one currently being handled
super().reset()
def add_block(self, block):
@ -215,20 +216,35 @@ class HtmlToContentStateHandler(HTMLParser):
self.current_block = block
def handle_starttag(self, name, attrs):
self.element_depth += 1
try:
element_handler = self.element_handlers[name]
except KeyError:
return # ignore unrecognised elements
if self.element_depth == 1:
# treat unrecognised top-level elements as paragraphs
element_handler = self.element_handlers['p']
else:
# ignore unrecognised elements below the top-level
element_handler = None
element_handler.handle_starttag(name, attrs, self.state, self.contentstate)
if element_handler:
element_handler.handle_starttag(name, attrs, self.state, self.contentstate)
def handle_endtag(self, name):
try:
element_handler = self.element_handlers[name]
except KeyError:
return # ignore unrecognised elements
if self.element_depth == 1:
# treat unrecognised top-level elements as paragraphs
element_handler = self.element_handlers['p']
else:
# ignore unrecognised elements below the top-level
element_handler = None
element_handler.handle_endtag(name, self.state, self.contentstate)
if element_handler:
element_handler.handle_endtag(name, self.state, self.contentstate)
self.element_depth -= 1
def handle_data(self, content):
if self.state.current_block is None:

View file

@ -0,0 +1,65 @@
import json
from django.test import TestCase
from wagtail.admin.rich_text.converters.contentstate import ContentstateConverter
def content_state_equal(v1, v2):
    """Test whether two contentState structures are equal, ignoring 'key' properties.

    Dicts are compared entry by entry (the value stored under 'key' is never
    compared, although both dicts must have the same set of keys), lists are
    compared element-wise, and anything else falls back to ``==``. Values of
    different exact types are never considered equal.
    """
    kind = type(v1)
    if kind is not type(v2):
        return False

    if kind is dict:
        if set(v1) != set(v2):
            return False
        for name, value in v1.items():
            if name == 'key':
                # keys are arbitrary identifiers, so their values are not compared
                continue
            if not content_state_equal(value, v2[name]):
                return False
        return True

    if kind is list:
        if len(v1) != len(v2):
            return False
        for left, right in zip(v1, v2):
            if not content_state_equal(left, right):
                return False
        return True

    return v1 == v2
class TestHtmlToContentState(TestCase):
    """Checks on the contentState JSON produced from database-format HTML."""

    def assertContentStateEqual(self, v1, v2):
        """Assert that two contentState structures are equal, ignoring 'key' properties."""
        matches = content_state_equal(v1, v2)
        self.assertTrue(matches, "%r does not match %r" % (v1, v2))

    def _to_contentstate(self, html):
        """Convert ``html`` with a feature-less converter and return the decoded JSON."""
        converter = ContentstateConverter(features=[])
        return json.loads(converter.from_database_format(html))

    def test_paragraphs(self):
        """Two <p> elements are converted into two 'unstyled' blocks."""
        result = self._to_contentstate(
            '''
<p>Hello world!</p>
<p>Goodbye world!</p>
'''
        )
        expected = {
            'entityMap': {},
            'blocks': [
                {
                    'inlineStyleRanges': [], 'text': 'Hello world!', 'depth': 0,
                    'type': 'unstyled', 'key': '00000', 'entityRanges': [],
                },
                {
                    'inlineStyleRanges': [], 'text': 'Goodbye world!', 'depth': 0,
                    'type': 'unstyled', 'key': '00000', 'entityRanges': [],
                },
            ],
        }
        self.assertContentStateEqual(result, expected)

    def test_unknown_block_becomes_paragraph(self):
        """An unrecognised top-level element (<foo>) is converted like a paragraph."""
        result = self._to_contentstate(
            '''
<foo>Hello world!</foo>
<p>Goodbye world!</p>
'''
        )
        expected = {
            'entityMap': {},
            'blocks': [
                {
                    'inlineStyleRanges': [], 'text': 'Hello world!', 'depth': 0,
                    'type': 'unstyled', 'key': '00000', 'entityRanges': [],
                },
                {
                    'inlineStyleRanges': [], 'text': 'Goodbye world!', 'depth': 0,
                    'type': 'unstyled', 'key': '00000', 'entityRanges': [],
                },
            ],
        }
        self.assertContentStateEqual(result, expected)