bowser/src/parser/html.py
Benedikt Willi ce5dbff562 Enhance DOM graph visualization and image handling
- Implement PNG rendering for DOM graphs, with fallback to DOT format.
- Add support for max-width constraints in image layout based on CSS styles.
- Introduce caching mechanisms for image loading, including tracking failed and pending loads.
- Update HTML parser to handle void elements correctly.
- Modify template rendering to support PNG graph files.
2026-01-13 14:23:45 +01:00

267 lines
8.2 KiB
Python

"""Very small HTML parser that builds a simple DOM tree."""
from html import unescape
from html.parser import HTMLParser
import re
class Text:
    """Leaf DOM node holding a run of character data."""

    def __init__(self, text, parent=None):
        # The character data itself and the node that owns it (None if detached).
        self.text = text
        self.parent = parent

        # Back-reference to this node's layout object; stays None until the
        # layout engine assigns one.
        self.layout = None

    def __repr__(self):  # pragma: no cover - debug helper
        return f"Text({self.text!r})"
class Element:
    """DOM element node: a tag name, an attribute mapping and ordered children."""

    def __init__(self, tag, attributes=None, parent=None):
        self.tag = tag
        # A falsy argument (None or an empty mapping) is replaced by a fresh
        # dict; a non-empty mapping is stored as-is, not copied.
        self.attributes = attributes if attributes else {}
        self.children = []
        self.parent = parent

        # Back-reference to this node's layout object; stays None until the
        # layout engine assigns one.
        self.layout = None

    def __repr__(self):  # pragma: no cover - debug helper
        return f"Element({self.tag!r}, {self.attributes!r})"

    @property
    def bounding_box(self):
        """Return ``(left, top, right, bottom)`` from the layout, or None.

        None is returned while no (truthy) layout object is attached.
        """
        box = self.layout
        if not box:
            return None
        left, top = box.x, box.y
        return (left, top, left + box.width, top + box.height)
def print_tree(node, indent=0):
    """Print *node* and then, recursively, its descendants, one per line.

    Each line is prefixed with ``indent`` copies of the spacer string; every
    level of nesting increases ``indent`` by one. Nodes without a
    ``children`` attribute are treated as leaves.
    """
    pad = " " * indent
    print(f"{pad}{node}")
    for kid in getattr(node, "children", ()):
        print_tree(kid, indent + 1)
class _DOMBuilder(HTMLParser):
    """Tiny HTML parser that produces Element/Text nodes.

    The tree always hangs off a synthetic ``html`` root (``self.root``).
    ``<html>`` and ``<head>`` tags in the input are ignored, ``<script>``
    content is dropped, and ``<style>`` elements are kept in the tree so
    their CSS text can be extracted later. Everything else is placed inside
    a ``body`` element, which is created implicitly when content appears
    before an explicit ``<body>`` tag.
    """

    # HTML5 void elements - elements that cannot have children
    VOID_ELEMENTS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr"
    })

    def __init__(self):
        # convert_charrefs=False: entity/char references arrive through
        # handle_entityref/handle_charref and are decoded in handle_data.
        super().__init__(convert_charrefs=False)
        self.root = Element("html")
        self.current = self.root  # insertion point for new nodes
        self._skip_depth = 0  # > 0 while inside <script>
        self._body = None  # The body element (real or implicit)

    def _ensure_body(self):
        """Ensure we have a body element to add content to."""
        if self._body is None:
            self._body = Element("body", parent=self.root)
            self.root.children.append(self._body)
        if self.current is self.root:
            self.current = self._body

    # Helpers
    def _push(self, el: Element):
        """Append *el* under the insertion point and make it the new one."""
        el.parent = self.current
        self.current.children.append(el)
        self.current = el

    def _pop(self, tag: str):
        """Close the nearest open element named *tag*.

        Walks from the insertion point towards the root. When a matching
        element is found, the insertion point becomes that element's parent
        (implicitly closing anything still open inside it). An end tag with
        no matching open element is ignored, so a stray ``</span>`` or
        ``</img>`` no longer closes every open element.
        """
        node = self.current
        while node is not None and node is not self.root:
            if getattr(node, "tag", None) == tag:
                self.current = node.parent or self._body or self.root
                return
            node = node.parent
        # No match: leave the insertion point exactly where it was.

    def _append_text(self, text: str):
        """Append text to current node, merging with previous text when possible."""
        if not text:
            return
        last = self.current.children[-1] if self.current.children else None
        if isinstance(last, Text):
            # Avoid accumulating duplicate whitespace when merging segments
            if last.text.endswith(" ") and text.startswith(" "):
                text = text.lstrip()
            last.text += text
        else:
            self.current.children.append(Text(text, parent=self.current))

    # HTMLParser callbacks
    def handle_starttag(self, tag, attrs):
        """Create an Element for *tag* and insert it into the tree."""
        if tag == "script":
            self._skip_depth += 1
            return
        if self._skip_depth > 0:
            return

        # Skip html/head tags - we handle structure ourselves
        if tag in ("html", "head"):
            return

        attr_dict = dict(attrs)

        if tag == "body":
            if self._body is None:
                # First body: create it and start inserting there.
                self._body = Element("body", attr_dict, parent=self.root)
                self.root.children.append(self._body)
                self.current = self._body
            else:
                # A body already exists (typically created implicitly by
                # <meta>/<title>/<link> in the head). Keep it, but adopt any
                # attributes it does not have yet so class/style/etc. on the
                # real <body> tag are not lost.
                for key, value in attr_dict.items():
                    self._body.attributes.setdefault(key, value)
            return

        el = Element(tag, attr_dict)

        # Style tags stay in the tree (wherever we currently are, possibly
        # directly under the root) so CSS can be extracted from them.
        if tag == "style":
            self._push(el)
            return

        # Ensure we're inside a body
        if self.current is self.root:
            self._ensure_body()

        # A new <p> tag closes any open <p> tag (HTML5 implicit paragraph closure)
        if tag == "p" and self.current.tag == "p":
            self._pop("p")

        if tag in self.VOID_ELEMENTS:
            # Void elements are added to the tree but never become the
            # insertion point: they have no children and no closing tag.
            el.parent = self.current
            self.current.children.append(el)
        else:
            self._push(el)

    def handle_endtag(self, tag):
        """Close the element named *tag*, if one is open."""
        if tag == "script":
            if self._skip_depth > 0:
                self._skip_depth -= 1
            return
        if self._skip_depth > 0:
            return
        if tag in {"html", "body", "head"}:
            return  # Don't pop these
        if tag in self.VOID_ELEMENTS:
            return  # Void elements are never open, so there is nothing to close
        self._pop(tag)

    def handle_data(self, data):
        """Decode, whitespace-collapse and append a run of character data."""
        if self._skip_depth > 0:
            return
        text = unescape(data)
        if not text:
            return
        # Collapse whitespace runs to a single space.
        # NOTE(review): ``\s`` also matches U+00A0, so a decoded &nbsp; is
        # collapsed like an ordinary space - confirm that this is intended.
        text = re.sub(r"\s+", " ", text)
        if not text.strip():
            return  # Whitespace-only runs are dropped, wherever they occur
        # Ensure we're inside a body for text content
        if self.current is self.root:
            self._ensure_body()
        self._append_text(text)

    def handle_entityref(self, name):
        """Route a named reference (``&name;``) through handle_data."""
        self.handle_data(f"&{name};")

    def handle_charref(self, name):
        """Route a numeric reference (``&#name;``) through handle_data."""
        self.handle_data(f"&#{name};")
def parse_html(html_text: str) -> Element:
    """
    Parse HTML into a small DOM tree of Element/Text nodes.

    - Script content is dropped; <style> elements are kept in the tree
    - Whitespace is normalized within text nodes
    - Entities are decoded
    - A root <html><body> is always provided, even for empty input

    Args:
        html_text: The HTML source code

    Returns:
        The synthetic root ``html`` element
    """
    parser = _DOMBuilder()
    parser.feed(html_text)
    parser.close()
    # Input with no body content (empty, whitespace-only, script-only) never
    # triggers body creation during parsing; create it now so callers can
    # rely on the documented <html><body> structure.
    parser._ensure_body()
    return parser.root
def parse_html_with_styles(html_text: str, apply_styles: bool = True) -> Element:
    """
    Parse HTML and, unless disabled, resolve CSS styles over the tree.

    The rule list is built from the bundled ``assets/default.css`` (when that
    file exists) followed by the text of every <style> element in document
    order, and is then handed to ``StyleResolver.resolve_tree``.

    Args:
        html_text: The HTML source code
        apply_styles: When False, return the parsed tree without any styling

    Returns:
        The root element (after ``resolve_tree`` has processed it, when
        ``apply_styles`` is true)
    """
    from .css import parse as parse_css
    from .style import StyleResolver
    from pathlib import Path

    root = parse_html(html_text)
    if not apply_styles:
        return root

    css_rules = []

    # Default stylesheet first, so document rules come later in the list.
    default_css_path = Path(__file__).parent.parent.parent / "assets" / "default.css"
    if default_css_path.exists():
        default_css = default_css_path.read_text(encoding="utf-8")
        css_rules.extend(parse_css(default_css))

    # Then every <style> element found in the parsed tree.
    for style_node in _find_elements_by_tag(root, "style"):
        sheet = _text_of_element(style_node)
        if not sheet:
            continue
        css_rules.extend(parse_css(sheet))

    StyleResolver(css_rules).resolve_tree(root)
    return root
def _find_elements_by_tag(node, tag: str) -> list:
    """Collect *node* and its descendants whose tag equals *tag*.

    Matches are returned in document (pre-order) order. Only Element
    instances can match; nodes without a ``children`` attribute are leaves.
    """
    matches = [node] if isinstance(node, Element) and node.tag == tag else []
    for kid in getattr(node, "children", ()):
        matches += _find_elements_by_tag(kid, tag)
    return matches
def _text_of_element(node) -> str:
    """Return the text beneath *node*.

    A Text node yields its own text. An Element yields the non-empty texts
    of its children, gathered recursively and joined with a single space.
    Anything else yields the empty string.
    """
    if isinstance(node, Text):
        return node.text
    if not isinstance(node, Element):
        return ""
    pieces = (_text_of_element(kid) for kid in node.children)
    return " ".join(piece for piece in pieces if piece)