"""Very small HTML parser that builds a simple DOM tree.""" from html import unescape from html.parser import HTMLParser import re class Text: def __init__(self, text, parent=None): self.text = text self.parent = parent # Layout reference (set by layout engine) self.layout = None def __repr__(self): # pragma: no cover - debug helper return f"Text({self.text!r})" class Element: def __init__(self, tag, attributes=None, parent=None): self.tag = tag self.attributes = attributes or {} self.children = [] self.parent = parent # Layout reference (set by layout engine) self.layout = None def __repr__(self): # pragma: no cover - debug helper return f"Element({self.tag!r}, {self.attributes!r})" @property def bounding_box(self): """Get bounding box from layout if available.""" if self.layout: return (self.layout.x, self.layout.y, self.layout.x + self.layout.width, self.layout.y + self.layout.height) return None def print_tree(node, indent=0): spacer = " " * indent print(f"{spacer}{node}") if hasattr(node, "children"): for child in node.children: print_tree(child, indent + 1) class _DOMBuilder(HTMLParser): """Tiny HTML parser that produces Element/Text nodes.""" # HTML5 void elements - elements that cannot have children VOID_ELEMENTS = frozenset({ "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr" }) def __init__(self): super().__init__(convert_charrefs=False) self.root = Element("html") self.current = self.root self._skip_depth = 0 # for script/style skipping self._body = None # The body element (real or implicit) def _ensure_body(self): """Ensure we have a body element to add content to.""" if self._body is None: self._body = Element("body", parent=self.root) self.root.children.append(self._body) if self.current is self.root: self.current = self._body # Helpers def _push(self, el: Element): el.parent = self.current self.current.children.append(el) self.current = el def _pop(self, tag: str): node = self.current while node and node is not self.root: if getattr(node, "tag", None) == tag: self.current = node.parent or self._body or self.root return node = node.parent self.current = self._body or self.root def _append_text(self, text: str): """Append text to current node, merging with previous text when possible.""" if not text: return last = self.current.children[-1] if self.current.children else None if isinstance(last, Text): # Avoid accumulating duplicate whitespace when merging segments if last.text.endswith(" ") and text.startswith(" "): text = text.lstrip() last.text += text else: self.current.children.append(Text(text, parent=self.current)) # HTMLParser callbacks def handle_starttag(self, tag, attrs): if tag in {"script"}: self._skip_depth += 1 return if self._skip_depth > 0: return # Skip html/head tags - we handle structure ourselves if tag == "html": return # Use our root instead if tag == "head": # We skip head but need to preserve style tags return if tag == "body": if self._body is None: # Create the body element attr_dict = {k: v for k, v in attrs} self._body = Element("body", attr_dict, parent=self.root) self.root.children.append(self._body) self.current = self._body return # Handle style tags - keep them in the tree for CSS extraction if tag == "style": attr_dict = {k: v for k, v in attrs} el = Element(tag, attr_dict) self._push(el) return attr_dict = {k: v for k, v in attrs} el = Element(tag, attr_dict) # Ensure we're inside a body if self.current is self.root: self._ensure_body() # Handle implicit closure for certain elements # A new
tag closes any open
tag (HTML5 implicit paragraph closure) if tag == "p" and self.current.tag == "p": self._pop("p") # For void elements, add to tree but don't push onto stack # (they can't have children and don't have closing tags) if tag in self.VOID_ELEMENTS: el.parent = self.current self.current.children.append(el) else: self._push(el) def handle_endtag(self, tag): if tag in {"script"}: if self._skip_depth > 0: self._skip_depth -= 1 return if self._skip_depth > 0: return if tag in {"html", "body", "head"}: return # Don't pop these self._pop(tag) def handle_data(self, data): if self._skip_depth > 0: return text = unescape(data) # Collapse whitespace if not text: return text = re.sub(r"\s+", " ", text) if not text.strip(): return # Skip whitespace-only text at root level # Ensure we're inside a body for text content if self.current is self.root: self._ensure_body() self._append_text(text) def handle_entityref(self, name): self.handle_data(f"&{name};") def handle_charref(self, name): self.handle_data(f"{name};") def parse_html(html_text: str) -> Element: """ Parse HTML into a small DOM tree of Element/Text nodes. - Scripts and styles are skipped - Whitespace is normalized within text nodes - Entities are decoded - A root
is always provided """ parser = _DOMBuilder() parser.feed(html_text) parser.close() return parser.root def parse_html_with_styles(html_text: str, apply_styles: bool = True) -> Element: """ Parse HTML and optionally extract and apply CSS styles. Args: html_text: The HTML source code apply_styles: Whether to parse