2026-01-09 23:19:21 +00:00
|
|
|
"""Very small HTML parser that builds a simple DOM tree."""
|
2026-01-09 11:20:46 +00:00
|
|
|
|
2026-01-09 23:19:21 +00:00
|
|
|
from html import unescape
|
|
|
|
|
from html.parser import HTMLParser
|
2026-01-09 13:11:46 +00:00
|
|
|
import re
|
|
|
|
|
|
2026-01-09 11:20:46 +00:00
|
|
|
|
|
|
|
|
class Text:
    """Leaf DOM node that holds a run of character data."""

    def __init__(self, text, parent=None):
        """Store the text content and an optional parent node."""
        # Filled in later by the layout engine; nothing is attached at parse time.
        self.layout = None
        self.parent = parent
        self.text = text

    def __repr__(self):  # pragma: no cover - debug helper
        return "Text({!r})".format(self.text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Element:
    """DOM node for a tag: a name, an attribute mapping and ordered children."""

    def __init__(self, tag, attributes=None, parent=None):
        """Create an element with no children.

        Args:
            tag: Tag name of the element.
            attributes: Attribute mapping; a falsy value (``None`` or an
                empty mapping) is replaced by a fresh empty dict.
            parent: Parent node, or ``None`` when detached.
        """
        self.tag = tag
        self.attributes = attributes if attributes else {}
        self.children = []
        self.parent = parent
        # Layout reference (set by layout engine)
        self.layout = None

    def __repr__(self):  # pragma: no cover - debug helper
        return "Element({!r}, {!r})".format(self.tag, self.attributes)

    @property
    def bounding_box(self):
        """Return ``(x, y, x + width, y + height)`` from the layout, or ``None``.

        ``None`` is returned whenever ``self.layout`` is falsy.
        """
        box = self.layout
        if not box:
            return None
        return (
            box.x,
            box.y,
            box.x + box.width,
            box.y + box.height,
        )
|
2026-01-09 11:20:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_tree(node, indent=0):
    """Print *node* and, recursively, its descendants to stdout.

    Each nesting level is indented by one more space than its parent.
    Objects without a ``children`` attribute are printed as leaves.
    """
    print(" " * indent + f"{node}")
    # getattr with an empty default treats "no children attribute" as a leaf.
    for kid in getattr(node, "children", ()):
        print_tree(kid, indent + 1)
|
2026-01-09 13:11:46 +00:00
|
|
|
|
|
|
|
|
|
2026-01-09 23:19:21 +00:00
|
|
|
class _DOMBuilder(HTMLParser):
    """Tiny HTML parser that produces Element/Text nodes.

    The builder owns a synthetic ``html`` root element. ``<html>`` and
    ``<head>`` tags in the input do not create nodes, a single ``body``
    element is created (from a real ``<body>`` tag or implicitly),
    ``<script>`` content is dropped and ``<style>`` elements are kept in the
    tree so their CSS text can be read later.

    Attributes:
        root: The synthetic ``html`` element; the result of the parse.
        current: The open element that receives the next child node.
    """

    # HTML5 void elements - elements that cannot have children
    VOID_ELEMENTS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    })

    def __init__(self):
        # With convert_charrefs disabled, references arrive through
        # handle_entityref/handle_charref and are decoded in handle_data.
        super().__init__(convert_charrefs=False)
        self.root = Element("html")
        self.current = self.root
        self._skip_depth = 0  # > 0 while inside <script>; content is dropped
        self._body = None  # The body element (real or implicit)

    def _ensure_body(self):
        """Ensure we have a body element to add content to.

        Creates an implicit ``body`` under the root if none exists yet, and
        moves the insertion point from the root into the body.
        """
        if self._body is None:
            self._body = Element("body", parent=self.root)
            self.root.children.append(self._body)
        if self.current is self.root:
            self.current = self._body

    # Helpers
    def _push(self, el: "Element"):
        """Attach *el* under the current node and make it the current node."""
        el.parent = self.current
        self.current.children.append(el)
        self.current = el

    def _pop(self, tag: str):
        """Close the nearest open element named *tag*.

        Walks from the current node towards the root. When a match is found
        the insertion point moves to the match's parent, which also closes
        anything still open inside it. An end tag with no matching open
        element is ignored, so a stray ``</span>`` cannot close unrelated
        elements that are still open.
        """
        node = self.current
        while node is not None and node is not self.root:
            if getattr(node, "tag", None) == tag:
                self.current = node.parent or self._body or self.root
                return
            node = node.parent
        # No open element matched: leave the insertion point untouched.

    def _append_text(self, text: str):
        """Append text to current node, merging with previous text when possible."""
        if not text:
            return
        siblings = self.current.children
        last = siblings[-1] if siblings else None
        if isinstance(last, Text):
            # Avoid accumulating duplicate whitespace when merging segments
            if last.text.endswith(" ") and text.startswith(" "):
                text = text.lstrip()
            last.text += text
        else:
            siblings.append(Text(text, parent=self.current))

    # HTMLParser callbacks
    def handle_starttag(self, tag, attrs):
        """Create a node for an opening tag and update the insertion point.

        Args:
            tag: Tag name as reported by HTMLParser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag == "script":
            self._skip_depth += 1
            return
        if self._skip_depth > 0:
            return

        # <html> and <head> never become nodes: our own root stands in for
        # <html>, and head content is attached at the current insertion point.
        if tag in {"html", "head"}:
            return

        if tag == "body":
            if self._body is None:
                # Create the body element
                self._body = Element("body", dict(attrs), parent=self.root)
                self.root.children.append(self._body)
            else:
                # A body already exists (created implicitly, or the tag is
                # repeated): keep it, but adopt attributes it does not have
                # yet so they are not lost.
                for key, value in attrs:
                    self._body.attributes.setdefault(key, value)
            self.current = self._body
            return

        el = Element(tag, dict(attrs))

        # Handle style tags - keep them in the tree for CSS extraction.
        # They are attached wherever the insertion point is (possibly the
        # root) and do not force an implicit body.
        if tag == "style":
            self._push(el)
            return

        # Ensure we're inside a body
        if self.current is self.root:
            self._ensure_body()

        # Handle implicit closure for certain elements
        # A new <p> tag closes any open <p> tag (HTML5 implicit paragraph closure)
        if tag == "p" and self.current.tag == "p":
            self._pop("p")

        # For void elements, add to tree but don't push onto stack
        # (they can't have children and don't have closing tags)
        if tag in self.VOID_ELEMENTS:
            el.parent = self.current
            self.current.children.append(el)
        else:
            self._push(el)

    def handle_endtag(self, tag):
        """Close the element named *tag*, if one is open."""
        if tag == "script":
            if self._skip_depth > 0:
                self._skip_depth -= 1
            return
        if self._skip_depth > 0:
            return
        if tag in {"html", "body", "head"}:
            return  # Don't pop these
        if tag in self.VOID_ELEMENTS:
            # Void elements are never pushed, so there is nothing to close.
            # HTMLParser.handle_startendtag also calls this method for
            # self-closing syntax such as <br/>; without this guard that
            # synthetic end tag would disturb the insertion point.
            return
        self._pop(tag)

    def handle_data(self, data):
        """Decode, whitespace-normalize and append character data."""
        if self._skip_depth > 0:
            return
        text = unescape(data)
        if not text:
            return
        # Collapse whitespace
        text = re.sub(r"\s+", " ", text)
        if not text.strip():
            return  # Whitespace-only data is dropped wherever it occurs

        # Ensure we're inside a body for text content
        if self.current is self.root:
            self._ensure_body()

        self._append_text(text)

    def handle_entityref(self, name):
        """Forward a named reference (``&name;``) to handle_data for decoding."""
        self.handle_data(f"&{name};")

    def handle_charref(self, name):
        """Forward a numeric reference (``&#name;``) to handle_data for decoding."""
        self.handle_data(f"&#{name};")
|
|
|
|
|
|
|
|
|
|
|
2026-01-09 13:11:46 +00:00
|
|
|
def parse_html(html_text: str) -> Element:
    """
    Parse HTML into a small DOM tree of Element/Text nodes.

    - <script> content is skipped; <style> elements are kept in the tree
      so their CSS text can be extracted afterwards
    - Whitespace is normalized within text nodes
    - Entities are decoded
    - A root <html><body> is always provided

    Args:
        html_text: The HTML source code.

    Returns:
        The synthetic root ``html`` element of the parsed tree.
    """
    parser = _DOMBuilder()
    parser.feed(html_text)
    parser.close()
    # The builder only creates a body when it meets a <body> tag or body
    # content. Input without either (for example an empty string) would
    # otherwise come back without a body, breaking the guarantee above.
    parser._ensure_body()
    return parser.root
|
2026-01-12 10:41:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_html_with_styles(html_text: str, apply_styles: bool = True) -> Element:
    """
    Parse HTML and optionally extract and apply CSS styles.

    Rules are collected in this order: the bundled ``assets/default.css``
    (when that file exists), then every ``<style>`` element found in the
    parsed tree. The combined rules are handed to ``StyleResolver``, whose
    ``resolve_tree`` is run on the root.

    Args:
        html_text: The HTML source code
        apply_styles: Whether to parse <style> tags and apply styles

    Returns:
        The root element. When ``apply_styles`` is true the tree has been
        processed by ``StyleResolver.resolve_tree`` (presumably attaching
        computed styles to each node - see that class for details).
    """
    from .css import parse as parse_css
    from .style import StyleResolver
    from pathlib import Path

    document = parse_html(html_text)
    if not apply_styles:
        return document

    collected = []

    # Default stylesheet shipped with the project, three directories up
    # from this module.
    stylesheet_path = Path(__file__).parent.parent.parent / "assets" / "default.css"
    if stylesheet_path.exists():
        with stylesheet_path.open("r", encoding="utf-8") as handle:
            collected.extend(parse_css(handle.read()))

    # Author styles embedded in the document.
    for style_node in _find_elements_by_tag(document, "style"):
        embedded = _text_of_element(style_node)
        if embedded:
            collected.extend(parse_css(embedded))

    # Create style resolver and apply to tree
    StyleResolver(collected).resolve_tree(document)
    return document
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _find_elements_by_tag(node, tag: str) -> list:
    """Collect every Element named *tag* in the subtree rooted at *node*.

    The subtree includes *node* itself. Matches are returned in document
    (pre-order) sequence; nodes without a ``children`` attribute are leaves.
    """
    matches = []
    pending = [node]
    while pending:
        candidate = pending.pop()
        if isinstance(candidate, Element) and candidate.tag == tag:
            matches.append(candidate)
        if hasattr(candidate, "children"):
            # Push in reverse so the first child is visited first.
            pending.extend(reversed(candidate.children))
    return matches
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _text_of_element(node) -> str:
    """Return the text beneath *node*.

    A Text node yields its own text. An Element yields the non-empty text
    of its children, gathered recursively and joined with single spaces.
    Any other object yields an empty string.
    """
    if isinstance(node, Text):
        return node.text
    if not isinstance(node, Element):
        return ""
    pieces = (_text_of_element(kid) for kid in node.children)
    return " ".join([piece for piece in pieces if piece])
|