mirror of
https://github.com/Hopiu/bowser.git
synced 2026-03-16 19:10:24 +00:00
- **Link Detection and Navigation**: Added functionality to detect if a mouse click falls within a link area. If a link is clicked, the browser now navigates to the corresponding URL while logging the action. This also includes handling relative URLs based on the current page context. - **Line Layout Enhancements**: The `LayoutLine` class now includes optional attributes for color and href, allowing links to maintain their designated colors in the rendered output. - **Color Parsing**: Implemented a new `_parse_color` method in the `RenderPipeline` class to convert various color formats (hex and named colors) to Skia-compatible values. This ensures that default link colors are correctly applied and that extremely light colors are rendered as black for visibility. - **Rendering Links**: During the rendering process, links in the text layout are now rendered with their specified colors, and an underline is drawn under links to indicate interactivity. - **Document Layout Updates**: The document parsing system has been updated to extract link information correctly while preserving text hierarchy. - **Tests**: A comprehensive suite of tests has been added, including tests for link parsing, layout characteristics, styling application, and default color handling for links.
97 lines
3 KiB
Python
97 lines
3 KiB
Python
"""Tests for HTML parsing functionality."""
|
|
|
|
from src.parser.html import parse_html, Text, Element
|
|
|
|
|
|
def collect_text(node):
    """Return the text of every ``Text`` node in the subtree rooted at *node*.

    The walk is pre-order: the node's own text (when it is a ``Text``)
    comes first, followed by whatever each child contributes, in the order
    the children are stored. Nodes without a ``children`` attribute simply
    contribute nothing further.
    """
    # Seed the result with this node's own text, if it carries any.
    found = [node.text] if isinstance(node, Text) else []

    # Leaves have no ``children`` attribute; treat that as "no children".
    for descendant in getattr(node, "children", ()):
        found.extend(collect_text(descendant))

    return found
|
|
|
|
|
|
class TestParseHTML:
    """Behavioural tests for ``parse_html``.

    Each test parses a small ``<html><body>...</body></html>`` document,
    takes the first child of the root as the body element and inspects the
    text gathered from it with ``collect_text``.
    """

    def test_parse_simple_text(self):
        """Plain body text is reachable under an html > body structure."""
        html = "<html><body>Hello World</body></html>"
        root = parse_html(html)

        assert isinstance(root, Element)
        assert root.tag == "html"
        assert len(root.children) == 1

        body = root.children[0]
        assert body.tag == "body"
        texts = collect_text(body)
        joined = " ".join(texts)
        assert "Hello World" in joined

    def test_parse_strips_tags(self):
        """Text nested inside block elements is still collected."""
        html = "<html><body><p>Hello</p><div>World</div></body></html>"
        root = parse_html(html)

        body = root.children[0]
        joined = " ".join(collect_text(body))
        assert "Hello" in joined
        assert "World" in joined

    def test_parse_removes_script_tags(self):
        """Script content must not show up in the collected text."""
        html = "<html><body>Visible<script>alert('bad')</script>Text</body></html>"
        root = parse_html(html)

        body = root.children[0]
        joined = " ".join(collect_text(body))
        assert "Visible" in joined
        assert "Text" in joined
        assert "alert" not in joined
        assert "script" not in joined.lower()

    def test_parse_keeps_style_tags(self):
        """Style tags are now kept in the DOM for CSS extraction."""
        html = "<html><body>Text<style>body{color:red;}</style>More</body></html>"
        root = parse_html(html)

        body = root.children[0]
        # Find the first direct child of body that is a <style> element;
        # text nodes have no ``tag`` attribute, hence the getattr default.
        style_elem = next(
            (
                child
                for child in body.children
                if getattr(child, "tag", None) == "style"
            ),
            None,
        )

        assert style_elem is not None
        # Style content should be in the element
        joined = " ".join(collect_text(style_elem))
        assert "color" in joined

    def test_parse_decodes_entities(self):
        """Character references in the source are decoded in the text."""
        # The input must contain the *escaped* forms; the assertions below
        # check for the decoded characters.
        html = "<html><body>&lt;div&gt; &amp; &quot;test&quot;</body></html>"
        root = parse_html(html)

        body = root.children[0]
        joined = " ".join(collect_text(body))
        assert "<div>" in joined
        assert "&" in joined
        assert '"test"' in joined

    def test_parse_normalizes_whitespace(self):
        """Runs of spaces and newlines collapse to a single space."""
        html = "<html><body>Hello \n\n World</body></html>"
        root = parse_html(html)

        body = root.children[0]
        joined = " ".join(collect_text(body))
        # Multiple whitespace should be collapsed
        assert "Hello World" in joined

    def test_parse_empty_document(self):
        """An empty body still yields html > body, with no text nodes."""
        html = "<html><body></body></html>"
        root = parse_html(html)

        assert isinstance(root, Element)
        assert root.tag == "html"
        body = root.children[0]
        assert body.tag == "body"
        # Empty body should have no text children
        assert len(collect_text(body)) == 0
|