"""Tests for HTML parsing functionality.""" from src.parser.html import parse_html, Text, Element def collect_text(node): texts = [] if isinstance(node, Text): texts.append(node.text) if hasattr(node, "children"): for child in node.children: texts.extend(collect_text(child)) return texts class TestParseHTML: def test_parse_simple_text(self): html = "Hello World" root = parse_html(html) assert isinstance(root, Element) assert root.tag == "html" assert len(root.children) == 1 body = root.children[0] assert body.tag == "body" texts = collect_text(body) joined = " ".join(texts) assert "Hello World" in joined def test_parse_strips_tags(self): html = "

Hello

World
" root = parse_html(html) body = root.children[0] joined = " ".join(collect_text(body)) assert "Hello" in joined assert "World" in joined def test_parse_removes_script_tags(self): html = "VisibleText" root = parse_html(html) body = root.children[0] joined = " ".join(collect_text(body)) assert "Visible" in joined assert "Text" in joined assert "alert" not in joined assert "script" not in joined.lower() def test_parse_removes_style_tags(self): html = "TextMore" root = parse_html(html) body = root.children[0] joined = " ".join(collect_text(body)) assert "Text" in joined assert "More" in joined assert "color" not in joined def test_parse_decodes_entities(self): html = "<div> & "test"" root = parse_html(html) body = root.children[0] joined = " ".join(collect_text(body)) assert "
" in joined assert "&" in joined assert '"test"' in joined def test_parse_normalizes_whitespace(self): html = "Hello \n\n World" root = parse_html(html) body = root.children[0] joined = " ".join(collect_text(body)) # Multiple whitespace should be collapsed assert "Hello World" in joined def test_parse_empty_document(self): html = "" root = parse_html(html) assert isinstance(root, Element) assert root.tag == "html" body = root.children[0] assert body.tag == "body" # Empty body should have no text children assert len(collect_text(body)) == 0