bowser/src/layout/document.py
Benedikt Willi ce5dbff562 Enhance DOM graph visualization and image handling
- Implement PNG rendering for DOM graphs, with fallback to DOT format.
- Add support for max-width constraints in image layout based on CSS styles.
- Introduce caching mechanisms for image loading, including tracking failed and pending loads.
- Update HTML parser to handle void elements correctly.
- Modify template rendering to support PNG graph files.
2026-01-13 14:23:45 +01:00

507 lines
19 KiB
Python

"""Document-level layout."""
from ..parser.html import Element, Text
from ..render.fonts import get_font, linespace
from .embed import ImageLayout
class LayoutLine:
    """A single positioned line of text, measured and ready to be drawn."""

    def __init__(self, text: str, x: float, y: float, font_size: int,
                 char_positions: list = None, font_family: str = "", color: str = None, href: str = None):
        # Content and typography.
        self.text = text
        self.font_size = font_size
        self.font_family = font_family

        # Position; ``y`` is the top edge of the line, not the baseline.
        self.x, self.y = x, y

        # Optional decoration: text colour (e.g. "#0066cc" for links) and
        # the link target when the line belongs to a hyperlink.
        self.color = color
        self.href = href

        # Vertical extent comes from the font size alone.
        self.height = linespace(font_size)

        # Offsets supplied by the caller; a missing/empty value becomes a
        # fresh list.
        self.char_positions = char_positions if char_positions else []

        # Horizontal extent: measured with a font selected for this exact
        # text; empty text has zero width and never touches the font cache.
        if not text:
            self.width = 0
        else:
            self.width = get_font(font_size, font_family, text=text).measureText(text)
class LayoutImage:
    """A positioned image, wrapping the ImageLayout that owns its size."""

    def __init__(self, image_layout: ImageLayout, x: float, y: float):
        self.image_layout = image_layout
        self.x, self.y = x, y
        # Snapshot of the size at placement time; used as a fallback by the
        # ``width``/``height`` properties whenever the live value is not
        # positive.
        self._initial_width, self._initial_height = (
            image_layout.width,
            image_layout.height,
        )

    @property
    def width(self) -> float:
        """Live width of the image layout, or the placement-time width
        when the live value is not positive."""
        live = self.image_layout.width
        if live > 0:
            return live
        return self._initial_width

    @property
    def height(self) -> float:
        """Live height of the image layout, or the placement-time height
        when the live value is not positive."""
        live = self.image_layout.height
        if live > 0:
            return live
        return self._initial_height
class LayoutBlock:
    """A positioned block (paragraph, heading, list item, ...) and the
    lines that were wrapped into it."""

    def __init__(self, tag: str, block_type: str = "block"):
        self.tag, self.block_type = tag, block_type
        # LayoutLine objects belonging to this block, in top-to-bottom order.
        self.lines = []
        # Geometry starts zeroed; the layout pass fills it in.
        self.x = self.y = 0
        self.width = self.height = 0
class DocumentLayout:
"""Layout engine for a document."""
def __init__(self, node, frame=None, base_url=None, async_images: bool = False):
self.node = node
self.frame = frame
self.base_url = base_url # For resolving relative image URLs
self.async_images = async_images # Load images in background
self.blocks = [] # List of LayoutBlock
self.lines = [] # Flat list of all LayoutLine for rendering
self.images = [] # List of LayoutImage for rendering
self.width = 0
self.height = 0
def layout(self, width: int, x_margin: int = 20, y_start: int = 30) -> list:
"""
Layout the document and return a list of LayoutLine objects.
Returns:
List of LayoutLine objects ready for rendering
"""
self.width = width
max_width = max(10, width - 2 * x_margin)
y = y_start
self.blocks = []
self.lines = []
self.images = []
# Find body
body = self._find_body(self.node)
if not body:
return self.lines
# Collect and layout blocks
raw_blocks = self._collect_blocks(body)
for block_info in raw_blocks:
# Handle images separately
if block_info.get("is_image"):
image_layout = block_info.get("image_layout")
if image_layout:
margin_top = block_info.get("margin_top", 6)
margin_bottom = block_info.get("margin_bottom", 10)
y += margin_top
# Position the image
image_layout.x = x_margin
image_layout.y = y
# Add to images list for rendering
layout_image = LayoutImage(image_layout, x_margin, y)
self.images.append(layout_image)
y += image_layout.height + margin_bottom
continue
font_size = block_info.get("font_size", 14)
font_family = block_info.get("font_family", "")
text = block_info.get("text", "")
margin_top = block_info.get("margin_top", 6)
margin_bottom = block_info.get("margin_bottom", 10)
block_type = block_info.get("block_type", "block")
tag = block_info.get("tag", "")
color = block_info.get("color") # Text color from style
href = block_info.get("href") # Link target URL
if not text:
y += font_size * 0.6
continue
# Optional bullet prefix
if block_info.get("bullet"):
text = f"{text}"
layout_block = LayoutBlock(tag, block_type)
layout_block.x = x_margin
layout_block.y = y + margin_top
# Word wrap - pass text to get appropriate font
font = get_font(font_size, font_family, text=text)
words = text.split()
wrapped_lines = []
current_line = []
current_width = 0
for word in words:
word_width = font.measureText(word + " ")
if current_width + word_width > max_width and current_line:
wrapped_lines.append(" ".join(current_line))
current_line = [word]
current_width = word_width
else:
current_line.append(word)
current_width += word_width
if current_line:
wrapped_lines.append(" ".join(current_line))
# Create LayoutLines
line_height = linespace(font_size)
y += margin_top
block_start_y = y
for line_text in wrapped_lines:
# Calculate character positions
char_positions = [0.0]
for i in range(1, len(line_text) + 1):
char_positions.append(font.measureText(line_text[:i]))
layout_line = LayoutLine(
text=line_text,
x=x_margin,
y=y, # Top of line, baseline is y + font_size
font_size=font_size,
char_positions=char_positions,
font_family=font_family,
color=color,
href=href
)
layout_block.lines.append(layout_line)
self.lines.append(layout_line)
y += line_height
layout_block.height = y - block_start_y
layout_block.width = max_width
self.blocks.append(layout_block)
y += margin_bottom
self.height = y + 50 # Padding at bottom
return self.lines
def _find_body(self, node):
"""Find the body element in the document."""
if isinstance(node, Element) and node.tag == "body":
return node
if hasattr(node, "children"):
for child in node.children:
if isinstance(child, Element) and child.tag == "body":
return child
found = self._find_body(child)
if found:
return found
return None
def _collect_blocks(self, node) -> list:
"""Collect renderable blocks from the DOM."""
blocks = []
for child in getattr(node, "children", []):
if isinstance(child, Text):
txt = child.text.strip()
if txt:
# Use computed style if available
style = getattr(child, "computed_style", None)
font_size = style.get_int("font-size", 14) if style else 14
font_family = style.get("font-family", "") if style else ""
blocks.append({
"text": txt,
"font_size": font_size,
"font_family": font_family,
"block_type": "text",
"style": style
})
continue
if isinstance(child, Element):
tag = child.tag.lower()
# Skip style and script tags - they shouldn't be rendered
if tag in {"style", "script", "head", "title", "meta", "link"}:
continue
# Handle img tags
if tag == "img":
image_layout = ImageLayout(child)
image_layout.load(self.base_url, async_load=self.async_images)
# Get computed style for max-width constraint
style = getattr(child, "computed_style", None)
max_width_css = None
if style:
max_width_val = style.get("max-width", "")
if max_width_val == "100%":
# 100% means constrain to container width
max_width_css = self.width - 40 if self.width > 40 else 800
elif max_width_val.endswith("px"):
try:
max_width_css = float(max_width_val[:-2])
except ValueError:
pass
# Use CSS max-width or default container width
effective_max_width = max_width_css if max_width_css else (self.width - 40 if self.width > 40 else 800)
image_layout.layout(max_width=effective_max_width)
# Get computed style for margins
style = getattr(child, "computed_style", None)
if style:
margin_top = style.get_int("margin-top", 6)
margin_bottom = style.get_int("margin-bottom", 10)
else:
margin_top = 6
margin_bottom = 10
blocks.append({
"is_image": True,
"image_layout": image_layout,
"margin_top": margin_top,
"margin_bottom": margin_bottom,
})
continue
# Container elements - just recurse, don't add as blocks
if tag in {"ul", "ol", "div", "section", "article", "main", "header", "footer", "nav"}:
blocks.extend(self._collect_blocks(child))
continue
# Inline elements inside block elements are handled by _text_of
# Only create separate blocks for inline elements if they're direct
# children of container elements (handled above via recursion)
if tag in {"span", "strong", "em", "b", "i", "code"}:
# Skip - these are handled as part of parent's text
continue
# Handle anchor elements - they can be inline or standalone
if tag == "a":
# Get the href and treat this as a clickable block
href = child.attributes.get("href")
content = self._text_of(child)
if not content:
continue
style = getattr(child, "computed_style", None)
if style:
font_size = style.get_int("font-size", 14)
color = style.get("color")
font_family = style.get("font-family", "")
else:
font_size = 14
color = None
font_family = ""
# Default link color
if not color:
color = "#0066cc"
blocks.append({
"text": content,
"font_size": font_size,
"font_family": font_family,
"margin_top": 0,
"margin_bottom": 0,
"block_type": "inline",
"tag": tag,
"bullet": False,
"style": style,
"color": color,
"href": href
})
continue
# For block elements (p, h1, etc), first collect any embedded images
embedded_images = self._collect_images(child)
blocks.extend(embedded_images)
# Check if this element contains only a link
link_info = self._extract_single_link(child)
content = self._text_of(child)
if not content:
continue
# Get computed style for this element
style = getattr(child, "computed_style", None)
# Extract style properties
if style:
font_size = style.get_int("font-size", 14)
margin_top = style.get_int("margin-top", 6)
margin_bottom = style.get_int("margin-bottom", 10)
display = style.get("display", "block")
font_family = style.get("font-family", "")
color = style.get("color") # Get text color from style
else:
# Fallback to hardcoded defaults
font_size = self._get_default_font_size(tag)
margin_top = self._get_default_margin_top(tag)
margin_bottom = self._get_default_margin_bottom(tag)
display = "inline" if tag in {"span", "a", "strong", "em", "b", "i", "code"} else "block"
font_family = ""
color = None
# If block contains only a link, use link info for href and color
href = None
if link_info:
href = link_info.get("href")
if not color:
color = link_info.get("color", "#0066cc")
# Determine block type
block_type = "inline" if display == "inline" else "block"
if tag == "li" or display == "list-item":
block_type = "list-item"
# Add bullet for list items
bullet = (tag == "li" or display == "list-item")
blocks.append({
"text": content,
"font_size": font_size,
"font_family": font_family,
"margin_top": margin_top,
"margin_bottom": margin_bottom,
"block_type": block_type,
"tag": tag,
"bullet": bullet,
"style": style,
"color": color,
"href": href
})
return blocks
def _get_default_font_size(self, tag: str) -> int:
"""Get default font size for a tag (fallback when no styles)."""
sizes = {
"h1": 24, "h2": 20, "h3": 18, "h4": 16, "h5": 15, "h6": 14
}
return sizes.get(tag, 14)
def _get_default_margin_top(self, tag: str) -> int:
"""Get default top margin for a tag (fallback when no styles)."""
margins = {
"h1": 12, "h2": 10, "h3": 8, "p": 6, "li": 4
}
return margins.get(tag, 0)
def _get_default_margin_bottom(self, tag: str) -> int:
"""Get default bottom margin for a tag (fallback when no styles)."""
margins = {
"h1": 12, "h2": 10, "h3": 8, "p": 12, "li": 4
}
return margins.get(tag, 0)
def _extract_single_link(self, node) -> dict | None:
"""Extract link info if node contains only a single link.
Returns dict with href and color if the element contains only
a link (possibly with some whitespace text), None otherwise.
"""
if not isinstance(node, Element):
return None
links = []
has_other_content = False
for child in node.children:
if isinstance(child, Text):
# Whitespace-only text is okay
if child.text.strip():
has_other_content = True
elif isinstance(child, Element):
if child.tag.lower() == "a":
links.append(child)
else:
# Has other elements besides links
has_other_content = True
# Return link info only if there's exactly one link and no other content
if len(links) == 1 and not has_other_content:
link = links[0]
style = getattr(link, "computed_style", None)
color = style.get("color") if style else None
return {
"href": link.attributes.get("href"),
"color": color or "#0066cc"
}
return None
def _collect_images(self, node) -> list:
"""Recursively collect all img elements from a node."""
images = []
if not isinstance(node, Element):
return images
for child in getattr(node, "children", []):
if isinstance(child, Element):
if child.tag.lower() == "img":
image_layout = ImageLayout(child)
image_layout.load(self.base_url, async_load=self.async_images)
# Get computed style for max-width constraint
style = getattr(child, "computed_style", None)
max_width_css = None
if style:
max_width_val = style.get("max-width", "")
if max_width_val == "100%":
# 100% means constrain to container width
max_width_css = self.width - 40 if self.width > 40 else 800
elif max_width_val.endswith("px"):
try:
max_width_css = float(max_width_val[:-2])
except ValueError:
pass
# Use CSS max-width or default container width
effective_max_width = max_width_css if max_width_css else (self.width - 40 if self.width > 40 else 800)
image_layout.layout(max_width=effective_max_width)
style = getattr(child, "computed_style", None)
if style:
margin_top = style.get_int("margin-top", 6)
margin_bottom = style.get_int("margin-bottom", 10)
else:
margin_top = 6
margin_bottom = 10
images.append({
"is_image": True,
"image_layout": image_layout,
"margin_top": margin_top,
"margin_bottom": margin_bottom,
})
else:
# Recurse into children
images.extend(self._collect_images(child))
return images
def _text_of(self, node) -> str:
"""Extract text content from a node."""
if isinstance(node, Text):
return node.text
if isinstance(node, Element):
parts = []
for child in node.children:
parts.append(self._text_of(child))
return " ".join([p for p in parts if p]).strip()
return ""