"""Document-level layout.""" from ..parser.html import Element, Text from ..render.fonts import get_font, linespace from .embed import ImageLayout class LayoutLine: """A laid-out line ready for rendering.""" def __init__(self, text: str, x: float, y: float, font_size: int, char_positions: list = None, font_family: str = "", color: str = None, href: str = None): self.text = text self.x = x self.y = y # Top of line self.font_size = font_size self.font_family = font_family self.color = color # Text color (e.g., "#0066cc" for links) self.href = href # Link target URL if this is a link self.height = linespace(font_size) self.width = 0 self.char_positions = char_positions or [] # Calculate width - pass text to get_font for proper font selection if text: font = get_font(font_size, font_family, text=text) self.width = font.measureText(text) class LayoutImage: """A laid-out image ready for rendering.""" def __init__(self, image_layout: ImageLayout, x: float, y: float): self.image_layout = image_layout self.x = x self.y = y # Store initial dimensions but also provide dynamic access self._initial_width = image_layout.width self._initial_height = image_layout.height @property def width(self) -> float: """Get current width (may update after async image load).""" return self.image_layout.width if self.image_layout.width > 0 else self._initial_width @property def height(self) -> float: """Get current height (may update after async image load).""" return self.image_layout.height if self.image_layout.height > 0 else self._initial_height class LayoutBlock: """A laid-out block with its lines.""" def __init__(self, tag: str, block_type: str = "block"): self.tag = tag self.block_type = block_type self.lines = [] # List of LayoutLine self.x = 0 self.y = 0 self.width = 0 self.height = 0 class DocumentLayout: """Layout engine for a document.""" def __init__(self, node, frame=None, base_url=None, async_images: bool = False): self.node = node self.frame = frame self.base_url = base_url # For resolving relative image URLs self.async_images = async_images # Load images in background self.blocks = [] # List of LayoutBlock self.lines = [] # Flat list of all LayoutLine for rendering self.images = [] # List of LayoutImage for rendering self.width = 0 self.height = 0 def layout(self, width: int, x_margin: int = 20, y_start: int = 30) -> list: """ Layout the document and return a list of LayoutLine objects. Returns: List of LayoutLine objects ready for rendering """ self.width = width max_width = max(10, width - 2 * x_margin) y = y_start self.blocks = [] self.lines = [] self.images = [] # Find body body = self._find_body(self.node) if not body: return self.lines # Collect and layout blocks raw_blocks = self._collect_blocks(body) for block_info in raw_blocks: # Handle images separately if block_info.get("is_image"): image_layout = block_info.get("image_layout") if image_layout: margin_top = block_info.get("margin_top", 6) margin_bottom = block_info.get("margin_bottom", 10) y += margin_top # Position the image image_layout.x = x_margin image_layout.y = y # Add to images list for rendering layout_image = LayoutImage(image_layout, x_margin, y) self.images.append(layout_image) y += image_layout.height + margin_bottom continue font_size = block_info.get("font_size", 14) font_family = block_info.get("font_family", "") text = block_info.get("text", "") margin_top = block_info.get("margin_top", 6) margin_bottom = block_info.get("margin_bottom", 10) block_type = block_info.get("block_type", "block") tag = block_info.get("tag", "") color = block_info.get("color") # Text color from style href = block_info.get("href") # Link target URL if not text: y += font_size * 0.6 continue # Optional bullet prefix if block_info.get("bullet"): text = f"• {text}" layout_block = LayoutBlock(tag, block_type) layout_block.x = x_margin layout_block.y = y + margin_top # Word wrap - pass text to get appropriate font font = get_font(font_size, font_family, text=text) words = text.split() wrapped_lines = [] current_line = [] current_width = 0 for word in words: word_width = font.measureText(word + " ") if current_width + word_width > max_width and current_line: wrapped_lines.append(" ".join(current_line)) current_line = [word] current_width = word_width else: current_line.append(word) current_width += word_width if current_line: wrapped_lines.append(" ".join(current_line)) # Create LayoutLines line_height = linespace(font_size) y += margin_top block_start_y = y for line_text in wrapped_lines: # Calculate character positions char_positions = [0.0] for i in range(1, len(line_text) + 1): char_positions.append(font.measureText(line_text[:i])) layout_line = LayoutLine( text=line_text, x=x_margin, y=y, # Top of line, baseline is y + font_size font_size=font_size, char_positions=char_positions, font_family=font_family, color=color, href=href ) layout_block.lines.append(layout_line) self.lines.append(layout_line) y += line_height layout_block.height = y - block_start_y layout_block.width = max_width self.blocks.append(layout_block) y += margin_bottom self.height = y + 50 # Padding at bottom return self.lines def _find_body(self, node): """Find the body element in the document.""" if isinstance(node, Element) and node.tag == "body": return node if hasattr(node, "children"): for child in node.children: if isinstance(child, Element) and child.tag == "body": return child found = self._find_body(child) if found: return found return None def _collect_blocks(self, node) -> list: """Collect renderable blocks from the DOM.""" blocks = [] for child in getattr(node, "children", []): if isinstance(child, Text): txt = child.text.strip() if txt: # Use computed style if available style = getattr(child, "computed_style", None) font_size = style.get_int("font-size", 14) if style else 14 font_family = style.get("font-family", "") if style else "" blocks.append({ "text": txt, "font_size": font_size, "font_family": font_family, "block_type": "text", "style": style }) continue if isinstance(child, Element): tag = child.tag.lower() # Skip style and script tags - they shouldn't be rendered if tag in {"style", "script", "head", "title", "meta", "link"}: continue # Handle img tags if tag == "img": image_layout = ImageLayout(child) image_layout.load(self.base_url, async_load=self.async_images) # Get computed style for max-width constraint style = getattr(child, "computed_style", None) max_width_css = None if style: max_width_val = style.get("max-width", "") if max_width_val == "100%": # 100% means constrain to container width max_width_css = self.width - 40 if self.width > 40 else 800 elif max_width_val.endswith("px"): try: max_width_css = float(max_width_val[:-2]) except ValueError: pass # Use CSS max-width or default container width effective_max_width = max_width_css if max_width_css else (self.width - 40 if self.width > 40 else 800) image_layout.layout(max_width=effective_max_width) # Get computed style for margins style = getattr(child, "computed_style", None) if style: margin_top = style.get_int("margin-top", 6) margin_bottom = style.get_int("margin-bottom", 10) else: margin_top = 6 margin_bottom = 10 blocks.append({ "is_image": True, "image_layout": image_layout, "margin_top": margin_top, "margin_bottom": margin_bottom, }) continue # Container elements - just recurse, don't add as blocks if tag in {"ul", "ol", "div", "section", "article", "main", "header", "footer", "nav"}: blocks.extend(self._collect_blocks(child)) continue # Inline elements inside block elements are handled by _text_of # Only create separate blocks for inline elements if they're direct # children of container elements (handled above via recursion) if tag in {"span", "strong", "em", "b", "i", "code"}: # Skip - these are handled as part of parent's text continue # Handle anchor elements - they can be inline or standalone if tag == "a": # Get the href and treat this as a clickable block href = child.attributes.get("href") content = self._text_of(child) if not content: continue style = getattr(child, "computed_style", None) if style: font_size = style.get_int("font-size", 14) color = style.get("color") font_family = style.get("font-family", "") else: font_size = 14 color = None font_family = "" # Default link color if not color: color = "#0066cc" blocks.append({ "text": content, "font_size": font_size, "font_family": font_family, "margin_top": 0, "margin_bottom": 0, "block_type": "inline", "tag": tag, "bullet": False, "style": style, "color": color, "href": href }) continue # For block elements (p, h1, etc), first collect any embedded images embedded_images = self._collect_images(child) blocks.extend(embedded_images) # Check if this element contains only a link link_info = self._extract_single_link(child) content = self._text_of(child) if not content: continue # Get computed style for this element style = getattr(child, "computed_style", None) # Extract style properties if style: font_size = style.get_int("font-size", 14) margin_top = style.get_int("margin-top", 6) margin_bottom = style.get_int("margin-bottom", 10) display = style.get("display", "block") font_family = style.get("font-family", "") color = style.get("color") # Get text color from style else: # Fallback to hardcoded defaults font_size = self._get_default_font_size(tag) margin_top = self._get_default_margin_top(tag) margin_bottom = self._get_default_margin_bottom(tag) display = "inline" if tag in {"span", "a", "strong", "em", "b", "i", "code"} else "block" font_family = "" color = None # If block contains only a link, use link info for href and color href = None if link_info: href = link_info.get("href") if not color: color = link_info.get("color", "#0066cc") # Determine block type block_type = "inline" if display == "inline" else "block" if tag == "li" or display == "list-item": block_type = "list-item" # Add bullet for list items bullet = (tag == "li" or display == "list-item") blocks.append({ "text": content, "font_size": font_size, "font_family": font_family, "margin_top": margin_top, "margin_bottom": margin_bottom, "block_type": block_type, "tag": tag, "bullet": bullet, "style": style, "color": color, "href": href }) return blocks def _get_default_font_size(self, tag: str) -> int: """Get default font size for a tag (fallback when no styles).""" sizes = { "h1": 24, "h2": 20, "h3": 18, "h4": 16, "h5": 15, "h6": 14 } return sizes.get(tag, 14) def _get_default_margin_top(self, tag: str) -> int: """Get default top margin for a tag (fallback when no styles).""" margins = { "h1": 12, "h2": 10, "h3": 8, "p": 6, "li": 4 } return margins.get(tag, 0) def _get_default_margin_bottom(self, tag: str) -> int: """Get default bottom margin for a tag (fallback when no styles).""" margins = { "h1": 12, "h2": 10, "h3": 8, "p": 12, "li": 4 } return margins.get(tag, 0) def _extract_single_link(self, node) -> dict | None: """Extract link info if node contains only a single link. Returns dict with href and color if the element contains only a link (possibly with some whitespace text), None otherwise. """ if not isinstance(node, Element): return None links = [] has_other_content = False for child in node.children: if isinstance(child, Text): # Whitespace-only text is okay if child.text.strip(): has_other_content = True elif isinstance(child, Element): if child.tag.lower() == "a": links.append(child) else: # Has other elements besides links has_other_content = True # Return link info only if there's exactly one link and no other content if len(links) == 1 and not has_other_content: link = links[0] style = getattr(link, "computed_style", None) color = style.get("color") if style else None return { "href": link.attributes.get("href"), "color": color or "#0066cc" } return None def _collect_images(self, node) -> list: """Recursively collect all img elements from a node.""" images = [] if not isinstance(node, Element): return images for child in getattr(node, "children", []): if isinstance(child, Element): if child.tag.lower() == "img": image_layout = ImageLayout(child) image_layout.load(self.base_url, async_load=self.async_images) # Get computed style for max-width constraint style = getattr(child, "computed_style", None) max_width_css = None if style: max_width_val = style.get("max-width", "") if max_width_val == "100%": # 100% means constrain to container width max_width_css = self.width - 40 if self.width > 40 else 800 elif max_width_val.endswith("px"): try: max_width_css = float(max_width_val[:-2]) except ValueError: pass # Use CSS max-width or default container width effective_max_width = max_width_css if max_width_css else (self.width - 40 if self.width > 40 else 800) image_layout.layout(max_width=effective_max_width) style = getattr(child, "computed_style", None) if style: margin_top = style.get_int("margin-top", 6) margin_bottom = style.get_int("margin-bottom", 10) else: margin_top = 6 margin_bottom = 10 images.append({ "is_image": True, "image_layout": image_layout, "margin_top": margin_top, "margin_bottom": margin_bottom, }) else: # Recurse into children images.extend(self._collect_images(child)) return images def _text_of(self, node) -> str: """Extract text content from a node.""" if isinstance(node, Text): return node.text if isinstance(node, Element): parts = [] for child in node.children: parts.append(self._text_of(child)) return " ".join([p for p in parts if p]).strip() return ""