Recursively skip verbatim elements (#847)

2026-04-20 13:11:16 +00:00 · 2022-12-12 01:06:45 +01:00 · 2022-12-12 01:06:45 +01:00 · ef391cea50
commit ef391cea50
parent 96dec6984a
3 changed files with 99 additions and 3 deletions
--- a/lychee-lib/src/extract/html5ever.rs
+++ b/lychee-lib/src/extract/html5ever.rs
@ -213,6 +213,27 @@ mod tests {
        assert_eq!(uris, expected);
    }

+    // Regression test for nested verbatim elements: a URL that sits in a
+    // non-verbatim `<span>` nested below `<code><pre>` must still be skipped.
+    #[test]
+    fn test_include_verbatim_recursive() {
+        const HTML_INPUT: &str = r#"
+        <a href="https://example.com/">valid link</a>
+        <code>
+            <pre>
+                <span>https://example.org</span>
+            </pre>
+        </code>
+        "#;
+
+        // Only the `href` of the `<a>` outside the `<code>` block is expected;
+        // `https://example.org` must not show up in the result.
+        let expected = vec![RawUri {
+            text: "https://example.com/".to_string(),
+            element: Some("a".to_string()),
+            attribute: Some("href".to_string()),
+        }];
+
+        // NOTE(review): `false` is presumably the `include_verbatim` flag —
+        // confirm against the signature of `extract_html`.
+        let uris = extract_html(HTML_INPUT, false);
+        assert_eq!(uris, expected);
+    }
+
    #[test]
    fn test_include_nofollow() {
        let input = r#"
--- a/lychee-lib/src/extract/html5gum.rs
+++ b/lychee-lib/src/extract/html5gum.rs
@ -16,6 +16,7 @@ struct LinkExtractor {
    current_attribute_value: Vec<u8>,
    last_start_element: Vec<u8>,
    include_verbatim: bool,
+    current_verbatim_element_name: Option<Vec<u8>>,
 }

 /// this is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
@ -37,6 +38,7 @@ impl LinkExtractor {
            current_attribute_value: Vec::new(),
            last_start_element: Vec::new(),
            include_verbatim,
+            current_verbatim_element_name: None,
        }
    }

@ -92,7 +94,8 @@ impl LinkExtractor {
    fn flush_current_characters(&mut self) {
        // safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
        let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
-        if !self.include_verbatim && is_verbatim_elem(name) {
+        if !self.include_verbatim && (is_verbatim_elem(name) || self.inside_verbatim_block()) {
+            self.update_verbatim_element_name();
            // Early return if we don't want to extract links from preformatted text
            self.current_string.clear();
            return;
@ -103,14 +106,47 @@ impl LinkExtractor {
        self.current_string.clear();
    }

+    /// Check if we are currently inside a verbatim element.
+    ///
+    /// Returns `true` for as long as `current_verbatim_element_name` is set,
+    /// i.e. between the point where `update_verbatim_element_name` records an
+    /// element name and the point where it clears that name again.
+    const fn inside_verbatim_block(&self) -> bool {
+        self.current_verbatim_element_name.is_some()
+    }
+
+    /// Update the current verbatim element name.
+    ///
+    /// Remembers the name of the first (outermost) verbatim element that was
+    /// opened, so that everything nested inside it is treated as verbatim too.
+    ///
+    /// - For a closing tag: tracking is cleared only if the tag name equals
+    ///   the recorded name; any other closing tag leaves the state untouched.
+    /// - For an opening tag: the name is recorded only if `include_verbatim`
+    ///   is off, the tag is a verbatim element, and nothing is tracked yet.
+    ///
+    /// NOTE(review): matching is done by name only, without a depth counter.
+    /// A same-named element nested inside the tracked one (e.g. `<pre>` inside
+    /// `<pre>`) would presumably end tracking at its own closing tag — confirm
+    /// whether that case needs handling.
+    fn update_verbatim_element_name(&mut self) {
+        if self.current_element_is_closing {
+            if self.inside_verbatim_block() {
+                // Only the closing tag whose name matches the recorded (outermost)
+                // verbatim element ends the verbatim block; all others are ignored.
+                if Some(&self.current_element_name) == self.current_verbatim_element_name.as_ref() {
+                    self.current_verbatim_element_name = None;
+                }
+            }
+        } else if !self.include_verbatim
+            && is_verbatim_elem(unsafe { from_utf8_unchecked(&self.current_element_name) })
+        {
+            // The `from_utf8_unchecked` call above relies on the same invariant as
+            // the other call sites in this file (the tokenizer is fed a `&str`).
+            //
+            // Opening a verbatim element while one is already tracked is a nested
+            // verbatim element: keep the outer name and ignore this one.
+            if !self.inside_verbatim_block() {
+                self.current_verbatim_element_name = Some(self.current_element_name.clone());
+            }
+        }
+    }
+
    fn flush_old_attribute(&mut self) {
        {
            // safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
            let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
-            if !self.include_verbatim && is_verbatim_elem(name) {
-                // Early return if we don't want to extract links from preformatted text
+
+            // Early return if we don't want to extract links from verbatim
+            // blocks (e.g. preformatted text)
+            if !self.include_verbatim && (is_verbatim_elem(name) || self.inside_verbatim_block()) {
+                self.update_verbatim_element_name();
                return;
            }
+
            let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
            let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };

@ -301,6 +337,27 @@ mod tests {
        assert_eq!(uris, expected);
    }

+    // Regression test for nested verbatim elements: a URL that sits in a
+    // non-verbatim `<span>` nested below `<code><pre>` must still be skipped.
+    #[test]
+    fn test_include_verbatim_recursive() {
+        const HTML_INPUT: &str = r#"
+        <a href="https://example.com/">valid link</a>
+        <code>
+            <pre>
+                <span>https://example.org</span>
+            </pre>
+        </code>
+        "#;
+
+        // Only the `href` of the `<a>` outside the `<code>` block is expected;
+        // `https://example.org` must not show up in the result.
+        let expected = vec![RawUri {
+            text: "https://example.com/".to_string(),
+            element: Some("a".to_string()),
+            attribute: Some("href".to_string()),
+        }];
+
+        // NOTE(review): `false` is presumably the `include_verbatim` flag —
+        // confirm against the signature of `extract_html`.
+        let uris = extract_html(HTML_INPUT, false);
+        assert_eq!(uris, expected);
+    }
+
    #[test]
    fn test_include_nofollow() {
        let input = r#"
--- a/lychee-lib/src/extract/mod.rs
+++ b/lychee-lib/src/extract/mod.rs
@ -103,6 +103,24 @@ mod tests {
        uris_html5gum
    }

+    // Spot-checks that `is_verbatim_elem` accepts the `pre`, `code` and
+    // `listing` tag names.
+    #[test]
+    fn test_verbatim_matching() {
+        assert!(is_verbatim_elem("pre"));
+        assert!(is_verbatim_elem("code"));
+        assert!(is_verbatim_elem("listing"));
+    }
+
+    // A bare URL that appears only inside a `<pre>` element must not be
+    // reported when the input is extracted as HTML via `extract_uris`.
+    #[test]
+    fn verbatim_elem() {
+        let input = r#"
+        <pre>
+        https://example.com
+        </pre>
+        "#;
+        let uris = extract_uris(input, FileType::Html);
+        assert!(uris.is_empty());
+    }
+
    #[test]
    fn test_file_type() {
        assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);