Fix nested URL extraction in verbatim elements (#988)

Skipping URLs in verbatim elements didn't take nested elements into consideration, which were not verbatim. For instance, the following HTML snippet would yield `https://example.com` in non-verbatim mode, even if it is nested inside a verbatim `<pre>` element: ```html <pre><a href="https://example.com">link</a></pre> ``` This commit fixes the behavior for both `html5gum` and `html5ever`. Note that nested verbatim elements of the same kind still are not handled correctly. For instance, the following HTML snippet would still yield `https://example.com`: ```html <pre> <pre></pre> <a href="https://example.com">link</a> </pre> ``` The reason is that we currently only keep track of a single verbatim element and not a stack of elements, which we would need to unwind and resolve the situation. Fixes https://github.com/lycheeverse/lychee/issues/986.
2026-05-09 14:24:45 +00:00 · 2023-03-11 15:18:25 +01:00 · 2023-03-11 15:18:25 +01:00 · 55797071b0
commit 55797071b0
parent 2255ad9286
6 changed files with 81 additions and 20 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -25,12 +25,8 @@ jobs:
          toolchain: stable
      - uses: taiki-e/install-action@nextest
      - uses: Swatinem/rust-cache@v2
-      - name: Run cargo test
-        run: cargo nextest run --all-targets --all-features --filter-expr '!test(test_exclude_example_domains)'
-      - name: Run cargo test (include example domains)
-        run: cargo nextest run --filter-expr 'test(test_exclude_example_domains)'
-      - name: Run doctests
-        run: cargo test --doc
+      - name: Run tests
+        run: make test

  lint:
    runs-on: ubuntu-latest
--- a/4
+++ b/4
@ -40,7 +40,9 @@ lint: ## Run linter

 .PHONY: test
 test: ## Run tests
-	cargo nextest run --all-targets
+	cargo nextest run --all-targets --all-features --filter-expr '!test(test_exclude_example_domains)'
+	cargo nextest run --filter-expr 'test(test_exclude_example_domains)'
+	cargo test --doc

 .PHONY: doc
 doc: ## Open documentation
--- a/fixtures/TEST_VERBATIM.html
+++ b/fixtures/TEST_VERBATIM.html
@ -1,5 +1,4 @@
 <!-- Test URLs in verbatim HTML elements -->
-
 <html>
  <head>
    <title>Verbatim HTML</title>
@ -7,17 +6,14 @@
  <body>
    <h1>Verbatim HTML</h1>
    <p>Some verbatim HTML elements:</p>
-
    <pre>http://www.example.com/pre</pre>
-
+    <pre>
+      <a href="http://www.example.com/pre/a" target="_blank" rel="noopener">example</a>
+    </pre>
    <code>http://www.example.com/code</code>
-
    <samp> http://www.example.com/samp </samp>
-
    <kbd>http://www.example.com/kbd</kbd>
-
    <var>http://www.example.com/var</var>
-
    <script>
      // http://www.example.com/script
      "http://www.example.com/script";
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@ -946,6 +946,19 @@ mod cli {

        Ok(())
    }
+    #[tokio::test]
+    async fn test_verbatim_skipped_by_default_via_file() -> Result<()> {
+        let file = fixtures_path().join("TEST_VERBATIM.html");
+
+        main_command()
+            .arg("--dump")
+            .arg(file)
+            .assert()
+            .success()
+            .stdout(is_empty());
+
+        Ok(())
+    }

    #[tokio::test]
    async fn test_verbatim_skipped_by_default_via_remote_url() -> Result<()> {
--- a/lychee-lib/src/extract/html5ever.rs
+++ b/lychee-lib/src/extract/html5ever.rs
@ -11,7 +11,7 @@ use crate::types::uri::raw::RawUri;
 struct LinkExtractor {
    links: Vec<RawUri>,
    include_verbatim: bool,
-    inside_excluded_element: bool,
+    current_verbatim_element_name: Option<String>,
 }

 impl TokenSink for LinkExtractor {
@ -21,7 +21,7 @@ impl TokenSink for LinkExtractor {
    fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
        match token {
            Token::CharacterTokens(raw) => {
-                if self.inside_excluded_element {
+                if self.current_verbatim_element_name.is_some() {
                    return TokenSinkResult::Continue;
                }
                self.links.extend(extract_plaintext(&raw));
@ -33,9 +33,30 @@ impl TokenSink for LinkExtractor {
                    self_closing: _self_closing,
                    attrs,
                } = tag;
+                // Check if this is a verbatim element, which we want to skip.
                if !self.include_verbatim && is_verbatim_elem(&name) {
-                    // Skip content inside excluded elements until we see the end tag.
-                    self.inside_excluded_element = matches!(kind, TagKind::StartTag);
+                    // Check if we're currently inside a verbatim block
+                    if let Some(current_verbatim_element_name) = &self.current_verbatim_element_name
+                    {
+                        // Inside a verbatim block. Check if the verbatim
+                        // element name matches with the current element name.
+                        if current_verbatim_element_name == name.as_ref() {
+                            // If so, we're done with the verbatim block,
+                            // -- but only if this is an end tag.
+                            if matches!(kind, TagKind::EndTag) {
+                                self.current_verbatim_element_name = None;
+                            }
+                        }
+                    } else if matches!(kind, TagKind::StartTag) {
+                        // We're not inside a verbatim block, but we just
+                        // encountered a verbatim element. Remember the name
+                        // of the element.
+                        self.current_verbatim_element_name = Some(name.to_string());
+                    }
+                }
+                if self.current_verbatim_element_name.is_some() {
+                    // We want to skip the content of this element
+                    // as we're inside a verbatim block.
                    return TokenSinkResult::Continue;
                }

@ -86,7 +107,7 @@ impl LinkExtractor {
        Self {
            links: vec![],
            include_verbatim,
-            inside_excluded_element: false,
+            current_verbatim_element_name: None,
        }
    }

@ -167,6 +188,7 @@ mod tests {
        Some random text
        https://foo.com and http://bar.com/some/path
        Something else
+        <a href="https://baz.org">example link inside pre</a>
        </pre>
        <p><b>bold</b></p>
    </body>
@ -207,6 +229,11 @@ mod tests {
                element: None,
                attribute: None,
            },
+            RawUri {
+                text: "https://baz.org".to_string(),
+                element: Some("a".to_string()),
+                attribute: Some("href".to_string()),
+            },
        ];

        let uris = extract_html(HTML_INPUT, true);
--- a/lychee-lib/src/extract/html5gum.rs
+++ b/lychee-lib/src/extract/html5gum.rs
@ -122,6 +122,8 @@ impl LinkExtractor {
                // top-level verbatim element. If it is, we need to reset the verbatim block.
                if Some(&self.current_element_name) == self.current_verbatim_element_name.as_ref() {
                    self.current_verbatim_element_name = None;
+                    self.current_attribute_name.clear();
+                    self.current_attribute_value.clear();
                }
            }
        } else if !self.include_verbatim
@ -291,6 +293,7 @@ mod tests {
        Some random text
        https://foo.com and http://bar.com/some/path
        Something else
+        <a href="https://baz.org">example link inside pre</a>
        </pre>
        <p><b>bold</b></p>
    </body>
@ -331,6 +334,11 @@ mod tests {
                element: None,
                attribute: None,
            },
+            RawUri {
+                text: "https://baz.org".to_string(),
+                element: Some("a".to_string()),
+                attribute: Some("href".to_string()),
+            },
        ];

        let uris = extract_html(HTML_INPUT, true);
@ -338,7 +346,7 @@ mod tests {
    }

    #[test]
-    fn test_include_verbatim_recursive() {
+    fn test_include_verbatim_nested() {
        const HTML_INPUT: &str = r#"
        <a href="https://example.com/">valid link</a>
        <code>
@ -358,6 +366,25 @@ mod tests {
        assert_eq!(uris, expected);
    }

+    // TODO: This test is currently failing because we don't handle nested
+    // verbatim elements of the same type correctly. The first closing tag will
+    // lift the verbatim flag. This is a known issue and could be handled by
+    // keeping a stack of verbatim flags.
+    #[test]
+    #[ignore]
+    fn test_include_verbatim_nested_identical() {
+        const HTML_INPUT: &str = r#"
+        <pre>
+            <pre>
+            </pre>
+            <a href="https://example.org">invalid link</a>
+        </pre>
+        "#;
+
+        let uris = extract_html(HTML_INPUT, false);
+        assert!(uris.is_empty());
+    }
+
    #[test]
    fn test_include_nofollow() {
        let input = r#"