detect wikilinks, prevent plaintext extraction from links #1650 (#1679)

2026-05-08 13:54:44 +00:00 · 2025-05-12 23:06:51 +02:00 · 2025-05-12 23:06:51 +02:00 · 3a0922757e
commit 3a0922757e
parent a5cf40cbd4
1 changed files with 31 additions and 10 deletions
--- a/lychee-lib/src/extract/markdown.rs
+++ b/lychee-lib/src/extract/markdown.rs
@@ -10,7 +10,7 @@ use super::html::html5gum::{extract_html, extract_html_fragments};
 /// Returns the default markdown extensions used by lychee.
 /// Sadly, `|` is not const for `Options` so we can't use a const global.
 fn md_extensions() -> Options {
-    Options::ENABLE_HEADING_ATTRIBUTES | Options::ENABLE_MATH
+    Options::ENABLE_HEADING_ATTRIBUTES | Options::ENABLE_MATH | Options::ENABLE_WIKILINKS
 }

 /// Extract unparsed URL strings from a Markdown string.
@@ -18,6 +18,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
    // In some cases it is undesirable to extract links from within code blocks,
    // which is why we keep track of entries and exits while traversing the input.
    let mut inside_code_block = false;
+    let mut inside_link_block = false;

    let parser = TextMergeStream::new(Parser::new_ext(input, md_extensions()));
    parser
@@ -62,10 +63,8 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
                    LinkType::Email =>
                     Some(extract_raw_uri_from_plaintext(&dest_url)),
                    // Wiki URL (`[[http://example.com]]`)
-                    // This element is currently not matched and I'm not sure why.
-                    // However, we keep it in here for future compatibility with
-                    // markup5ever.
                    LinkType::WikiLink { has_pothole: _ } => {
+                        inside_link_block = true;
                        Some(vec![RawUri {
                            text: dest_url.to_string(),
                            element: Some("a".to_string()),
@@ -100,7 +99,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr

            // A text node.
            Event::Text(txt) => {
-                if inside_code_block && !include_verbatim {
+                if (inside_code_block && !include_verbatim) || inside_link_block {
                    None
                } else {
                    Some(extract_raw_uri_from_plaintext(&txt))
@@ -123,6 +122,12 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
                }
            }

+            // End of a link: text nodes after this point are scanned for URLs again.
+            Event::End(TagEnd::Link) => {
+                inside_link_block = false;
+                None
+            }
+
            // Silently skip over other events
            _ => None,
        })
@@ -391,13 +396,29 @@ $$
        let markdown = r"[[https://example.com/destination]]";
        let expected = vec![RawUri {
            text: "https://example.com/destination".to_string(),
-            // This should be a link element, but is currently matched as plaintext
-            element: None,
-            attribute: None,
-            // element: Some("a".to_string()),
-            // attribute: Some("href".to_string()),
+            element: Some("a".to_string()),
+            attribute: Some("href".to_string()),
        }];
        let uris = extract_markdown(markdown, true);
        assert_eq!(uris, expected);
    }
+
+    #[test]
+    fn test_multiple_wiki_links() {
+        let markdown = r"[[https://example.com/destination]][[https://example.com/source]]";
+        let expected = vec![
+            RawUri {
+                text: "https://example.com/destination".to_string(),
+                element: Some("a".to_string()),
+                attribute: Some("href".to_string()),
+            },
+            RawUri {
+                text: "https://example.com/source".to_string(),
+                element: Some("a".to_string()),
+                attribute: Some("href".to_string()),
+            },
+        ];
+        let uris = extract_markdown(markdown, true);
+        assert_eq!(uris, expected);
+    }
 }