detect wikilinks, prevent plaintext extraction from links #1650 (#1679)

This commit is contained in:
Jakob 2025-05-12 23:06:51 +02:00 committed by GitHub
parent a5cf40cbd4
commit 3a0922757e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -10,7 +10,7 @@ use super::html::html5gum::{extract_html, extract_html_fragments};
/// Returns the default markdown extensions used by lychee.
/// Sadly, `|` is not const for `Options` so we can't use a const global.
fn md_extensions() -> Options {
Options::ENABLE_HEADING_ATTRIBUTES | Options::ENABLE_MATH
Options::ENABLE_HEADING_ATTRIBUTES | Options::ENABLE_MATH | Options::ENABLE_WIKILINKS
}
/// Extract unparsed URL strings from a Markdown string.
@ -18,6 +18,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
// In some cases it is undesirable to extract links from within code blocks,
// which is why we keep track of entries and exits while traversing the input.
let mut inside_code_block = false;
let mut inside_link_block = false;
let parser = TextMergeStream::new(Parser::new_ext(input, md_extensions()));
parser
@ -62,10 +63,8 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
LinkType::Email =>
Some(extract_raw_uri_from_plaintext(&dest_url)),
// Wiki URL (`[[http://example.com]]`)
// This element is currently not matched and I'm not sure why.
// However, we keep it in here for future compatibility with
// markup5ever.
LinkType::WikiLink { has_pothole: _ } => {
inside_link_block = true;
Some(vec![RawUri {
text: dest_url.to_string(),
element: Some("a".to_string()),
@ -100,7 +99,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
// A text node.
Event::Text(txt) => {
if inside_code_block && !include_verbatim {
if (inside_code_block && !include_verbatim) || inside_link_block {
None
} else {
Some(extract_raw_uri_from_plaintext(&txt))
@ -123,6 +122,12 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
}
}
// A detected link block.
Event::End(TagEnd::Link) => {
inside_link_block = false;
None
}
// Silently skip over other events
_ => None,
})
@ -391,13 +396,29 @@ $$
let markdown = r"[[https://example.com/destination]]";
let expected = vec![RawUri {
text: "https://example.com/destination".to_string(),
// This should be a link element, but is currently matched as plaintext
element: None,
attribute: None,
// element: Some("a".to_string()),
// attribute: Some("href".to_string()),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}];
let uris = extract_markdown(markdown, true);
assert_eq!(uris, expected);
}
#[test]
fn test_multiple_wiki_links() {
let markdown = r"[[https://example.com/destination]][[https://example.com/source]]";
let expected = vec![
RawUri {
text: "https://example.com/destination".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://example.com/source".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
];
let uris = extract_markdown(markdown, true);
assert_eq!(uris, expected);
}
}