mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-17 05:00:26 +00:00
Fix nested URL extraction in verbatim elements (#988)
Skipping URLs in verbatim elements didn't take nested elements into consideration, which were not verbatim. For instance, the following HTML snippet would yield `https://example.com` in non-verbatim mode, even if it is nested inside a verbatim `<pre>` element: ```html <pre><a href="https://example.com">link</a></pre> ``` This commit fixes the behavior for both `html5gum` and `html5ever`. Note that nested verbatim elements of the same kind still are not handled correctly. For instance, the following HTML snippet would still yield `https://example.com`: ```html <pre> <pre></pre> <a href="https://example.com">link</a> </pre> ``` The reason is that we currently only keep track of a single verbatim element and not a stack of elements, which we would need to unwind and resolve the situation. Fixes https://github.com/lycheeverse/lychee/issues/986.
This commit is contained in:
parent
2255ad9286
commit
55797071b0
6 changed files with 81 additions and 20 deletions
8
.github/workflows/ci.yml
vendored
8
.github/workflows/ci.yml
vendored
|
|
@ -25,12 +25,8 @@ jobs:
|
|||
toolchain: stable
|
||||
- uses: taiki-e/install-action@nextest
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- name: Run cargo test
|
||||
run: cargo nextest run --all-targets --all-features --filter-expr '!test(test_exclude_example_domains)'
|
||||
- name: Run cargo test (include example domains)
|
||||
run: cargo nextest run --filter-expr 'test(test_exclude_example_domains)'
|
||||
- name: Run doctests
|
||||
run: cargo test --doc
|
||||
- name: Run tests
|
||||
run: make test
|
||||
|
||||
lint:
|
||||
runs-on: ubuntu-latest
|
||||
|
|
|
|||
4
Makefile
4
Makefile
|
|
@ -40,7 +40,9 @@ lint: ## Run linter
|
|||
|
||||
.PHONY: test
|
||||
test: ## Run tests
|
||||
cargo nextest run --all-targets
|
||||
cargo nextest run --all-targets --all-features --filter-expr '!test(test_exclude_example_domains)'
|
||||
cargo nextest run --filter-expr 'test(test_exclude_example_domains)'
|
||||
cargo test --doc
|
||||
|
||||
.PHONY: doc
|
||||
doc: ## Open documentation
|
||||
|
|
|
|||
10
fixtures/TEST_VERBATIM.html
vendored
10
fixtures/TEST_VERBATIM.html
vendored
|
|
@ -1,5 +1,4 @@
|
|||
<!-- Test URLs in verbatim HTML elements -->
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>Verbatim HTML</title>
|
||||
|
|
@ -7,17 +6,14 @@
|
|||
<body>
|
||||
<h1>Verbatim HTML</h1>
|
||||
<p>Some verbatim HTML elements:</p>
|
||||
|
||||
<pre>http://www.example.com/pre</pre>
|
||||
|
||||
<pre>
|
||||
<a href="http://www.example.com/pre/a" target="_blank" rel="noopener">example</a>
|
||||
</pre>
|
||||
<code>http://www.example.com/code</code>
|
||||
|
||||
<samp> http://www.example.com/samp </samp>
|
||||
|
||||
<kbd>http://www.example.com/kbd</kbd>
|
||||
|
||||
<var>http://www.example.com/var</var>
|
||||
|
||||
<script>
|
||||
// http://www.example.com/script
|
||||
"http://www.example.com/script";
|
||||
|
|
|
|||
|
|
@ -946,6 +946,19 @@ mod cli {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
#[tokio::test]
|
||||
async fn test_verbatim_skipped_by_default_via_file() -> Result<()> {
|
||||
let file = fixtures_path().join("TEST_VERBATIM.html");
|
||||
|
||||
main_command()
|
||||
.arg("--dump")
|
||||
.arg(file)
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(is_empty());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_verbatim_skipped_by_default_via_remote_url() -> Result<()> {
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ use crate::types::uri::raw::RawUri;
|
|||
struct LinkExtractor {
|
||||
links: Vec<RawUri>,
|
||||
include_verbatim: bool,
|
||||
inside_excluded_element: bool,
|
||||
current_verbatim_element_name: Option<String>,
|
||||
}
|
||||
|
||||
impl TokenSink for LinkExtractor {
|
||||
|
|
@ -21,7 +21,7 @@ impl TokenSink for LinkExtractor {
|
|||
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
|
||||
match token {
|
||||
Token::CharacterTokens(raw) => {
|
||||
if self.inside_excluded_element {
|
||||
if self.current_verbatim_element_name.is_some() {
|
||||
return TokenSinkResult::Continue;
|
||||
}
|
||||
self.links.extend(extract_plaintext(&raw));
|
||||
|
|
@ -33,9 +33,30 @@ impl TokenSink for LinkExtractor {
|
|||
self_closing: _self_closing,
|
||||
attrs,
|
||||
} = tag;
|
||||
// Check if this is a verbatim element, which we want to skip.
|
||||
if !self.include_verbatim && is_verbatim_elem(&name) {
|
||||
// Skip content inside excluded elements until we see the end tag.
|
||||
self.inside_excluded_element = matches!(kind, TagKind::StartTag);
|
||||
// Check if we're currently inside a verbatim block
|
||||
if let Some(current_verbatim_element_name) = &self.current_verbatim_element_name
|
||||
{
|
||||
// Inside a verbatim block. Check if the verbatim
|
||||
// element name matches with the current element name.
|
||||
if current_verbatim_element_name == name.as_ref() {
|
||||
// If so, we're done with the verbatim block,
|
||||
// -- but only if this is an end tag.
|
||||
if matches!(kind, TagKind::EndTag) {
|
||||
self.current_verbatim_element_name = None;
|
||||
}
|
||||
}
|
||||
} else if matches!(kind, TagKind::StartTag) {
|
||||
// We're not inside a verbatim block, but we just
|
||||
// encountered a verbatim element. Remember the name
|
||||
// of the element.
|
||||
self.current_verbatim_element_name = Some(name.to_string());
|
||||
}
|
||||
}
|
||||
if self.current_verbatim_element_name.is_some() {
|
||||
// We want to skip the content of this element
|
||||
// as we're inside a verbatim block.
|
||||
return TokenSinkResult::Continue;
|
||||
}
|
||||
|
||||
|
|
@ -86,7 +107,7 @@ impl LinkExtractor {
|
|||
Self {
|
||||
links: vec![],
|
||||
include_verbatim,
|
||||
inside_excluded_element: false,
|
||||
current_verbatim_element_name: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -167,6 +188,7 @@ mod tests {
|
|||
Some random text
|
||||
https://foo.com and http://bar.com/some/path
|
||||
Something else
|
||||
<a href="https://baz.org">example link inside pre</a>
|
||||
</pre>
|
||||
<p><b>bold</b></p>
|
||||
</body>
|
||||
|
|
@ -207,6 +229,11 @@ mod tests {
|
|||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
RawUri {
|
||||
text: "https://baz.org".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
},
|
||||
];
|
||||
|
||||
let uris = extract_html(HTML_INPUT, true);
|
||||
|
|
|
|||
|
|
@ -122,6 +122,8 @@ impl LinkExtractor {
|
|||
// top-level verbatim element. If it is, we need to reset the verbatim block.
|
||||
if Some(&self.current_element_name) == self.current_verbatim_element_name.as_ref() {
|
||||
self.current_verbatim_element_name = None;
|
||||
self.current_attribute_name.clear();
|
||||
self.current_attribute_value.clear();
|
||||
}
|
||||
}
|
||||
} else if !self.include_verbatim
|
||||
|
|
@ -291,6 +293,7 @@ mod tests {
|
|||
Some random text
|
||||
https://foo.com and http://bar.com/some/path
|
||||
Something else
|
||||
<a href="https://baz.org">example link inside pre</a>
|
||||
</pre>
|
||||
<p><b>bold</b></p>
|
||||
</body>
|
||||
|
|
@ -331,6 +334,11 @@ mod tests {
|
|||
element: None,
|
||||
attribute: None,
|
||||
},
|
||||
RawUri {
|
||||
text: "https://baz.org".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
},
|
||||
];
|
||||
|
||||
let uris = extract_html(HTML_INPUT, true);
|
||||
|
|
@ -338,7 +346,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_verbatim_recursive() {
|
||||
fn test_include_verbatim_nested() {
|
||||
const HTML_INPUT: &str = r#"
|
||||
<a href="https://example.com/">valid link</a>
|
||||
<code>
|
||||
|
|
@ -358,6 +366,25 @@ mod tests {
|
|||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
// TODO: This test is currently failing because we don't handle nested
|
||||
// verbatim elements of the same type correctly. The first closing tag will
|
||||
// lift the verbatim flag. This is a known issue and could be handled by
|
||||
// keeping a stack of verbatim flags.
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_include_verbatim_nested_identical() {
|
||||
const HTML_INPUT: &str = r#"
|
||||
<pre>
|
||||
<pre>
|
||||
</pre>
|
||||
<a href="https://example.org">invalid link</a>
|
||||
</pre>
|
||||
"#;
|
||||
|
||||
let uris = extract_html(HTML_INPUT, false);
|
||||
assert!(uris.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_nofollow() {
|
||||
let input = r#"
|
||||
|
|
|
|||
Loading…
Reference in a new issue