mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-29 17:34:46 +00:00
Skipping URLs in verbatim elements didn't take into account nested elements that are not themselves verbatim. For instance, the following HTML snippet would yield `https://example.com` in non-verbatim mode, even though the link is nested inside a verbatim `<pre>` element:

```html
<pre><a href="https://example.com">link</a></pre>
```

This commit fixes the behavior for both `html5gum` and `html5ever`. Note that nested verbatim elements of the same kind are still not handled correctly. For instance, the following HTML snippet would still yield `https://example.com`:

```html
<pre> <pre></pre> <a href="https://example.com">link</a> </pre>
```

The reason is that we currently only keep track of a single verbatim element and not a stack of elements, which we would need to unwind in order to resolve the situation. Fixes https://github.com/lycheeverse/lychee/issues/986.
420 lines
14 KiB
Rust
420 lines
14 KiB
Rust
use html5gum::{Emitter, Error, State, Tokenizer};
|
|
|
|
use super::is_verbatim_elem;
|
|
use super::plaintext::extract_plaintext;
|
|
use crate::types::uri::raw::RawUri;
|
|
|
|
#[derive(Clone)]
|
|
struct LinkExtractor {
|
|
// note: what html5gum calls a tag, lychee calls an element
|
|
links: Vec<RawUri>,
|
|
current_string: Vec<u8>,
|
|
current_element_name: Vec<u8>,
|
|
current_element_is_closing: bool,
|
|
current_element_nofollow: bool,
|
|
current_attribute_name: Vec<u8>,
|
|
current_attribute_value: Vec<u8>,
|
|
last_start_element: Vec<u8>,
|
|
include_verbatim: bool,
|
|
current_verbatim_element_name: Option<Vec<u8>>,
|
|
}
|
|
|
|
/// Reinterpret `s` as a `&str` without validating it.
///
/// Behaves like `std::str::from_utf8_unchecked`, except that debug builds
/// additionally assert that the bytes really are valid UTF-8, which makes
/// violations of the contract easier to track down.
///
/// # Safety
///
/// `s` must be valid UTF-8.
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
    debug_assert!(std::str::from_utf8(s).is_ok());
    // SAFETY: the caller guarantees that `s` is valid UTF-8.
    unsafe { std::str::from_utf8_unchecked(s) }
}
|
|
|
|
impl LinkExtractor {
|
|
pub(crate) const fn new(include_verbatim: bool) -> Self {
|
|
LinkExtractor {
|
|
links: Vec::new(),
|
|
current_string: Vec::new(),
|
|
current_element_name: Vec::new(),
|
|
current_element_is_closing: false,
|
|
current_element_nofollow: false,
|
|
current_attribute_name: Vec::new(),
|
|
current_attribute_value: Vec::new(),
|
|
last_start_element: Vec::new(),
|
|
include_verbatim,
|
|
current_verbatim_element_name: None,
|
|
}
|
|
}
|
|
|
|
/// Extract all semantically known links from a given html attribute.
|
|
#[allow(clippy::unnested_or_patterns)]
|
|
pub(crate) fn extract_urls_from_elem_attr<'a>(
|
|
attr_name: &str,
|
|
elem_name: &str,
|
|
attr_value: &'a str,
|
|
) -> Option<impl Iterator<Item = &'a str>> {
|
|
// For a comprehensive list of elements that might contain URLs/URIs
|
|
// see https://www.w3.org/TR/REC-html40/index/attributes.html
|
|
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
|
|
|
|
match (elem_name, attr_name) {
|
|
// Common element/attribute combinations for links
|
|
(_, "href" | "src" | "cite" | "usemap")
|
|
// Less common (but still valid!) combinations
|
|
| ("applet", "codebase")
|
|
| ("body", "background")
|
|
| ("button", "formaction")
|
|
| ("command", "icon")
|
|
| ("form", "action")
|
|
| ("frame", "longdesc")
|
|
| ("head", "profile")
|
|
| ("html", "manifest")
|
|
| ("iframe", "longdesc")
|
|
| ("img", "longdesc")
|
|
| ("input", "formaction")
|
|
| ("object", "classid")
|
|
| ("object", "codebase")
|
|
| ("object", "data")
|
|
| ("video", "poster") => {
|
|
Some(vec![attr_value].into_iter())
|
|
}
|
|
(_, "srcset") => {
|
|
let mut urls = Vec::new();
|
|
for image_candidate_string in attr_value.trim().split(',') {
|
|
for part in image_candidate_string.split_ascii_whitespace() {
|
|
if part.is_empty() {
|
|
continue;
|
|
}
|
|
urls.push(part);
|
|
break;
|
|
}
|
|
}
|
|
Some(urls.into_iter())
|
|
}
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
fn flush_current_characters(&mut self) {
|
|
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
|
|
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
|
|
if !self.include_verbatim && (is_verbatim_elem(name) || self.inside_verbatim_block()) {
|
|
self.update_verbatim_element_name();
|
|
// Early return if we don't want to extract links from preformatted text
|
|
self.current_string.clear();
|
|
return;
|
|
}
|
|
|
|
let raw = unsafe { from_utf8_unchecked(&self.current_string) };
|
|
self.links.extend(extract_plaintext(raw));
|
|
self.current_string.clear();
|
|
}
|
|
|
|
/// Check if we are currently inside a verbatim element.
|
|
const fn inside_verbatim_block(&self) -> bool {
|
|
self.current_verbatim_element_name.is_some()
|
|
}
|
|
|
|
/// Update the current verbatim element name.
|
|
///
|
|
/// Keeps track of the last verbatim element name, so that we can
|
|
/// properly handle nested verbatim blocks.
|
|
fn update_verbatim_element_name(&mut self) {
|
|
if self.current_element_is_closing {
|
|
if self.inside_verbatim_block() {
|
|
// If we are closing a verbatim element, we need to check if it is the
|
|
// top-level verbatim element. If it is, we need to reset the verbatim block.
|
|
if Some(&self.current_element_name) == self.current_verbatim_element_name.as_ref() {
|
|
self.current_verbatim_element_name = None;
|
|
self.current_attribute_name.clear();
|
|
self.current_attribute_value.clear();
|
|
}
|
|
}
|
|
} else if !self.include_verbatim
|
|
&& is_verbatim_elem(unsafe { from_utf8_unchecked(&self.current_element_name) })
|
|
{
|
|
// If we are opening a verbatim element, we need to check if we are already
|
|
// inside a verbatim element. If so, we need to ignore this element.
|
|
if !self.inside_verbatim_block() {
|
|
self.current_verbatim_element_name = Some(self.current_element_name.clone());
|
|
}
|
|
}
|
|
}
|
|
|
|
fn flush_old_attribute(&mut self) {
|
|
{
|
|
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
|
|
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
|
|
|
|
// Early return if we don't want to extract links from verbatim
|
|
// blocks (e.g. preformatted text)
|
|
if !self.include_verbatim && (is_verbatim_elem(name) || self.inside_verbatim_block()) {
|
|
self.update_verbatim_element_name();
|
|
return;
|
|
}
|
|
|
|
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
|
|
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
|
|
|
|
// Ignore links with rel=nofollow
|
|
// This may be set on a different iteration on the same element/tag before,
|
|
// so we check the boolean separately right after
|
|
if attr == "rel" && value.contains("nofollow") {
|
|
self.current_element_nofollow = true;
|
|
}
|
|
if self.current_element_nofollow {
|
|
self.current_attribute_name.clear();
|
|
self.current_attribute_value.clear();
|
|
return;
|
|
}
|
|
|
|
let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);
|
|
|
|
let new_urls = match urls {
|
|
None => extract_plaintext(value),
|
|
Some(urls) => urls
|
|
.into_iter()
|
|
.map(|url| RawUri {
|
|
text: url.to_string(),
|
|
element: Some(name.to_string()),
|
|
attribute: Some(attr.to_string()),
|
|
})
|
|
.collect::<Vec<_>>(),
|
|
};
|
|
|
|
self.links.extend(new_urls);
|
|
}
|
|
|
|
self.current_attribute_name.clear();
|
|
self.current_attribute_value.clear();
|
|
}
|
|
}
|
|
|
|
impl Emitter for &mut LinkExtractor {
|
|
type Token = ();
|
|
|
|
fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
|
|
self.last_start_element.clear();
|
|
self.last_start_element
|
|
.extend(last_start_tag.unwrap_or_default());
|
|
}
|
|
|
|
fn emit_eof(&mut self) {
|
|
self.flush_current_characters();
|
|
}
|
|
fn emit_error(&mut self, _: Error) {}
|
|
fn pop_token(&mut self) -> Option<()> {
|
|
None
|
|
}
|
|
|
|
fn emit_string(&mut self, c: &[u8]) {
|
|
self.current_string.extend(c);
|
|
}
|
|
|
|
fn init_start_tag(&mut self) {
|
|
self.flush_current_characters();
|
|
self.current_element_name.clear();
|
|
self.current_element_nofollow = false;
|
|
self.current_element_is_closing = false;
|
|
}
|
|
|
|
fn init_end_tag(&mut self) {
|
|
self.init_start_tag();
|
|
self.current_element_is_closing = true;
|
|
}
|
|
|
|
fn init_comment(&mut self) {
|
|
self.flush_current_characters();
|
|
}
|
|
|
|
fn emit_current_tag(&mut self) -> Option<State> {
|
|
let next_state = if self.current_element_is_closing {
|
|
None
|
|
} else {
|
|
self.last_start_element.clear();
|
|
self.last_start_element.extend(&self.current_element_name);
|
|
html5gum::naive_next_state(&self.current_element_name)
|
|
};
|
|
|
|
self.flush_old_attribute();
|
|
next_state
|
|
}
|
|
|
|
fn emit_current_doctype(&mut self) {}
|
|
fn set_self_closing(&mut self) {
|
|
self.current_element_is_closing = true;
|
|
}
|
|
fn set_force_quirks(&mut self) {}
|
|
|
|
fn push_tag_name(&mut self, s: &[u8]) {
|
|
self.current_element_name.extend(s);
|
|
}
|
|
|
|
fn push_comment(&mut self, _: &[u8]) {}
|
|
fn push_doctype_name(&mut self, _: &[u8]) {}
|
|
fn init_doctype(&mut self) {
|
|
self.flush_current_characters();
|
|
}
|
|
fn init_attribute(&mut self) {
|
|
self.flush_old_attribute();
|
|
}
|
|
fn push_attribute_name(&mut self, s: &[u8]) {
|
|
self.current_attribute_name.extend(s);
|
|
}
|
|
fn push_attribute_value(&mut self, s: &[u8]) {
|
|
self.current_attribute_value.extend(s);
|
|
}
|
|
|
|
fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
|
|
fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
|
|
fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
|
|
fn push_doctype_system_identifier(&mut self, _: &[u8]) {}
|
|
fn current_is_appropriate_end_tag_token(&mut self) -> bool {
|
|
self.current_element_is_closing
|
|
&& !self.current_element_name.is_empty()
|
|
&& self.current_element_name == self.last_start_element
|
|
}
|
|
|
|
fn emit_current_comment(&mut self) {}
|
|
}
|
|
|
|
/// Extract unparsed URL strings from an HTML string.
|
|
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
|
|
let mut extractor = LinkExtractor::new(include_verbatim);
|
|
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
|
|
assert!(tokenizer.next().is_none());
|
|
extractor.links
|
|
}
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Expected `RawUri` for a URL taken from the `href` attribute of an `<a>` element.
    fn anchor_href(url: &str) -> RawUri {
        RawUri {
            text: url.to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
        }
    }

    /// Expected `RawUri` for a URL found in plain text.
    fn plaintext(url: &str) -> RawUri {
        RawUri {
            text: url.to_string(),
            element: None,
            attribute: None,
        }
    }

    const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
<a href="https://baz.org">example link inside pre</a>
</pre>
<p><b>bold</b></p>
</body>
</html>"#;

    #[test]
    fn test_skip_verbatim() {
        let expected = vec![anchor_href("https://example.org")];

        let uris = extract_html(HTML_INPUT, false);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_include_verbatim() {
        let expected = vec![
            plaintext("https://example.com"),
            anchor_href("https://example.org"),
            plaintext("https://foo.com"),
            plaintext("http://bar.com/some/path"),
            anchor_href("https://baz.org"),
        ];

        let uris = extract_html(HTML_INPUT, true);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_include_verbatim_nested() {
        const HTML_INPUT: &str = r#"
<a href="https://example.com/">valid link</a>
<code>
<pre>
<span>https://example.org</span>
</pre>
</code>
"#;

        let expected = vec![anchor_href("https://example.com/")];

        let uris = extract_html(HTML_INPUT, false);
        assert_eq!(uris, expected);
    }

    // TODO: This test is currently failing because we don't handle nested
    // verbatim elements of the same type correctly. The first closing tag will
    // lift the verbatim flag. This is a known issue and could be handled by
    // keeping a stack of verbatim flags.
    #[test]
    #[ignore]
    fn test_include_verbatim_nested_identical() {
        const HTML_INPUT: &str = r#"
<pre>
<pre>
</pre>
<a href="https://example.org">invalid link</a>
</pre>
"#;

        let uris = extract_html(HTML_INPUT, false);
        assert!(uris.is_empty());
    }

    #[test]
    fn test_include_nofollow() {
        let input = r#"
<a rel="nofollow" href="https://foo.com">do not follow me</a>
<a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
<a href="https://example.org">i'm fine</a>
"#;
        let expected = vec![anchor_href("https://example.org")];

        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_exclude_script_tags() {
        let input = r#"
<script>
var foo = "https://example.com";
</script>
<a href="https://example.org">i'm fine</a>
"#;
        let expected = vec![anchor_href("https://example.org")];

        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }
}
|