mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-29 19:00:25 +00:00
Add support for nofollow (#572)
This commit is contained in:
parent
b338ba2abc
commit
debe958766
2 changed files with 56 additions and 0 deletions
|
|
@ -39,6 +39,15 @@ impl TokenSink for LinkExtractor {
|
|||
return TokenSinkResult::Continue;
|
||||
}
|
||||
|
||||
// Check for rel=nofollow. We only inspect the first `rel` attribute.
|
||||
// This is correct as per https://html.spec.whatwg.org/multipage/syntax.html#attributes-0, which states
|
||||
// "There must never be two or more attributes on the same start tag whose names are an ASCII case-insensitive match for each other."
|
||||
if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") {
|
||||
if rel.value.contains("nofollow") {
|
||||
return TokenSinkResult::Continue;
|
||||
}
|
||||
}
|
||||
|
||||
for attr in attrs {
|
||||
let urls = LinkExtractor::extract_urls_from_elem_attr(
|
||||
&attr.name.local,
|
||||
|
|
@ -203,4 +212,20 @@ mod tests {
|
|||
let uris = extract_html(HTML_INPUT, true);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_nofollow() {
|
||||
let input = r#"
|
||||
<a rel="nofollow" href="https://foo.com">do not follow me</a>
|
||||
<a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
|
||||
<a href="https://example.org">do not follow me</a>
|
||||
"#;
|
||||
let expected = vec![RawUri {
|
||||
text: "https://example.org".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
}];
|
||||
let uris = extract_html(input, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ struct LinkExtractor {
|
|||
current_string: Vec<u8>,
|
||||
current_element_name: Vec<u8>,
|
||||
current_element_is_closing: bool,
|
||||
current_element_nofollow: bool,
|
||||
current_attribute_name: Vec<u8>,
|
||||
current_attribute_value: Vec<u8>,
|
||||
last_start_element: Vec<u8>,
|
||||
|
|
@ -31,6 +32,7 @@ impl LinkExtractor {
|
|||
current_string: Vec::new(),
|
||||
current_element_name: Vec::new(),
|
||||
current_element_is_closing: false,
|
||||
current_element_nofollow: false,
|
||||
current_attribute_name: Vec::new(),
|
||||
current_attribute_value: Vec::new(),
|
||||
last_start_element: Vec::new(),
|
||||
|
|
@ -112,6 +114,18 @@ impl LinkExtractor {
|
|||
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
|
||||
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
|
||||
|
||||
// Ignore links with rel=nofollow
|
||||
// The flag may already have been set by an earlier `rel` attribute on the
|
||||
// same element/tag, so we check the boolean separately right after.
|
||||
if attr == "rel" && value.contains("nofollow") {
|
||||
self.current_element_nofollow = true;
|
||||
}
|
||||
if self.current_element_nofollow {
|
||||
self.current_attribute_name.clear();
|
||||
self.current_attribute_value.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);
|
||||
|
||||
let new_urls = match urls {
|
||||
|
|
@ -173,6 +187,7 @@ impl Emitter for &mut LinkExtractor {
|
|||
|
||||
fn emit_current_tag(&mut self) {
|
||||
self.flush_old_attribute();
|
||||
self.current_element_nofollow = false;
|
||||
}
|
||||
|
||||
fn emit_current_doctype(&mut self) {}
|
||||
|
|
@ -277,4 +292,20 @@ mod tests {
|
|||
let uris = extract_html(HTML_INPUT, true);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_nofollow() {
|
||||
let input = r#"
|
||||
<a rel="nofollow" href="https://foo.com">do not follow me</a>
|
||||
<a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
|
||||
<a href="https://example.org">i'm fine</a>
|
||||
"#;
|
||||
let expected = vec![RawUri {
|
||||
text: "https://example.org".to_string(),
|
||||
element: Some("a".to_string()),
|
||||
attribute: Some("href".to_string()),
|
||||
}];
|
||||
let uris = extract_html(input, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue