diff --git a/lychee-lib/src/extract/html5ever.rs b/lychee-lib/src/extract/html5ever.rs
index 5f5cd53..b6afcbe 100644
--- a/lychee-lib/src/extract/html5ever.rs
+++ b/lychee-lib/src/extract/html5ever.rs
@@ -39,6 +39,15 @@ impl TokenSink for LinkExtractor {
return TokenSinkResult::Continue;
}
+ // Skip elements marked `rel=nofollow`. Only the first `rel` attribute is inspected.
+ // That is sufficient as per https://html.spec.whatwg.org/multipage/syntax.html#attributes-0, which states
+ // "There must never be two or more attributes on the same start tag whose names are an ASCII case-insensitive match for each other."
+ if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") {
+ if rel.value.contains("nofollow") {
+ return TokenSinkResult::Continue;
+ }
+ }
+
for attr in attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
&attr.name.local,
@@ -203,4 +212,20 @@ mod tests {
let uris = extract_html(HTML_INPUT, true);
assert_eq!(uris, expected);
}
+
+    /// Anchors whose `rel` contains `nofollow` are skipped; only the plain link is extracted.
+    #[test]
+    fn test_include_nofollow() {
+        let input = r#"
+        <a rel="nofollow" href="https://foo.com">do not follow me</a>
+        <a rel="canonical nofollow dns-prefetch" href="https://example.com">do not follow me</a>
+        <a href="https://example.org">do not follow me</a>
+        "#;
+        let expected = vec![RawUri {
+            text: "https://example.org".to_string(),
+            element: Some("a".to_string()),
+            attribute: Some("href".to_string()),
+        }];
+        assert_eq!(extract_html(input, false), expected);
+    }
}
diff --git a/lychee-lib/src/extract/html5gum.rs b/lychee-lib/src/extract/html5gum.rs
index 379d4b9..55d2947 100644
--- a/lychee-lib/src/extract/html5gum.rs
+++ b/lychee-lib/src/extract/html5gum.rs
@@ -11,6 +11,7 @@ struct LinkExtractor {
current_string: Vec,
current_element_name: Vec,
current_element_is_closing: bool,
+ current_element_nofollow: bool,
current_attribute_name: Vec,
current_attribute_value: Vec,
last_start_element: Vec,
@@ -31,6 +32,7 @@ impl LinkExtractor {
current_string: Vec::new(),
current_element_name: Vec::new(),
current_element_is_closing: false,
+ current_element_nofollow: false,
current_attribute_name: Vec::new(),
current_attribute_value: Vec::new(),
last_start_element: Vec::new(),
@@ -112,6 +114,18 @@ impl LinkExtractor {
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
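+ // Ignore links on elements marked `rel=nofollow`. The flag may already have been set by an
+ // earlier attribute of the same tag, so it is checked separately, right after the `rel` test.
+ // NOTE(review): attributes flushed before `rel` are not affected by the flag — confirm intended.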
+ if attr == "rel" && value.contains("nofollow") {
+ self.current_element_nofollow = true;
+ }
+ if self.current_element_nofollow {
+ self.current_attribute_name.clear();
+ self.current_attribute_value.clear();
+ return;
+ }
+
let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);
let new_urls = match urls {
@@ -173,6 +187,7 @@ impl Emitter for &mut LinkExtractor {
fn emit_current_tag(&mut self) {
self.flush_old_attribute();
+ self.current_element_nofollow = false; // reset so the flag does not carry over to the next tag
}
fn emit_current_doctype(&mut self) {}
@@ -277,4 +292,20 @@ mod tests {
let uris = extract_html(HTML_INPUT, true);
assert_eq!(uris, expected);
}
+
+    /// Anchors whose `rel` contains `nofollow` are skipped; only the plain link is extracted.
+    #[test]
+    fn test_include_nofollow() {
+        let input = r#"
+        <a rel="nofollow" href="https://foo.com">do not follow me</a>
+        <a rel="canonical nofollow dns-prefetch" href="https://example.com">do not follow me</a>
+        <a href="https://example.org">i'm fine</a>
+        "#;
+        let expected = vec![RawUri {
+            text: "https://example.org".to_string(),
+            element: Some("a".to_string()),
+            attribute: Some("href".to_string()),
+        }];
+        assert_eq!(extract_html(input, false), expected);
+    }
}