From debe9587665df75ae2dbf295bfef2418f78a10ed Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 4 Apr 2022 10:32:00 +0200 Subject: [PATCH] Add support for nofollow (#572) --- lychee-lib/src/extract/html5ever.rs | 25 +++++++++++++++++++++++ lychee-lib/src/extract/html5gum.rs | 31 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/lychee-lib/src/extract/html5ever.rs b/lychee-lib/src/extract/html5ever.rs index 5f5cd53..b6afcbe 100644 --- a/lychee-lib/src/extract/html5ever.rs +++ b/lychee-lib/src/extract/html5ever.rs @@ -39,6 +39,15 @@ impl TokenSink for LinkExtractor { return TokenSinkResult::Continue; } + // Check for rel=nofollow. We only extract the first `rel` attribute. + // This is correct as per https://html.spec.whatwg.org/multipage/syntax.html#attributes-0, which states + // "There must never be two or more attributes on the same start tag whose names are an ASCII case-insensitive match for each other." + if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") { + if rel.value.contains("nofollow") { + return TokenSinkResult::Continue; + } + } + for attr in attrs { let urls = LinkExtractor::extract_urls_from_elem_attr( &attr.name.local, @@ -203,4 +212,20 @@ mod tests { let uris = extract_html(HTML_INPUT, true); assert_eq!(uris, expected); } + + #[test] + fn test_include_nofollow() { + let input = r#" + do not follow me + do not follow me + do not follow me + "#; + let expected = vec![RawUri { + text: "https://example.org".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + let uris = extract_html(input, false); + assert_eq!(uris, expected); + } } diff --git a/lychee-lib/src/extract/html5gum.rs b/lychee-lib/src/extract/html5gum.rs index 379d4b9..55d2947 100644 --- a/lychee-lib/src/extract/html5gum.rs +++ b/lychee-lib/src/extract/html5gum.rs @@ -11,6 +11,7 @@ struct LinkExtractor { current_string: Vec, current_element_name: Vec, current_element_is_closing: bool, + 
current_element_nofollow: bool, current_attribute_name: Vec, current_attribute_value: Vec, last_start_element: Vec, @@ -31,6 +32,7 @@ impl LinkExtractor { current_string: Vec::new(), current_element_name: Vec::new(), current_element_is_closing: false, + current_element_nofollow: false, current_attribute_name: Vec::new(), current_attribute_value: Vec::new(), last_start_element: Vec::new(), @@ -112,6 +114,18 @@ impl LinkExtractor { let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) }; let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) }; + // Ignore links with rel=nofollow. + // The flag may already have been set by an earlier attribute of the same tag, so it is checked + // separately right after. NOTE(review): an attribute flushed before `rel` is seen (e.g. `href` placed ahead of `rel`) has presumably been handled already -- confirm. + if attr == "rel" && value.contains("nofollow") { + self.current_element_nofollow = true; + } + if self.current_element_nofollow { + self.current_attribute_name.clear(); + self.current_attribute_value.clear(); + return; + } + let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value); let new_urls = match urls { @@ -173,6 +187,7 @@ impl Emitter for &mut LinkExtractor { fn emit_current_tag(&mut self) { self.flush_old_attribute(); + self.current_element_nofollow = false; } fn emit_current_doctype(&mut self) {} @@ -277,4 +292,20 @@ mod tests { let uris = extract_html(HTML_INPUT, true); assert_eq!(uris, expected); } + + #[test] + fn test_include_nofollow() { + let input = r#" + do not follow me + do not follow me + i'm fine + "#; + let expected = vec![RawUri { + text: "https://example.org".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + let uris = extract_html(input, false); + assert_eq!(uris, expected); + } }