Avoid false positives when checking email addresses in HTML input (#1123)

Skip email addresses outside href attributes in HTML
2026-04-17 20:00:58 +00:00 · 2023-07-01 00:12:11 +02:00 · 2023-07-01 00:12:11 +02:00 · 15e420b8ad
commit 15e420b8ad
parent f1adc788cf
3 changed files with 173 additions and 2 deletions
--- a/lychee-lib/src/extract/html5ever.rs
+++ b/lychee-lib/src/extract/html5ever.rs
@ -4,7 +4,7 @@ use html5ever::{
    tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
 };

-use super::{is_verbatim_elem, plaintext::extract_plaintext};
+use super::{is_email_link, is_verbatim_elem, plaintext::extract_plaintext};
 use crate::types::uri::raw::RawUri;

 #[derive(Clone, Default)]
@ -80,6 +80,19 @@ impl TokenSink for LinkExtractor {
                        None => extract_plaintext(&attr.value),
                        Some(urls) => urls
                            .into_iter()
+                            .filter(|url| {
+                                // Only accept email addresses that occur in `href` attributes
+                                // and start with `mailto:`. Technically, email addresses could
+                                // also occur in plain text, but we don't want to extract those
+                                // because of the high false-positive rate.
+                                //
+                                // This ignores links like `<img srcset="v2@1.5x.png">`
+                                let is_email = is_email_link(url);
+                                let is_mailto = url.starts_with("mailto:");
+                                let is_href = attr.name.local.as_ref() == "href";
+
+                                !is_email || (is_mailto && is_href)
+                            })
                            .map(|url| RawUri {
                                text: url.to_string(),
                                element: Some(name.to_string()),
@ -293,4 +306,61 @@ mod tests {
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }
+
+    #[test]
+    fn test_valid_email() {
+        let input = r#"<!DOCTYPE html>
+        <html lang="en-US">
+          <head>
+            <meta charset="utf-8">
+            <title>Test</title>
+          </head>
+          <body>
+            <a href="mailto:foo@bar.com">
+          </body>
+        </html>"#;
+        // A `mailto:` address in an `href` is kept, tagged with its element and attribute.
+        let expected = vec![RawUri {
+            text: "mailto:foo@bar.com".to_string(),
+            element: Some("a".to_string()),
+            attribute: Some("href".to_string()),
+        }];
+        let uris = extract_html(input, false);
+        assert_eq!(uris, expected);
+    }
+    #[test]
+    fn test_exclude_email_without_mailto() {
+        let input = r#"<!DOCTYPE html>
+        <html lang="en-US">
+          <head>
+            <meta charset="utf-8">
+            <title>Test</title>
+          </head>
+          <body>
+            <a href="foo@bar.com">
+          </body>
+        </html>"#;
+        // A bare address in `href` (no `mailto:` prefix) must not be extracted.
+        let expected = vec![];
+        let uris = extract_html(input, false);
+        assert_eq!(uris, expected);
+    }
+
+    #[test]
+    fn test_email_false_postive() {
+        let input = r#"<!DOCTYPE html>
+        <html lang="en-US">
+          <head>
+            <meta charset="utf-8">
+            <title>Test</title>
+          </head>
+          <body>
+            <img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
+          </body>
+        </html>"#;
+        // `v2@1.5x.png` merely resembles an address; this input must yield no URIs.
+        let expected = vec![];
+        let uris = extract_html(input, false);
+        assert_eq!(uris, expected);
+    }
 }
--- a/lychee-lib/src/extract/html5gum.rs
+++ b/lychee-lib/src/extract/html5gum.rs
@ -1,7 +1,7 @@
 use html5gum::{Emitter, Error, State, Tokenizer};

-use super::is_verbatim_elem;
 use super::plaintext::extract_plaintext;
+use super::{is_email_link, is_verbatim_elem};
 use crate::types::uri::raw::RawUri;

 #[derive(Clone)]
@ -170,6 +170,19 @@ impl LinkExtractor {
                None => extract_plaintext(value),
                Some(urls) => urls
                    .into_iter()
+                    .filter(|url| {
+                        // Only accept email addresses that occur in `href` attributes
+                        // and start with `mailto:`. Technically, email addresses could
+                        // also occur in plain text, but we don't want to extract those
+                        // because of the high false-positive rate.
+                        //
+                        // This ignores links like `<img srcset="v2@1.5x.png">`
+                        let is_email = is_email_link(url);
+                        let is_mailto = url.starts_with("mailto:");
+                        let is_href = attr == "href";
+
+                        !is_email || (is_mailto && is_href)
+                    })
                    .map(|url| RawUri {
                        text: url.to_string(),
                        element: Some(name.to_string()),
@ -417,4 +430,61 @@ mod tests {
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }
+
+    #[test]
+    fn test_valid_email() {
+        let input = r#"<!DOCTYPE html>
+        <html lang="en-US">
+          <head>
+            <meta charset="utf-8">
+            <title>Test</title>
+          </head>
+          <body>
+            <a href="mailto:foo@bar.com">
+          </body>
+        </html>"#;
+        // A `mailto:` address in an `href` is kept, tagged with its element and attribute.
+        let expected = vec![RawUri {
+            text: "mailto:foo@bar.com".to_string(),
+            element: Some("a".to_string()),
+            attribute: Some("href".to_string()),
+        }];
+        let uris = extract_html(input, false);
+        assert_eq!(uris, expected);
+    }
+    #[test]
+    fn test_exclude_email_without_mailto() {
+        let input = r#"<!DOCTYPE html>
+        <html lang="en-US">
+          <head>
+            <meta charset="utf-8">
+            <title>Test</title>
+          </head>
+          <body>
+            <a href="foo@bar.com">
+          </body>
+        </html>"#;
+        // A bare address in `href` (no `mailto:` prefix) must not be extracted.
+        let expected = vec![];
+        let uris = extract_html(input, false);
+        assert_eq!(uris, expected);
+    }
+
+    #[test]
+    fn test_email_false_postive() {
+        let input = r#"<!DOCTYPE html>
+        <html lang="en-US">
+          <head>
+            <meta charset="utf-8">
+            <title>Test</title>
+          </head>
+          <body>
+            <img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
+          </body>
+        </html>"#;
+        // `v2@1.5x.png` merely resembles an address; this input must yield no URIs.
+        let expected = vec![];
+        let uris = extract_html(input, false);
+        assert_eq!(uris, expected);
+    }
 }
--- a/lychee-lib/src/extract/mod.rs
+++ b/lychee-lib/src/extract/mod.rs
@ -5,9 +5,30 @@ mod html5gum;
 mod markdown;
 mod plaintext;

+use linkify::{LinkFinder, LinkKind};
 use markdown::extract_markdown;
 use plaintext::extract_plaintext;

+/// Check if `input` consists solely of an email address (optional `mailto:` prefix).
+///
+/// This operates on the raw URL strings, not the linkified version, because it
+/// gets used in the HTML extractors, which parse the HTML attributes directly
+/// and return the raw strings. Only the first email match is inspected.
+///
+/// Note that `LinkFinder::links()` is lazy and traverses the input in `O(n)`,
+/// so there should be no big performance penalty for calling this function.
+pub(crate) fn is_email_link(input: &str) -> bool {
+    let mut findings = LinkFinder::new().kinds(&[LinkKind::Email]).links(input);
+    let email = match findings.next() {
+        None => return false,
+        Some(email) => email.as_str(),
+    };
+
+    // The first match must cover the whole input, so strip a leading
+    // "mailto:" (if present) before comparing against it.
+    input.strip_prefix("mailto:").unwrap_or(input) == email
+}
+
 /// Check if the given element is in the list of preformatted ("verbatim") tags.
 ///
 /// These will be excluded from link checking by default.
@ -341,4 +362,14 @@ mod tests {

        assert_eq!(links, expected_links);
    }
+
+    #[test]
+    fn test_is_email_link() {
+        assert!(is_email_link("mailto:steve@apple.com"));
+        assert!(!is_email_link("mailto:steve@apple.com in a sentence"));
+        // The same holds without the `mailto:` prefix; an ordinary URL does not match.
+        assert!(is_email_link("foo@example.org"));
+        assert!(!is_email_link("foo@example.org in sentence"));
+        assert!(!is_email_link("https://example.org"));
+    }
 }