Avoid false positives when checking email addresses in HTML input (#1123)

Skip email addresses outside href attributes in HTML
This commit is contained in:
Matthias Endler 2023-07-01 00:12:11 +02:00 committed by GitHub
parent f1adc788cf
commit 15e420b8ad
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 173 additions and 2 deletions

View file

@ -4,7 +4,7 @@ use html5ever::{
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
};
use super::{is_verbatim_elem, plaintext::extract_plaintext};
use super::{is_email_link, is_verbatim_elem, plaintext::extract_plaintext};
use crate::types::uri::raw::RawUri;
#[derive(Clone, Default)]
@ -80,6 +80,19 @@ impl TokenSink for LinkExtractor {
None => extract_plaintext(&attr.value),
Some(urls) => urls
.into_iter()
.filter(|url| {
// Only accept email addresses that occur in `href` attributes and
// start with `mailto:`. Technically, email addresses could also occur
// in plain text, but we don't want to extract those because of the
// high false-positive rate.
//
// This ignores links like `<img srcset="v2@1.5x.png">`
let is_email = is_email_link(url);
let is_mailto = url.starts_with("mailto:");
let is_href = attr.name.local.as_ref() == "href";
!is_email || (is_mailto && is_href)
})
.map(|url| RawUri {
text: url.to_string(),
element: Some(name.to_string()),
@ -293,4 +306,61 @@ mod tests {
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}
/// A `mailto:` address placed in an `href` attribute must be reported
/// as a link by this module's `extract_html`.
#[test]
fn test_valid_email() {
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<a href="mailto:foo@bar.com">
</body>
</html>"#;

    // Run the extractor first, then build the single URI we expect back.
    let extracted = extract_html(html, false);

    let wanted = vec![RawUri {
        text: String::from("mailto:foo@bar.com"),
        element: Some(String::from("a")),
        attribute: Some(String::from("href")),
    }];
    assert_eq!(extracted, wanted);
}
/// A bare email address in an `href` attribute (no `mailto:` prefix)
/// must not be reported as a link.
#[test]
fn test_exclude_email_without_mailto() {
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<a href="foo@bar.com">
</body>
</html>"#;

    let extracted = extract_html(html, false);

    // Nothing should survive the email filter.
    let wanted: Vec<RawUri> = Vec::new();
    assert_eq!(extracted, wanted);
}
/// An `srcset` value that merely looks like an email address
/// (`v2@1.5x.png`) must not be reported as a link.
#[test]
fn test_email_false_postive() {
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
</body>
</html>"#;

    let extracted = extract_html(html, false);

    // The image candidate is not an email link, so no URIs are expected.
    let wanted: Vec<RawUri> = Vec::new();
    assert_eq!(extracted, wanted);
}
}

View file

@ -1,7 +1,7 @@
use html5gum::{Emitter, Error, State, Tokenizer};
use super::is_verbatim_elem;
use super::plaintext::extract_plaintext;
use super::{is_email_link, is_verbatim_elem};
use crate::types::uri::raw::RawUri;
#[derive(Clone)]
@ -170,6 +170,19 @@ impl LinkExtractor {
None => extract_plaintext(value),
Some(urls) => urls
.into_iter()
.filter(|url| {
// Only accept email addresses that occur in `href` attributes and
// start with `mailto:`. Technically, email addresses could also occur
// in plain text, but we don't want to extract those because of the
// high false-positive rate.
//
// This ignores links like `<img srcset="v2@1.5x.png">`
let is_email = is_email_link(url);
let is_mailto = url.starts_with("mailto:");
let is_href = attr == "href";
!is_email || (is_mailto && is_href)
})
.map(|url| RawUri {
text: url.to_string(),
element: Some(name.to_string()),
@ -417,4 +430,61 @@ mod tests {
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}
/// A `mailto:` address placed in an `href` attribute must be reported
/// as a link by this module's `extract_html`.
#[test]
fn test_valid_email() {
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<a href="mailto:foo@bar.com">
</body>
</html>"#;

    // Run the extractor first, then build the single URI we expect back.
    let extracted = extract_html(html, false);

    let wanted = vec![RawUri {
        text: String::from("mailto:foo@bar.com"),
        element: Some(String::from("a")),
        attribute: Some(String::from("href")),
    }];
    assert_eq!(extracted, wanted);
}
/// A bare email address in an `href` attribute (no `mailto:` prefix)
/// must not be reported as a link.
#[test]
fn test_exclude_email_without_mailto() {
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<a href="foo@bar.com">
</body>
</html>"#;

    let extracted = extract_html(html, false);

    // Nothing should survive the email filter.
    let wanted: Vec<RawUri> = Vec::new();
    assert_eq!(extracted, wanted);
}
/// An `srcset` value that merely looks like an email address
/// (`v2@1.5x.png`) must not be reported as a link.
#[test]
fn test_email_false_postive() {
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
</body>
</html>"#;

    let extracted = extract_html(html, false);

    // The image candidate is not an email link, so no URIs are expected.
    let wanted: Vec<RawUri> = Vec::new();
    assert_eq!(extracted, wanted);
}
}

View file

@ -5,9 +5,30 @@ mod html5gum;
mod markdown;
mod plaintext;
use linkify::{LinkFinder, LinkKind};
use markdown::extract_markdown;
use plaintext::extract_plaintext;
/// Report whether `input`, taken as a whole, is an email link.
///
/// The check runs on the raw string rather than on a linkified value: the
/// HTML extractors read attribute values directly and hand over the raw
/// strings, so that is what this function receives.
///
/// Only the first email address that `linkify` finds in `input` is
/// considered. The result is `true` when that address is equal to `input`
/// with an optional leading `mailto:` removed, and `false` when no address is
/// found or when the address covers only part of the string.
///
/// `LinkFinder::links()` is lazy and walks the input in `O(n)`, so calling
/// this function should not carry a big performance penalty.
pub(crate) fn is_email_link(input: &str) -> bool {
    // The text the detected address has to equal: the input minus a
    // `mailto:` prefix, if there is one.
    let address_part = input.strip_prefix("mailto:").unwrap_or(input);

    LinkFinder::new()
        .kinds(&[LinkKind::Email])
        .links(input)
        .next()
        // No finding at all means this is not an email link.
        .map_or(false, |found| found.as_str() == address_part)
}
/// Check if the given element is in the list of preformatted ("verbatim") tags.
///
/// These will be excluded from link checking by default.
@ -341,4 +362,14 @@ mod tests {
assert_eq!(links, expected_links);
}
/// `is_email_link` accepts a string only when the email address makes up
/// the whole input, with or without a `mailto:` prefix.
#[test]
fn test_is_email_link() {
    // (input, expected result)
    let cases = [
        ("mailto:steve@apple.com", true),
        ("mailto:steve@apple.com in a sentence", false),
        ("foo@example.org", true),
        ("foo@example.org in sentence", false),
        ("https://example.org", false),
    ];

    for &(candidate, expected) in &cases {
        assert_eq!(is_email_link(candidate), expected, "input: {:?}", candidate);
    }
}
}