mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-17 20:00:58 +00:00
Avoid false positives when checking email addresses in HTML input (#1123)
Skip email addresses outside href attributes in HTML
This commit is contained in:
parent
f1adc788cf
commit
15e420b8ad
3 changed files with 173 additions and 2 deletions
|
|
@ -4,7 +4,7 @@ use html5ever::{
|
|||
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
|
||||
};
|
||||
|
||||
use super::{is_verbatim_elem, plaintext::extract_plaintext};
|
||||
use super::{is_email_link, is_verbatim_elem, plaintext::extract_plaintext};
|
||||
use crate::types::uri::raw::RawUri;
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
|
|
@ -80,6 +80,19 @@ impl TokenSink for LinkExtractor {
|
|||
None => extract_plaintext(&attr.value),
|
||||
Some(urls) => urls
|
||||
.into_iter()
|
||||
.filter(|url| {
|
||||
// Only accept email addresses, which occur in `href` attributes
|
||||
// and start with `mailto:`. Technically, email addresses could
|
||||
// also occur in plain text, but we don't want to extract those
|
||||
// because of the high false positive rate.
|
||||
//
|
||||
// This ignores links like `<img srcset="v2@1.5x.png">`
|
||||
let is_email = is_email_link(url);
|
||||
let is_mailto = url.starts_with("mailto:");
|
||||
let is_href = attr.name.local.as_ref() == "href";
|
||||
|
||||
!is_email || (is_mailto && is_href)
|
||||
})
|
||||
.map(|url| RawUri {
|
||||
text: url.to_string(),
|
||||
element: Some(name.to_string()),
|
||||
|
|
@ -293,4 +306,61 @@ mod tests {
|
|||
let uris = extract_html(input, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
fn test_valid_email() {
    // A `mailto:` address inside an `href` attribute must be extracted.
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<a href="mailto:foo@bar.com">
</body>
</html>"#;

    let mailto = RawUri {
        text: String::from("mailto:foo@bar.com"),
        element: Some(String::from("a")),
        attribute: Some(String::from("href")),
    };

    let uris = extract_html(html, false);
    assert_eq!(uris, vec![mailto]);
}
|
||||
#[test]
fn test_exclude_email_without_mailto() {
    // A bare address in `href` (no `mailto:` prefix) must not be extracted.
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<a href="foo@bar.com">
</body>
</html>"#;

    let nothing: Vec<RawUri> = Vec::new();

    let uris = extract_html(html, false);
    assert_eq!(uris, nothing);
}
|
||||
|
||||
#[test]
fn test_email_false_postive() {
    // `v2@1.5x.png` in `srcset` resembles an email address, but nothing
    // may be extracted from this document.
    let html = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
</body>
</html>"#;

    let nothing: Vec<RawUri> = Vec::new();

    let uris = extract_html(html, false);
    assert_eq!(uris, nothing);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
use html5gum::{Emitter, Error, State, Tokenizer};
|
||||
|
||||
use super::is_verbatim_elem;
|
||||
use super::plaintext::extract_plaintext;
|
||||
use super::{is_email_link, is_verbatim_elem};
|
||||
use crate::types::uri::raw::RawUri;
|
||||
|
||||
#[derive(Clone)]
|
||||
|
|
@ -170,6 +170,19 @@ impl LinkExtractor {
|
|||
None => extract_plaintext(value),
|
||||
Some(urls) => urls
|
||||
.into_iter()
|
||||
.filter(|url| {
|
||||
// Only accept email addresses, which occur in `href` attributes
|
||||
// and start with `mailto:`. Technically, email addresses could
|
||||
// also occur in plain text, but we don't want to extract those
|
||||
// because of the high false positive rate.
|
||||
//
|
||||
// This ignores links like `<img srcset="v2@1.5x.png">`
|
||||
let is_email = is_email_link(url);
|
||||
let is_mailto = url.starts_with("mailto:");
|
||||
let is_href = attr == "href";
|
||||
|
||||
!is_email || (is_mailto && is_href)
|
||||
})
|
||||
.map(|url| RawUri {
|
||||
text: url.to_string(),
|
||||
element: Some(name.to_string()),
|
||||
|
|
@ -417,4 +430,61 @@ mod tests {
|
|||
let uris = extract_html(input, false);
|
||||
assert_eq!(uris, expected);
|
||||
}
|
||||
|
||||
#[test]
fn test_valid_email() {
    // The only link in this page is a `mailto:` address in an `href`;
    // it has to come back as a single raw URI on the `a` element.
    let page = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<a href="mailto:foo@bar.com">
</body>
</html>"#;

    let wanted = vec![RawUri {
        text: "mailto:foo@bar.com".to_owned(),
        element: Some("a".to_owned()),
        attribute: Some("href".to_owned()),
    }];

    let uris = extract_html(page, false);
    assert_eq!(uris, wanted);
}
|
||||
#[test]
fn test_exclude_email_without_mailto() {
    // The `href` holds an address without the `mailto:` prefix, so the
    // extractor is expected to return no URIs at all.
    let page = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<a href="foo@bar.com">
</body>
</html>"#;

    let wanted: Vec<RawUri> = Vec::new();

    let uris = extract_html(page, false);
    assert_eq!(uris, wanted);
}
|
||||
|
||||
#[test]
fn test_email_false_postive() {
    // The image file name `v2@1.5x.png` contains an `@`; the extractor is
    // expected to return no URIs for this page.
    let page = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="utf-8">
<title>Test</title>
</head>
<body>
<img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
</body>
</html>"#;

    let wanted: Vec<RawUri> = Vec::new();

    let uris = extract_html(page, false);
    assert_eq!(uris, wanted);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,9 +5,30 @@ mod html5gum;
|
|||
mod markdown;
|
||||
mod plaintext;
|
||||
|
||||
use linkify::{LinkFinder, LinkKind};
|
||||
use markdown::extract_markdown;
|
||||
use plaintext::extract_plaintext;
|
||||
|
||||
/// Check if the given URL is an email link.
|
||||
///
|
||||
/// This operates on the raw URL strings, not the linkified version because it
|
||||
/// gets used in the HTML extractors, which parse the HTML attributes directly
|
||||
/// and return the raw strings.
|
||||
///
|
||||
/// Note that `LinkFinder::links()` is lazy and traverses the input in `O(n)`,
|
||||
/// so there should be no big performance penalty for calling this function.
|
||||
pub(crate) fn is_email_link(input: &str) -> bool {
|
||||
let mut findings = LinkFinder::new().kinds(&[LinkKind::Email]).links(input);
|
||||
let email = match findings.next() {
|
||||
None => return false,
|
||||
Some(email) => email.as_str(),
|
||||
};
|
||||
|
||||
// Email needs to match the full string.
|
||||
// Strip the "mailto:" prefix if it exists.
|
||||
input.strip_prefix("mailto:").unwrap_or(input) == email
|
||||
}
|
||||
|
||||
/// Check if the given element is in the list of preformatted ("verbatim") tags.
|
||||
///
|
||||
/// These will be excluded from link checking by default.
|
||||
|
|
@ -341,4 +362,14 @@ mod tests {
|
|||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
#[test]
fn test_is_email_link() {
    // Accepted: the whole string is an address, with or without `mailto:`.
    assert!(is_email_link("foo@example.org"));
    assert!(is_email_link("mailto:steve@apple.com"));

    // Rejected: the address is only part of a longer string.
    assert!(!is_email_link("foo@example.org in sentence"));
    assert!(!is_email_link("mailto:steve@apple.com in a sentence"));

    // Rejected: not an email address at all.
    assert!(!is_email_link("https://example.org"));
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue