Make checking email addresses optional (#1171)

E-Mail checks cause too many false-postives,
so we put them behind a flag.

* `--exclude-mail` is deprecated (to be removed in 1.0)
* `--include-mail` is the new flag

This PR also removes the obsolete tests for `--exclude-file`, which was superseded by `.lycheeignore`.

Fixes #1089
This commit is contained in:
Matthias Endler 2023-07-19 19:58:38 +02:00 committed by GitHub
parent 0760d9241b
commit 04887ee293
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 113 additions and 79 deletions

View file

@ -372,7 +372,10 @@ Options:
Exclude loopback IP address range and localhost from checking
--exclude-mail
Exclude all mail addresses from checking
Exclude all mail addresses from checking (deprecated; excluded by default)
--include-mail
Also check email addresses
--remap <REMAP>
Remap URI matching pattern to different URI

View file

@ -1,6 +1,5 @@
https://endler.dev
test@example.com
foo@bar.dev
https://example.com
octocat+github@github.com
mailto:test2@example.com

View file

@ -1,3 +0,0 @@
https://en.wikipedia.org/*
https://ldra.com
https://url-does-not-exist

View file

@ -1 +0,0 @@
https://i.creativecommons.org/p/zero/1.0/88x31.png

View file

@ -37,6 +37,25 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
None => None,
};
// `exclude_mail` will be removed in 1.0. Until then, we need to support it.
// Therefore, we need to check if both `include_mail` and `exclude_mail` are set to `true`
// and return an error if that's the case.
if cfg.include_mail && cfg.exclude_mail {
return Err(anyhow::anyhow!(
"Cannot set both `include-mail` and `exclude-mail` to true"
));
}
// By default, clap sets `exclude_mail` to `false`.
// Therefore, we need to check if `exclude_mail` is explicitly set to
// `true`. If so, we need to set `include_mail` to `false`.
// Otherwise, we use the value of `include_mail`.
let include_mail = if cfg.exclude_mail {
false
} else {
cfg.include_mail
};
ClientBuilder::builder()
.remaps(remaps)
.includes(includes)
@ -45,7 +64,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
.exclude_private_ips(cfg.exclude_private)
.exclude_link_local_ips(cfg.exclude_link_local)
.exclude_loopback_ips(cfg.exclude_loopback)
.exclude_mail(cfg.exclude_mail)
.include_mail(include_mail)
.max_redirects(cfg.max_redirects)
.user_agent(cfg.user_agent.clone())
.allow_insecure(cfg.insecure)

View file

@ -177,9 +177,14 @@ fn load_config() -> Result<LycheeOptions> {
opts.config.exclude.append(&mut read_lines(&lycheeignore)?);
}
// TODO: Remove this warning and the parameter in a future release
// TODO: Remove this warning and the parameter with 1.0
if !&opts.config.exclude_file.is_empty() {
warn!("WARNING: `--exclude-file` is deprecated and will soon be removed; use `{}` file to ignore URL patterns instead. To exclude paths of files and directories, use `--exclude-path`.", LYCHEE_IGNORE_FILE);
warn!("WARNING: `--exclude-file` is deprecated and will soon be removed; use the `{}` file to ignore URL patterns instead. To exclude paths of files and directories, use `--exclude-path`.", LYCHEE_IGNORE_FILE);
}
// TODO: Remove this warning and the parameter with 1.0
if opts.config.exclude_mail {
warn!("WARNING: `--exclude-mail` is deprecated and will soon be removed; E-Mail is no longer checked by default. Use `--include-mail` to enable E-Mail checking.");
}
// Load excludes from file

View file

@ -282,10 +282,16 @@ pub(crate) struct Config {
pub(crate) exclude_loopback: bool,
/// Exclude all mail addresses from checking
/// (deprecated; excluded by default)
#[arg(long)]
#[serde(default)]
pub(crate) exclude_mail: bool,
/// Also check email addresses
#[arg(long)]
#[serde(default)]
pub(crate) include_mail: bool,
/// Remap URI matching pattern to different URI
#[serde(default)]
#[arg(long)]

View file

@ -121,16 +121,29 @@ mod cli {
}
#[test]
fn test_exclude_email() -> Result<()> {
fn test_email() -> Result<()> {
test_json_output!(
"TEST_EMAIL.md",
MockResponseStats {
total: 6,
excludes: 4,
successful: 2,
total: 5,
excludes: 0,
successful: 5,
..MockResponseStats::default()
},
"--exclude-mail"
"--include-mail"
)
}
#[test]
fn test_exclude_email_by_default() -> Result<()> {
test_json_output!(
"TEST_EMAIL.md",
MockResponseStats {
total: 5,
excludes: 3,
successful: 2,
..MockResponseStats::default()
}
)
}
@ -141,6 +154,7 @@ mod cli {
cmd.arg("--dump")
.arg(input)
.arg("--include-mail")
.assert()
.success()
.stdout(contains("hello@example.org?subject=%5BHello%5D"));
@ -155,6 +169,7 @@ mod cli {
cmd.arg("--dump")
.arg(input)
.arg("--include-mail")
.assert()
.success()
.stdout(contains("hello@example.org?subject=%5BHello%5D"));
@ -475,7 +490,8 @@ mod cli {
"TEST.md",
MockResponseStats {
total: 11,
successful: 11,
successful: 9,
excludes: 2,
..MockResponseStats::default()
}
)
@ -491,6 +507,7 @@ mod cli {
cmd.arg("--output")
.arg(&outfile)
.arg("--dump")
.arg("--include-mail")
.arg(test_path)
.assert()
.success();
@ -533,42 +550,7 @@ mod cli {
.arg("https://ldra.com/")
.assert()
.success()
.stdout(contains("2 Excluded"));
Ok(())
}
#[test]
fn test_exclude_file() -> Result<()> {
let mut cmd = main_command();
let test_path = fixtures_path().join("TEST.md");
let excludes_path = fixtures_path().join("TEST_EXCLUDE_1.txt");
cmd.arg(test_path)
.arg("--exclude-file")
.arg(excludes_path)
.assert()
.success()
.stdout(contains("2 Excluded"));
Ok(())
}
#[test]
fn test_multiple_exclude_files() -> Result<()> {
let mut cmd = main_command();
let test_path = fixtures_path().join("TEST.md");
let excludes_path1 = fixtures_path().join("TEST_EXCLUDE_1.txt");
let excludes_path2 = fixtures_path().join("TEST_EXCLUDE_2.txt");
cmd.arg(test_path)
.arg("--exclude-file")
.arg(excludes_path1)
.arg("--exclude-file")
.arg(excludes_path2)
.assert()
.success()
.stdout(contains("3 Excluded"));
.stdout(contains("4 Excluded"));
Ok(())
}

View file

@ -32,6 +32,7 @@ mod cli {
let cmd = cmd
.arg(input)
.arg("--include-mail")
.arg("--dump")
.assert()
.success()

View file

@ -179,8 +179,8 @@ pub struct ClientBuilder {
/// [IETF RFC 4291 section 2.5.3]: https://tools.ietf.org/html/rfc4291#section-2.5.3
exclude_loopback_ips: bool,
/// When `true`, don't check mail addresses.
exclude_mail: bool,
/// When `true`, check mail addresses.
include_mail: bool,
/// Maximum number of redirects per request before returning an error.
///
@ -367,7 +367,7 @@ impl ClientBuilder {
exclude_private_ips: self.exclude_all_private || self.exclude_private_ips,
exclude_link_local_ips: self.exclude_all_private || self.exclude_link_local_ips,
exclude_loopback_ips: self.exclude_all_private || self.exclude_loopback_ips,
exclude_mail: self.exclude_mail,
include_mail: self.include_mail,
};
let quirks = Quirks::default();
@ -837,19 +837,8 @@ mod tests {
}
#[tokio::test]
async fn test_exclude_mail() {
async fn test_exclude_mail_by_default() {
let client = ClientBuilder::builder()
.exclude_mail(false)
.exclude_all_private(true)
.build()
.client()
.unwrap();
assert!(!client.is_excluded(&Uri {
url: "mailto://mail@example.com".try_into().unwrap()
}));
let client = ClientBuilder::builder()
.exclude_mail(true)
.exclude_all_private(true)
.build()
.client()
@ -859,6 +848,29 @@ mod tests {
}));
}
#[tokio::test]
async fn test_include_mail() {
let client = ClientBuilder::builder()
.include_mail(false)
.exclude_all_private(true)
.build()
.client()
.unwrap();
assert!(client.is_excluded(&Uri {
url: "mailto://mail@example.com".try_into().unwrap()
}));
let client = ClientBuilder::builder()
.include_mail(true)
.exclude_all_private(true)
.build()
.client()
.unwrap();
assert!(!client.is_excluded(&Uri {
url: "mailto://mail@example.com".try_into().unwrap()
}));
}
#[tokio::test]
async fn test_require_https() {
let client = ClientBuilder::builder().build().client().unwrap();

View file

@ -113,15 +113,15 @@ pub struct Filter {
/// For IPv6: ::1/128
pub exclude_loopback_ips: bool,
/// Example: octocat@github.com
pub exclude_mail: bool,
pub include_mail: bool,
}
impl Filter {
#[inline]
#[must_use]
/// Whether e-mails aren't checked
/// Whether e-mails aren't checked (which is the default)
pub fn is_mail_excluded(&self, uri: &Uri) -> bool {
self.exclude_mail && uri.is_mail()
uri.is_mail() && !self.include_mail
}
#[must_use]
@ -179,7 +179,7 @@ impl Filter {
/// # Details
///
/// 1. If any of the following conditions are met, the URI is excluded:
/// - If it's a mail address and it's configured to ignore mail addresses.
/// - If it's a mail address and it's not configured to include mail addresses.
/// - If the IP address belongs to a type that is configured to exclude.
/// - If the host belongs to a type that is configured to exclude.
/// - If the scheme of URI is not the allowed scheme.
@ -196,10 +196,10 @@ impl Filter {
#[must_use]
pub fn is_excluded(&self, uri: &Uri) -> bool {
// Skip mail address, specific IP, specific host and scheme
if self.is_mail_excluded(uri)
|| self.is_ip_excluded(uri)
if self.is_scheme_excluded(uri)
|| self.is_host_excluded(uri)
|| self.is_scheme_excluded(uri)
|| self.is_ip_excluded(uri)
|| self.is_mail_excluded(uri)
|| is_example_domain(uri)
|| is_unsupported_domain(uri)
{
@ -211,7 +211,7 @@ impl Filter {
if self.is_includes_empty() {
if self.is_excludes_empty() {
// Both excludes and includes rules are empty:
// *Presumably included* unless it's false positive
// *Presumably included* unless it's a false positive
return is_false_positive(input);
}
} else if self.is_includes_match(input) {
@ -363,9 +363,8 @@ mod tests {
}
#[test]
fn test_exclude_mail() {
fn test_exclude_mail_by_default() {
let filter = Filter {
exclude_mail: true,
..Filter::default()
};
@ -374,6 +373,18 @@ mod tests {
assert!(!filter.is_excluded(&website("http://bar.dev")));
}
#[test]
fn test_include_mail() {
let filter = Filter {
include_mail: true,
..Filter::default()
};
assert!(!filter.is_excluded(&mail("mail@example.com")));
assert!(!filter.is_excluded(&mail("foo@bar.dev")));
assert!(!filter.is_excluded(&website("http://bar.dev")));
}
#[test]
fn test_exclude_regex() {
let excludes = Excludes {
@ -389,7 +400,7 @@ mod tests {
assert!(filter.is_excluded(&mail("mail@example.com")));
assert!(!filter.is_excluded(&website("http://bar.dev")));
assert!(!filter.is_excluded(&mail("foo@bar.dev")));
assert!(filter.is_excluded(&mail("foo@bar.dev")));
}
#[test]
fn test_exclude_include_regex() {

View file

@ -110,5 +110,5 @@ exclude_link_local = false
# Exclude loopback IP address range and localhost from checking.
exclude_loopback = false
# Exclude all mail addresses from checking.
exclude_mail = false
# Check mail addresses
include_mail = true