Fix parsing error of email addresses with query params (#809)

Email addresses with query parameters often get used in
contact forms on websites. They can also be found in
other documents like Markdown.

A common use-case is to add a subject line to the email
as a parameter e.g. `mailto:mail@example.com?subject="Hello"`.

Previously we handled such cases incorrectly by recognizing
them as files. The reason was that our email parsing was too strict
to allow for that use-case.
With `email_address` we switched to a more permissive parser.

Note that this does not affect the actual address email checking,
as this is still done `check-if-email-exists`, which has more strict
check functionality.
This commit is contained in:
Matthias 2022-11-05 23:40:33 +01:00 committed by GitHub
parent 264af23822
commit d61105edbb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 132 additions and 31 deletions

11
Cargo.lock generated
View file

@ -981,6 +981,15 @@ version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "email_address"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2153bd83ebc09db15bcbdc3e2194d901804952e3dc96967e1cd3b0c5c32d112"
dependencies = [
"serde",
]
[[package]]
name = "encode_unicode"
version = "0.3.6"
@ -1800,7 +1809,7 @@ dependencies = [
"cached",
"check-if-email-exists",
"doc-comment",
"fast_chemail",
"email_address",
"futures",
"glob",
"html5ever",

View file

@ -55,21 +55,21 @@ Or, you can accept all content/MIME types: `--headers "accept=*/*"`.
See more info about the Accept header
[over at MDN](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept).
## Unreachable Mail Address
We use https://github.com/reacherhq/check-if-email-exists for email checking.
You can test your mail address with curl:
```
```bash
curl -X POST \
'https://api.reacher.email/v0/check_email' \
-H 'content-type: application/json' \
-H 'authorization: test_api_token' \
-d '{"to_email": "box@domain.test"}'
```
Some settings on your mail server (such as SPF Policy, DNSBL) may prevent your email from being verified.
If you have an error with checking a working email, you can disable this check using the
[commandline parameter](https://github.com/lycheeverse/lychee#commandline-parameters) `--exclude-mail`.
Some settings on your mail server (such as `SPF` Policy, `DNSBL`) may prevent
your email from being verified. If you have an error with checking a working
email, you can disable this check using the [commandline
parameter](https://github.com/lycheeverse/lychee#commandline-parameters)
`--exclude-mail`.

View file

@ -9,9 +9,9 @@ use std::{collections::HashSet, time::Duration};
#[allow(clippy::trivial_regex)]
async fn main() -> Result<()> {
// Excludes
let excludes = Some(RegexSet::new(&[r"example"]).unwrap());
let excludes = Some(RegexSet::new([r"example"]).unwrap());
// Includes take precedence over excludes
let includes = Some(RegexSet::new(&[r"example.com"]).unwrap());
let includes = Some(RegexSet::new([r"example.com"]).unwrap());
// Set custom request headers
let mut headers = HeaderMap::new();

14
fixtures/TEST_EMAIL_QUERY_PARAMS.html vendored Normal file
View file

@ -0,0 +1,14 @@
<html lang="en">
<head>
<title>Lychee Test</title>
</head>
<body>
<p>
Please email
<a href="mailto:hello@example.org?subject=%5BHello%5D"
>hello@example.org</a
>
for any questions.
</p>
</body>
</html>

1
fixtures/TEST_EMAIL_QUERY_PARAMS.md vendored Normal file
View file

@ -0,0 +1 @@
Please email [hello@example.org](mailto:hello@example.org?subject=%5BHello%5D) for any questions.

View file

@ -127,6 +127,34 @@ mod cli {
)
}
#[test]
fn test_email_html_with_subject() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_EMAIL_QUERY_PARAMS.html");
cmd.arg("--dump")
.arg(input)
.assert()
.success()
.stdout(contains("hello@example.org?subject=%5BHello%5D"));
Ok(())
}
#[test]
fn test_email_markdown_with_subject() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_EMAIL_QUERY_PARAMS.md");
cmd.arg("--dump")
.arg(input)
.assert()
.success()
.stdout(contains("hello@example.org?subject=%5BHello%5D"));
Ok(())
}
/// Test that a GitHub link can be checked without specifying the token.
#[test]
fn test_check_github_no_token() -> Result<()> {

View file

@ -18,7 +18,7 @@ version = "0.10.1"
[dependencies]
check-if-email-exists = "0.9.0"
fast_chemail = "0.9.6"
email_address = "0.2.4"
glob = "0.3.0"
http = "0.2.8"
linkify = "0.9.0"

View file

@ -549,8 +549,13 @@ impl Client {
}
/// Check a mail address, or equivalently a `mailto` URI.
///
/// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
/// which are ignored by this check. The are not part of the mail address
/// and instead passed to a mail client.
pub async fn check_mail(&self, uri: &Uri) -> Status {
let input = CheckEmailInput::new(uri.as_str().to_owned());
let address = uri.url.path().to_string();
let input = CheckEmailInput::new(address);
let result = &(check_email(&input).await);
if let Reachable::Invalid = result.is_reachable {

View file

@ -304,7 +304,7 @@ mod tests {
#[test]
fn test_overwrite_false_positives() {
let includes = Includes {
regex: RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap(),
regex: RegexSet::new([r"http://www.w3.org/1999/xhtml"]).unwrap(),
};
let filter = Filter {
includes: Some(includes),
@ -316,7 +316,7 @@ mod tests {
#[test]
fn test_include_regex() {
let includes = Includes {
regex: RegexSet::new(&[r"foo.example.com"]).unwrap(),
regex: RegexSet::new([r"foo.example.com"]).unwrap(),
};
let filter = Filter {
includes: Some(includes),
@ -344,7 +344,7 @@ mod tests {
#[test]
fn test_exclude_regex() {
let excludes = Excludes {
regex: RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap(),
regex: RegexSet::new([r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap(),
};
let filter = Filter {
excludes: Some(excludes),
@ -361,10 +361,10 @@ mod tests {
#[test]
fn test_exclude_include_regex() {
let includes = Includes {
regex: RegexSet::new(&[r"foo.example.com"]).unwrap(),
regex: RegexSet::new([r"foo.example.com"]).unwrap(),
};
let excludes = Excludes {
regex: RegexSet::new(&[r"example.com"]).unwrap(),
regex: RegexSet::new([r"example.com"]).unwrap(),
};
let filter = Filter {
includes: Some(includes),

View file

@ -97,8 +97,8 @@ fn join(base: PathBuf, dst: &Path) -> PathBuf {
//
// Unfortunately requires real files for `fs::canonicalize`.
pub(crate) fn contains(parent: &PathBuf, child: &PathBuf) -> Result<bool> {
let parent = fs::canonicalize(&parent)?;
let child = fs::canonicalize(&child)?;
let parent = fs::canonicalize(parent)?;
let child = fs::canonicalize(child)?;
Ok(child.starts_with(parent))
}

View file

@ -43,6 +43,9 @@ pub enum ErrorKind {
/// Invalid Github URL
#[error("Github URL is invalid: {0}")]
InvalidGithubUrl(String),
/// The input is empty and not accepted as a valid URL
#[error("URL cannot be empty")]
EmptyUrl,
/// The given string can not be parsed into a valid URL, e-mail address, or file path
#[error("Cannot parse string `{1}` as website url: {0}")]
ParseUrl(#[source] url::ParseError, String),
@ -180,6 +183,7 @@ impl Hash for ErrorKind {
Self::InvalidGithubUrl(s) => s.hash(state),
Self::DirTraversal(e) => e.to_string().hash(state),
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::EmptyUrl => "Empty URL".hash(state),
Self::ParseUrl(e, s) => (e.to_string(), s).hash(state),
Self::InvalidURI(u) => u.hash(state),
Self::InvalidUrlFromPath(p) => p.hash(state),

View file

@ -49,7 +49,7 @@ impl TryFrom<&PathBuf> for InputContent {
fn try_from(path: &PathBuf) -> std::result::Result<Self, Self::Error> {
let input =
fs::read_to_string(&path).map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?;
fs::read_to_string(path).map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?;
Ok(Self {
source: InputSource::String(input.clone()),

View file

@ -1,7 +1,7 @@
use check_if_email_exists::{CheckEmailOutput, Reachable};
/// A crude way to extract error details from the mail output.
/// This was added because `CheckEmailOutput` doesn't impl `Display`
/// This was added because `CheckEmailOutput` doesn't impl `Display`.
pub(crate) fn error_from_output(o: &CheckEmailOutput) -> String {
if let Err(_e) = o.misc.as_ref() {
return "Error occurred connecting to this email server via SMTP".to_string();

View file

@ -1,6 +1,6 @@
use std::{convert::TryFrom, fmt::Display, net::IpAddr};
use fast_chemail::parse_email;
use email_address::EmailAddress;
use ip_network::Ipv6Network;
use serde::{Deserialize, Serialize};
use url::Url;
@ -191,16 +191,48 @@ impl TryFrom<String> for Uri {
impl TryFrom<&str> for Uri {
type Error = ErrorKind;
/// Create a new URI from a string
///
/// Note:
/// We do not handle relative URLs here, as we do not know the base URL.
/// Furthermore paths also cannot be resolved, as we do not know the file system.
///
/// # Errors
///
/// Returns an error if the string is not a valid URI
///
fn try_from(s: &str) -> Result<Self> {
let s = s.trim_start_matches("mailto:");
// Silently ignore mail parse errors as they are very common and expected for most URIs
if parse_email(s).is_err() {
match Url::parse(s) {
Ok(uri) => Ok(uri.into()),
Err(url_err) => Err(ErrorKind::ParseUrl(url_err, s.to_owned())),
// Empty strings are accepted when being parsed with `Url::parse`,
// but we don't want to accept them because there is no clear definition
// of "validity" in this case.
if s.is_empty() {
return Err(ErrorKind::EmptyUrl);
}
match Url::parse(s) {
Ok(uri) => Ok(uri.into()),
Err(err) => {
// This could be a relative URL or a mail address or something
// else entirely. Try the mail address check first, as it's the
// most common case. Note that we use a relatively weak check
// here because
// - `fast_chemail::parse_email` does not accept parameters
// (`foo@example?subject=bar`), which are common for website
// contact forms
// - `check_if_email_exists` does additional spam detection,
// which we only want to execute when checking the email
// addresses, but not when printing all links with `--dump`.
if EmailAddress::is_valid(s) {
// Use the `mailto:` scheme for mail addresses,
// which will allow `Url::parse` to parse them.
if let Ok(uri) = Url::parse(&format!("mailto:{s}")) {
return Ok(uri.into());
};
};
// We do not handle relative URLs here, as we do not know the base URL.
Err(ErrorKind::ParseUrl(err, s.to_owned()))
}
} else {
Ok(Url::parse(&format!("mailto:{s}")).unwrap().into())
}
}
}
@ -242,7 +274,7 @@ mod tests {
}
#[test]
fn test_uri_from_str() {
fn test_uri_from_url() {
assert!(Uri::try_from("").is_err());
assert_eq!(
Uri::try_from("https://example.com"),
@ -252,6 +284,10 @@ mod tests {
Uri::try_from("https://example.com/@test/testing"),
Ok(website("https://example.com/@test/testing"))
);
}
#[test]
fn test_uri_from_email_str() {
assert_eq!(
Uri::try_from("mail@example.com"),
Ok(mail("mail@example.com"))
@ -260,6 +296,10 @@ mod tests {
Uri::try_from("mailto:mail@example.com"),
Ok(mail("mail@example.com"))
);
assert_eq!(
Uri::try_from("mail@example.com?foo=bar"),
Ok(mail("mail@example.com?foo=bar"))
);
}
#[test]