mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-16 09:43:12 +00:00
Fix parsing error of email addresses with query params (#809)
Email addresses with query parameters often get used in contact forms on websites. They can also be found in other documents like Markdown. A common use-case is to add a subject line to the email as a parameter e.g. `mailto:mail@example.com?subject="Hello"`. Previously we handled such cases incorrectly by recognizing them as files. The reason was that our email parsing was too strict to allow for that use-case. With `email_address` we switched to a more permissive parser. Note that this does not affect the actual address email checking, as this is still done `check-if-email-exists`, which has more strict check functionality.
This commit is contained in:
parent
264af23822
commit
d61105edbb
14 changed files with 132 additions and 31 deletions
11
Cargo.lock
generated
11
Cargo.lock
generated
|
|
@ -981,6 +981,15 @@ version = "1.6.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
|
||||
|
||||
[[package]]
|
||||
name = "email_address"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2153bd83ebc09db15bcbdc3e2194d901804952e3dc96967e1cd3b0c5c32d112"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encode_unicode"
|
||||
version = "0.3.6"
|
||||
|
|
@ -1800,7 +1809,7 @@ dependencies = [
|
|||
"cached",
|
||||
"check-if-email-exists",
|
||||
"doc-comment",
|
||||
"fast_chemail",
|
||||
"email_address",
|
||||
"futures",
|
||||
"glob",
|
||||
"html5ever",
|
||||
|
|
|
|||
|
|
@ -55,21 +55,21 @@ Or, you can accept all content/MIME types: `--headers "accept=*/*"`.
|
|||
See more info about the Accept header
|
||||
[over at MDN](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept).
|
||||
|
||||
|
||||
## Unreachable Mail Address
|
||||
|
||||
We use https://github.com/reacherhq/check-if-email-exists for email checking.
|
||||
You can test your mail address with curl:
|
||||
```
|
||||
|
||||
```bash
|
||||
curl -X POST \
|
||||
'https://api.reacher.email/v0/check_email' \
|
||||
-H 'content-type: application/json' \
|
||||
-H 'authorization: test_api_token' \
|
||||
-d '{"to_email": "box@domain.test"}'
|
||||
```
|
||||
Some settings on your mail server (such as SPF Policy, DNSBL) may prevent your email from being verified.
|
||||
If you have an error with checking a working email, you can disable this check using the
|
||||
[commandline parameter](https://github.com/lycheeverse/lychee#commandline-parameters) `--exclude-mail`.
|
||||
|
||||
|
||||
|
||||
Some settings on your mail server (such as `SPF` Policy, `DNSBL`) may prevent
|
||||
your email from being verified. If you have an error with checking a working
|
||||
email, you can disable this check using the [commandline
|
||||
parameter](https://github.com/lycheeverse/lychee#commandline-parameters)
|
||||
`--exclude-mail`.
|
||||
|
|
|
|||
|
|
@ -9,9 +9,9 @@ use std::{collections::HashSet, time::Duration};
|
|||
#[allow(clippy::trivial_regex)]
|
||||
async fn main() -> Result<()> {
|
||||
// Excludes
|
||||
let excludes = Some(RegexSet::new(&[r"example"]).unwrap());
|
||||
let excludes = Some(RegexSet::new([r"example"]).unwrap());
|
||||
// Includes take precedence over excludes
|
||||
let includes = Some(RegexSet::new(&[r"example.com"]).unwrap());
|
||||
let includes = Some(RegexSet::new([r"example.com"]).unwrap());
|
||||
|
||||
// Set custom request headers
|
||||
let mut headers = HeaderMap::new();
|
||||
|
|
|
|||
14
fixtures/TEST_EMAIL_QUERY_PARAMS.html
vendored
Normal file
14
fixtures/TEST_EMAIL_QUERY_PARAMS.html
vendored
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
<html lang="en">
|
||||
<head>
|
||||
<title>Lychee Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
Please email
|
||||
<a href="mailto:hello@example.org?subject=%5BHello%5D"
|
||||
>hello@example.org</a
|
||||
>
|
||||
for any questions.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
1
fixtures/TEST_EMAIL_QUERY_PARAMS.md
vendored
Normal file
1
fixtures/TEST_EMAIL_QUERY_PARAMS.md
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
Please email [hello@example.org](mailto:hello@example.org?subject=%5BHello%5D) for any questions.
|
||||
|
|
@ -127,6 +127,34 @@ mod cli {
|
|||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_email_html_with_subject() -> Result<()> {
|
||||
let mut cmd = main_command();
|
||||
let input = fixtures_path().join("TEST_EMAIL_QUERY_PARAMS.html");
|
||||
|
||||
cmd.arg("--dump")
|
||||
.arg(input)
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("hello@example.org?subject=%5BHello%5D"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_email_markdown_with_subject() -> Result<()> {
|
||||
let mut cmd = main_command();
|
||||
let input = fixtures_path().join("TEST_EMAIL_QUERY_PARAMS.md");
|
||||
|
||||
cmd.arg("--dump")
|
||||
.arg(input)
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("hello@example.org?subject=%5BHello%5D"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test that a GitHub link can be checked without specifying the token.
|
||||
#[test]
|
||||
fn test_check_github_no_token() -> Result<()> {
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ version = "0.10.1"
|
|||
|
||||
[dependencies]
|
||||
check-if-email-exists = "0.9.0"
|
||||
fast_chemail = "0.9.6"
|
||||
email_address = "0.2.4"
|
||||
glob = "0.3.0"
|
||||
http = "0.2.8"
|
||||
linkify = "0.9.0"
|
||||
|
|
|
|||
|
|
@ -549,8 +549,13 @@ impl Client {
|
|||
}
|
||||
|
||||
/// Check a mail address, or equivalently a `mailto` URI.
|
||||
///
|
||||
/// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
|
||||
/// which are ignored by this check. The are not part of the mail address
|
||||
/// and instead passed to a mail client.
|
||||
pub async fn check_mail(&self, uri: &Uri) -> Status {
|
||||
let input = CheckEmailInput::new(uri.as_str().to_owned());
|
||||
let address = uri.url.path().to_string();
|
||||
let input = CheckEmailInput::new(address);
|
||||
let result = &(check_email(&input).await);
|
||||
|
||||
if let Reachable::Invalid = result.is_reachable {
|
||||
|
|
|
|||
|
|
@ -304,7 +304,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_overwrite_false_positives() {
|
||||
let includes = Includes {
|
||||
regex: RegexSet::new(&[r"http://www.w3.org/1999/xhtml"]).unwrap(),
|
||||
regex: RegexSet::new([r"http://www.w3.org/1999/xhtml"]).unwrap(),
|
||||
};
|
||||
let filter = Filter {
|
||||
includes: Some(includes),
|
||||
|
|
@ -316,7 +316,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_include_regex() {
|
||||
let includes = Includes {
|
||||
regex: RegexSet::new(&[r"foo.example.com"]).unwrap(),
|
||||
regex: RegexSet::new([r"foo.example.com"]).unwrap(),
|
||||
};
|
||||
let filter = Filter {
|
||||
includes: Some(includes),
|
||||
|
|
@ -344,7 +344,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_exclude_regex() {
|
||||
let excludes = Excludes {
|
||||
regex: RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap(),
|
||||
regex: RegexSet::new([r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap(),
|
||||
};
|
||||
let filter = Filter {
|
||||
excludes: Some(excludes),
|
||||
|
|
@ -361,10 +361,10 @@ mod tests {
|
|||
#[test]
|
||||
fn test_exclude_include_regex() {
|
||||
let includes = Includes {
|
||||
regex: RegexSet::new(&[r"foo.example.com"]).unwrap(),
|
||||
regex: RegexSet::new([r"foo.example.com"]).unwrap(),
|
||||
};
|
||||
let excludes = Excludes {
|
||||
regex: RegexSet::new(&[r"example.com"]).unwrap(),
|
||||
regex: RegexSet::new([r"example.com"]).unwrap(),
|
||||
};
|
||||
let filter = Filter {
|
||||
includes: Some(includes),
|
||||
|
|
|
|||
|
|
@ -97,8 +97,8 @@ fn join(base: PathBuf, dst: &Path) -> PathBuf {
|
|||
//
|
||||
// Unfortunately requires real files for `fs::canonicalize`.
|
||||
pub(crate) fn contains(parent: &PathBuf, child: &PathBuf) -> Result<bool> {
|
||||
let parent = fs::canonicalize(&parent)?;
|
||||
let child = fs::canonicalize(&child)?;
|
||||
let parent = fs::canonicalize(parent)?;
|
||||
let child = fs::canonicalize(child)?;
|
||||
|
||||
Ok(child.starts_with(parent))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ pub enum ErrorKind {
|
|||
/// Invalid Github URL
|
||||
#[error("Github URL is invalid: {0}")]
|
||||
InvalidGithubUrl(String),
|
||||
/// The input is empty and not accepted as a valid URL
|
||||
#[error("URL cannot be empty")]
|
||||
EmptyUrl,
|
||||
/// The given string can not be parsed into a valid URL, e-mail address, or file path
|
||||
#[error("Cannot parse string `{1}` as website url: {0}")]
|
||||
ParseUrl(#[source] url::ParseError, String),
|
||||
|
|
@ -180,6 +183,7 @@ impl Hash for ErrorKind {
|
|||
Self::InvalidGithubUrl(s) => s.hash(state),
|
||||
Self::DirTraversal(e) => e.to_string().hash(state),
|
||||
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
|
||||
Self::EmptyUrl => "Empty URL".hash(state),
|
||||
Self::ParseUrl(e, s) => (e.to_string(), s).hash(state),
|
||||
Self::InvalidURI(u) => u.hash(state),
|
||||
Self::InvalidUrlFromPath(p) => p.hash(state),
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ impl TryFrom<&PathBuf> for InputContent {
|
|||
|
||||
fn try_from(path: &PathBuf) -> std::result::Result<Self, Self::Error> {
|
||||
let input =
|
||||
fs::read_to_string(&path).map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?;
|
||||
fs::read_to_string(path).map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?;
|
||||
|
||||
Ok(Self {
|
||||
source: InputSource::String(input.clone()),
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
use check_if_email_exists::{CheckEmailOutput, Reachable};
|
||||
|
||||
/// A crude way to extract error details from the mail output.
|
||||
/// This was added because `CheckEmailOutput` doesn't impl `Display`
|
||||
/// This was added because `CheckEmailOutput` doesn't impl `Display`.
|
||||
pub(crate) fn error_from_output(o: &CheckEmailOutput) -> String {
|
||||
if let Err(_e) = o.misc.as_ref() {
|
||||
return "Error occurred connecting to this email server via SMTP".to_string();
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use std::{convert::TryFrom, fmt::Display, net::IpAddr};
|
||||
|
||||
use fast_chemail::parse_email;
|
||||
use email_address::EmailAddress;
|
||||
use ip_network::Ipv6Network;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use url::Url;
|
||||
|
|
@ -191,16 +191,48 @@ impl TryFrom<String> for Uri {
|
|||
impl TryFrom<&str> for Uri {
|
||||
type Error = ErrorKind;
|
||||
|
||||
/// Create a new URI from a string
|
||||
///
|
||||
/// Note:
|
||||
/// We do not handle relative URLs here, as we do not know the base URL.
|
||||
/// Furthermore paths also cannot be resolved, as we do not know the file system.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if the string is not a valid URI
|
||||
///
|
||||
fn try_from(s: &str) -> Result<Self> {
|
||||
let s = s.trim_start_matches("mailto:");
|
||||
// Silently ignore mail parse errors as they are very common and expected for most URIs
|
||||
if parse_email(s).is_err() {
|
||||
match Url::parse(s) {
|
||||
Ok(uri) => Ok(uri.into()),
|
||||
Err(url_err) => Err(ErrorKind::ParseUrl(url_err, s.to_owned())),
|
||||
// Empty strings are accepted when being parsed with `Url::parse`,
|
||||
// but we don't want to accept them because there is no clear definition
|
||||
// of "validity" in this case.
|
||||
if s.is_empty() {
|
||||
return Err(ErrorKind::EmptyUrl);
|
||||
}
|
||||
|
||||
match Url::parse(s) {
|
||||
Ok(uri) => Ok(uri.into()),
|
||||
Err(err) => {
|
||||
// This could be a relative URL or a mail address or something
|
||||
// else entirely. Try the mail address check first, as it's the
|
||||
// most common case. Note that we use a relatively weak check
|
||||
// here because
|
||||
// - `fast_chemail::parse_email` does not accept parameters
|
||||
// (`foo@example?subject=bar`), which are common for website
|
||||
// contact forms
|
||||
// - `check_if_email_exists` does additional spam detection,
|
||||
// which we only want to execute when checking the email
|
||||
// addresses, but not when printing all links with `--dump`.
|
||||
if EmailAddress::is_valid(s) {
|
||||
// Use the `mailto:` scheme for mail addresses,
|
||||
// which will allow `Url::parse` to parse them.
|
||||
if let Ok(uri) = Url::parse(&format!("mailto:{s}")) {
|
||||
return Ok(uri.into());
|
||||
};
|
||||
};
|
||||
|
||||
// We do not handle relative URLs here, as we do not know the base URL.
|
||||
Err(ErrorKind::ParseUrl(err, s.to_owned()))
|
||||
}
|
||||
} else {
|
||||
Ok(Url::parse(&format!("mailto:{s}")).unwrap().into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -242,7 +274,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_uri_from_str() {
|
||||
fn test_uri_from_url() {
|
||||
assert!(Uri::try_from("").is_err());
|
||||
assert_eq!(
|
||||
Uri::try_from("https://example.com"),
|
||||
|
|
@ -252,6 +284,10 @@ mod tests {
|
|||
Uri::try_from("https://example.com/@test/testing"),
|
||||
Ok(website("https://example.com/@test/testing"))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_uri_from_email_str() {
|
||||
assert_eq!(
|
||||
Uri::try_from("mail@example.com"),
|
||||
Ok(mail("mail@example.com"))
|
||||
|
|
@ -260,6 +296,10 @@ mod tests {
|
|||
Uri::try_from("mailto:mail@example.com"),
|
||||
Ok(mail("mail@example.com"))
|
||||
);
|
||||
assert_eq!(
|
||||
Uri::try_from("mail@example.com?foo=bar"),
|
||||
Ok(mail("mail@example.com?foo=bar"))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
Loading…
Reference in a new issue