mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-16 20:50:25 +00:00
Fix URLs with '@' parsing as emails (#177)
* Fix URLs with '@' parsing as emails Only consider a link an email if it fails to parse as URL. Also use a proper email validation instead of a simple '@' check. This uses the fast_chemail crate which parses email links according to the HTML specification (which is much more practical than checking for RFC 5322 formatted emails). It's also worth noting that fast_chemail is used internally (albeit indirectly) by the check_if_email_exists crate. This means that email addresses not considered valid by fast_chemail wouldn't pass link checks anyway. * Fix comment in test
This commit is contained in:
parent
b4de8e0983
commit
a45e781d47
4 changed files with 35 additions and 5 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -1381,6 +1381,7 @@ dependencies = [
|
|||
"deadpool",
|
||||
"derive_builder",
|
||||
"doc-comment",
|
||||
"fast_chemail",
|
||||
"futures",
|
||||
"glob",
|
||||
"headers",
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ hubcaps = { git="https://github.com/softprops/hubcaps.git" }
|
|||
linkify = "0.5.0"
|
||||
regex = "1.4.4"
|
||||
url = "2.2.1"
|
||||
# Switch back to version on crates.io after
|
||||
# Switch back to version on crates.io after
|
||||
# https://github.com/async-email/async-smtp/pull/36
|
||||
# is merged and a new version of check-if-email-exists is released
|
||||
check-if-email-exists = { git="https://github.com/reacherhq/check-if-email-exists.git" }
|
||||
|
|
@ -53,6 +53,7 @@ serde_json = "1.0.64"
|
|||
ring = "0.16.19"
|
||||
pad = "0.1.6"
|
||||
console = "0.14.0"
|
||||
fast_chemail = "0.9.6"
|
||||
|
||||
[dependencies.reqwest]
|
||||
features = ["gzip"]
|
||||
|
|
|
|||
|
|
@ -498,4 +498,27 @@ mod test {
|
|||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_urls_with_at_sign_properly() {
|
||||
// note that these used to parse as emails
|
||||
let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string();
|
||||
let links: HashSet<Uri> = extract_links(
|
||||
&InputContent::from_string(&input, FileType::Plaintext),
|
||||
None,
|
||||
)
|
||||
.into_iter()
|
||||
.map(|r| r.uri)
|
||||
.collect();
|
||||
|
||||
let expected_links = [
|
||||
website("https://example.com/@test/test"),
|
||||
website("http://otherdomain.com/test/@test"),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
13
src/uri.rs
13
src/uri.rs
|
|
@ -1,4 +1,5 @@
|
|||
use anyhow::{bail, Result};
|
||||
use fast_chemail::is_valid_email;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::net::IpAddr;
|
||||
use std::{convert::TryFrom, fmt::Display};
|
||||
|
|
@ -56,12 +57,12 @@ impl TryFrom<&str> for Uri {
|
|||
// Remove the `mailto` scheme if it exists
|
||||
// to avoid parsing it as a website URL.
|
||||
let s = s.trim_start_matches("mailto:");
|
||||
if s.contains('@') & !is_link_internal {
|
||||
return Ok(Uri::Mail(s.to_string()));
|
||||
}
|
||||
|
||||
if let Ok(uri) = Url::parse(s) {
|
||||
return Ok(Uri::Website(uri));
|
||||
};
|
||||
} else if !is_link_internal && is_valid_email(&s) {
|
||||
return Ok(Uri::Mail(s.to_string()));
|
||||
}
|
||||
bail!("Cannot convert to Uri")
|
||||
}
|
||||
}
|
||||
|
|
@ -86,6 +87,10 @@ mod test {
|
|||
Uri::try_from("http://example.org").unwrap(),
|
||||
website("http://example.org")
|
||||
);
|
||||
assert_eq!(
|
||||
Uri::try_from("http://example.org/@test/testing").unwrap(),
|
||||
website("http://example.org/@test/testing")
|
||||
);
|
||||
assert_eq!(
|
||||
Uri::try_from("mail@example.org").unwrap(),
|
||||
Uri::Mail("mail@example.org".to_string())
|
||||
|
|
|
|||
Loading…
Reference in a new issue