Fix URLs with '@' parsing as emails (#177)

* Fix URLs with '@' parsing as emails

Only consider a link an email if it fails to parse as URL.

Also use a proper email validation instead of a simple '@' check.

This uses the fast_chemail crate which parses email links according
to the HTML specification (which is much more practical than checking
for RFC 5322 formatted emails).  It's also worth noting that
fast_chemail is used internally (albeit indirectly) by the
check_if_email_exists crate.  This means that email addresses
not considered valid by fast_chemail wouldn't pass link checks
anyway.

* Fix comment in test
This commit is contained in:
Paweł Romanowski 2021-03-14 20:10:36 +01:00 committed by GitHub
parent b4de8e0983
commit a45e781d47
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 35 additions and 5 deletions

1
Cargo.lock generated
View file

@ -1381,6 +1381,7 @@ dependencies = [
"deadpool",
"derive_builder",
"doc-comment",
"fast_chemail",
"futures",
"glob",
"headers",

View file

@ -26,7 +26,7 @@ hubcaps = { git="https://github.com/softprops/hubcaps.git" }
linkify = "0.5.0"
regex = "1.4.4"
url = "2.2.1"
# Switch back to version on crates.io after
# Switch back to version on crates.io after
# https://github.com/async-email/async-smtp/pull/36
# is merged and a new version of check-if-email-exists is released
check-if-email-exists = { git="https://github.com/reacherhq/check-if-email-exists.git" }
@ -53,6 +53,7 @@ serde_json = "1.0.64"
ring = "0.16.19"
pad = "0.1.6"
console = "0.14.0"
fast_chemail = "0.9.6"
[dependencies.reqwest]
features = ["gzip"]

View file

@ -498,4 +498,27 @@ mod test {
assert_eq!(links, expected_links);
}
#[test]
fn test_extract_urls_with_at_sign_properly() {
// note that these used to parse as emails
let input = "https://example.com/@test/test http://otherdomain.com/test/@test".to_string();
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(&input, FileType::Plaintext),
None,
)
.into_iter()
.map(|r| r.uri)
.collect();
let expected_links = [
website("https://example.com/@test/test"),
website("http://otherdomain.com/test/@test"),
]
.iter()
.cloned()
.collect();
assert_eq!(links, expected_links);
}
}

View file

@ -1,4 +1,5 @@
use anyhow::{bail, Result};
use fast_chemail::is_valid_email;
use serde::{Deserialize, Serialize};
use std::net::IpAddr;
use std::{convert::TryFrom, fmt::Display};
@ -56,12 +57,12 @@ impl TryFrom<&str> for Uri {
// Remove the `mailto` scheme if it exists
// to avoid parsing it as a website URL.
let s = s.trim_start_matches("mailto:");
if s.contains('@') & !is_link_internal {
return Ok(Uri::Mail(s.to_string()));
}
if let Ok(uri) = Url::parse(s) {
return Ok(Uri::Website(uri));
};
} else if !is_link_internal && is_valid_email(&s) {
return Ok(Uri::Mail(s.to_string()));
}
bail!("Cannot convert to Uri")
}
}
@ -86,6 +87,10 @@ mod test {
Uri::try_from("http://example.org").unwrap(),
website("http://example.org")
);
assert_eq!(
Uri::try_from("http://example.org/@test/testing").unwrap(),
website("http://example.org/@test/testing")
);
assert_eq!(
Uri::try_from("mail@example.org").unwrap(),
Uri::Mail("mail@example.org".to_string())