lychee/src/extract.rs

use linkify::LinkFinder;
use log::debug;
use std::{collections::HashSet, fmt::Display};
use url::Url;

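/// A URI extracted from the input text: either a parseable web URL or a
/// plain mail address (anything `Url::parse` rejects is treated as mail
/// by `extract_links` below).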
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) enum Uri {
    Website(Url),
    Mail(String),
}

impl Uri {
    /// The URI as a string slice, without allocating.
    pub fn as_str(&self) -> &str {
        match self {
            Uri::Website(url) => url.as_str(),
            Uri::Mail(address) => address.as_str(),
        }
    }

    /// The URL scheme (e.g. `http` or `https`); mail addresses have none.
    pub fn scheme(&self) -> Option<String> {
        match self {
            Uri::Website(url) => Some(url.scheme().to_string()),
            Uri::Mail(_address) => None,
        }
    }
}

impl Display for Uri {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.as_str())
    }
}

// Use LinkFinder here to offload the actual link searching.
fn find_links(input: &str) -> Vec<linkify::Link> {
    let finder = LinkFinder::new();
    finder.links(input).collect()
}

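/// Extract all unique links from the given input text.
///
/// Candidates are found by `find_links`; those that parse as URLs become
/// `Uri::Website`, everything else (e.g. mail addresses) falls back to
/// `Uri::Mail`. Duplicates are removed by collecting into a `HashSet`.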
pub(crate) fn extract_links(input: &str) -> HashSet<Uri> {
    let links = find_links(input);

    // Keep everything that parses as a URL as a `Website`.
    // Anything else (e.g. mail addresses) falls back to `Mail` for now.
    let mut uris = HashSet::new();
    for link in links {
        match Url::parse(link.as_str()) {
            Ok(url) => uris.insert(Uri::Website(url)),
            Err(_) => uris.insert(Uri::Mail(link.as_str().to_owned())),
        };
    }
    debug!("Found: {:#?}", uris);
    uris
}

#[cfg(test)]
mod test {
    use super::*;
    use std::iter::FromIterator;

    #[test]
    fn test_extract_markdown_links() {
        let input = "This is [a test](https://endler.dev).";
        let links = extract_links(input);

        assert_eq!(
            links,
            HashSet::from_iter(
                [Uri::Website(Url::parse("https://endler.dev").unwrap())]
                    .iter()
                    .cloned()
            )
        )
    }

    #[test]
    fn test_skip_markdown_anchors() {
        let input = "This is [a test](#lol).";
        let links = extract_links(input);

        assert_eq!(links, HashSet::new())
    }

    #[test]
    fn test_skip_markdown_internal_urls() {
        let input = "This is [a test](./internal).";
        let links = extract_links(input);

        assert_eq!(links, HashSet::new())
    }

    #[test]
    fn test_non_markdown_links() {
        let input =
            "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
        let links = extract_links(input);
        let expected = HashSet::from_iter(
            [
                Uri::Website(Url::parse("https://endler.dev").unwrap()),
                Uri::Website(Url::parse("https://hello-rust.show/foo/bar?lol=1").unwrap()),
                Uri::Mail("test@example.com".to_string()),
            ]
            .iter()
            .cloned(),
        );
        assert_eq!(links, expected)
    }
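
    // Additional sanity check: `extract_links` collects into a `HashSet`,
    // so repeated links in the input should be deduplicated. This only
    // relies on behavior visible in the function above.
    #[test]
    fn test_duplicate_links_are_deduplicated() {
        let input = "https://endler.dev and https://endler.dev";
        let links = extract_links(input);
        let expected = HashSet::from_iter(
            [Uri::Website(Url::parse("https://endler.dev").unwrap())]
                .iter()
                .cloned(),
        );
        assert_eq!(links, expected)
    }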

    #[test]
    #[ignore]
    // TODO: Does this escaping need to work properly?
    // See https://github.com/tcort/markdown-link-check/issues/37
    fn test_md_escape() {
        let input = r#"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx"#;
        let links = find_links(input);
        let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx";
        assert!(links.len() == 1);
        assert_eq!(links[0].as_str(), expected);
    }
}