From f66aaecf0f97767a4d5cd8bcd0ab6e1e2d12c8ad Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 12 Apr 2021 14:40:39 +0200 Subject: [PATCH] Assume HTML in case there is no extension (e.g. for URLs) (#197) --- Cargo.lock | 39 +++++++++++++++++++++++- Cargo.toml | 1 + src/bin/lychee/main.rs | 2 ++ src/bin/lychee/stats.rs | 1 + src/client.rs | 1 + src/collector.rs | 66 +++++++++++++++++++++++++++++++---------- src/extract.rs | 26 ++++++++++++---- src/filter/mod.rs | 1 + src/quirks/mod.rs | 1 + src/uri.rs | 3 +- tests/cli.rs | 2 ++ tests/usage.rs | 2 ++ 12 files changed, 122 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2d2f0c8..21b941f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,6 +24,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anyhow" version = "1.0.40" @@ -416,7 +425,7 @@ version = "2.33.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" dependencies = [ - "ansi_term", + "ansi_term 0.11.0", "atty", "bitflags", "strsim 0.8.0", @@ -609,6 +618,12 @@ dependencies = [ "syn", ] +[[package]] +name = "diff" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499" + [[package]] name = "difference" version = "2.0.0" @@ -1399,6 +1414,7 @@ dependencies = [ "openssl-sys", "pad", "predicates", + "pretty_assertions", "pulldown-cmark", "regex", "reqwest", @@ -1676,6 +1692,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "output_vt100" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53cdc5b785b7a58c5aad8216b3dfa114df64b0b06ae6e1501cef91df2fbdf8f9" +dependencies = [ + "winapi", +] + [[package]] name = "pad" version = "0.1.6" @@ -1863,6 +1888,18 @@ dependencies = [ "treeline", ] +[[package]] +name = "pretty_assertions" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f297542c27a7df8d45de2b0e620308ab883ad232d06c14b76ac3e144bda50184" +dependencies = [ + "ansi_term 0.12.1", + "ctor", + "diff", + "output_vt100", +] + [[package]] name = "proc-macro-error" version = "1.0.4" diff --git a/Cargo.toml b/Cargo.toml index 394c0b8..bc3e22a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ predicates = "1.0.7" uuid = { version = "0.8.2", features = ["v4"] } tempfile = "3.2.0" doc-comment = "0.3.3" +pretty_assertions = "0.7.1" [features] vendored-openssl = ["openssl-sys/vendored"] diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 86823a0..443dede 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -240,6 +240,8 @@ fn parse_basic_auth(auth: &str) -> Result> { #[cfg(test)] mod test { use super::*; + use pretty_assertions::assert_eq; + use http::StatusCode; use reqwest::header; diff --git a/src/bin/lychee/stats.rs b/src/bin/lychee/stats.rs index 9cf73b1..2e04804 100644 --- a/src/bin/lychee/stats.rs +++ b/src/bin/lychee/stats.rs @@ -125,6 +125,7 @@ mod test_super { use lychee::{test_utils::website, Status}; use super::*; + use pretty_assertions::assert_eq; #[test] fn test_stats_is_empty() { diff --git a/src/client.rs b/src/client.rs index 8f71b4b..b9f0220 100644 --- a/src/client.rs +++ b/src/client.rs @@ -298,6 +298,7 @@ pub async fn check>(request: T) -> Result { #[cfg(test)] mod test { use super::*; + use pretty_assertions::assert_eq; use std::time::{Duration, Instant}; use wiremock::matchers::method; use wiremock::{Mock, MockServer, ResponseTemplate}; diff --git a/src/collector.rs b/src/collector.rs index 0889179..b18be13 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -134,11 +134,18 @@ impl Input { } async fn url_contents(url: &Url) -> Result { + // Assume HTML for default paths + let file_type = if url.path().is_empty() || url.path() == "/" { + FileType::Html + } else { + FileType::from(url.as_str()) + }; + let res = reqwest::get(url.clone()).await?; let content = res.text().await?; let input_content = InputContent { input: Input::RemoteUrl(url.clone()), - file_type: FileType::from(url.as_str()), + file_type, content, }; @@ -251,6 +258,8 @@ pub async fn collect_links( #[cfg(test)] mod test { use super::*; + use pretty_assertions::assert_eq; + use crate::{ test_utils::{get_mock_server_with_content, website}, Uri, @@ -259,14 +268,39 @@ mod test { use std::io::Write; use std::str::FromStr; - const TEST_STRING: &str = "http://test-string.com"; - const TEST_URL: &str = "https://test-url.org"; - const TEST_FILE: &str = "https://test-file.io"; - const TEST_GLOB_1: &str = "https://test-glob-1.io"; - const TEST_GLOB_2_MAIL: &str = "test@glob-2.io"; + #[tokio::test] + async fn test_file_without_extension_is_plaintext() -> Result<()> { + let dir = tempfile::tempdir()?; + // Treat as plaintext file (no extension) + let file_path = dir.path().join("README"); + let _file = File::create(&file_path)?; + let input = Input::new(&file_path.as_path().display().to_string(), true); + let contents = input.get_contents(None, true).await?; + + assert_eq!(contents.len(), 1); + assert_eq!(contents[0].file_type, FileType::Plaintext); + Ok(()) + } + + #[tokio::test] + async fn test_url_without_extension_is_html() -> Result<()> { + let input = Input::new("https://example.org/", true); + let contents = input.get_contents(None, true).await?; + + println!("{:?}", contents); + assert_eq!(contents.len(), 1); + assert_eq!(contents[0].file_type, FileType::Html); + Ok(()) + } #[tokio::test] async fn test_collect_links() -> Result<()> { + const TEST_STRING: &str = "http://test-string.com"; + const TEST_URL: &str = "https://test-url.org"; + const TEST_FILE: &str = "https://test-file.io"; + const TEST_GLOB_1: &str = "https://test-glob-1.io"; + const TEST_GLOB_2_MAIL: &str = "test@glob-2.io"; + let dir = tempfile::tempdir()?; let file_path = dir.path().join("f"); let file_glob_1_path = dir.path().join("glob-1"); @@ -293,18 +327,18 @@ mod test { ]; let responses = collect_links(&inputs, None, false, 8).await?; - let links = responses - .into_iter() - .map(|r| r.uri) - .collect::>(); + let mut links = responses.into_iter().map(|r| r.uri).collect::>(); - let mut expected_links: HashSet = HashSet::new(); - expected_links.insert(website(TEST_STRING)); - expected_links.insert(website(TEST_URL)); - expected_links.insert(website(TEST_FILE)); - expected_links.insert(website(TEST_GLOB_1)); - expected_links.insert(Uri::Mail(TEST_GLOB_2_MAIL.to_string())); + let mut expected_links: Vec = vec![ + website(TEST_STRING), + website(TEST_URL), + website(TEST_FILE), + website(TEST_GLOB_1), + Uri::Mail(TEST_GLOB_2_MAIL.to_string()), + ]; + links.sort(); + expected_links.sort(); assert_eq!(links, expected_links); Ok(()) diff --git a/src/extract.rs b/src/extract.rs index 19969fd..fc6ab6e 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -77,15 +77,13 @@ fn extract_links_from_html(input: &str) -> Vec { fn walk_html_links(mut urls: &mut Vec, node: &Handle) { match node.data { NodeData::Text { ref contents } => { - // escape_default turns tab characters into "\t", newlines into "\n", etc. - let esc_contents = contents.borrow().escape_default().to_string(); - for link in extract_links_from_plaintext(&esc_contents) { + for link in extract_links_from_plaintext(&contents.borrow()) { urls.push(link); } } NodeData::Comment { ref contents } => { - for link in extract_links_from_plaintext(&contents.escape_default().to_string()) { + for link in extract_links_from_plaintext(contents) { urls.push(link); } } @@ -96,7 +94,7 @@ fn walk_html_links(mut urls: &mut Vec, node: &Handle) { .. } => { for attr in attrs.borrow().iter() { - let attr_value = attr.value.escape_default().to_string(); + let attr_value = attr.value.to_string(); if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) { urls.push(attr_value); @@ -181,6 +179,7 @@ mod test { use crate::test_utils::website; use super::*; + use pretty_assertions::assert_eq; use std::fs::File; use std::io::{BufReader, Read}; @@ -204,6 +203,8 @@ mod test { #[test] fn test_file_type() { + // Assume Plaintext in case there is no extension + assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext); assert_eq!(FileType::from(Path::new("test.md")), FileType::Markdown); assert_eq!( FileType::from(Path::new("test.markdown")), @@ -221,6 +222,21 @@ mod test { ); } + #[test] + fn test_extract_link_at_end_of_line() { + let link = "http://www.apache.org/licenses/LICENSE-2.0"; + let input = format!("{}\n", link); + + let found = extract_links_from_markdown(&input); + assert_eq!(vec![link], found); + + let found = extract_links_from_plaintext(&input); + assert_eq!(vec![link], found); + + let found = extract_links_from_html(&input); + assert_eq!(vec![link], found); + } + #[test] fn test_extract_markdown_links() { let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)"; diff --git a/src/filter/mod.rs b/src/filter/mod.rs index ca43554..66080a8 100644 --- a/src/filter/mod.rs +++ b/src/filter/mod.rs @@ -102,6 +102,7 @@ mod test { use reqwest::Url; use super::*; + use pretty_assertions::assert_eq; use crate::{test_utils::website, Input}; diff --git a/src/quirks/mod.rs b/src/quirks/mod.rs index ad3e4aa..d19a773 100644 --- a/src/quirks/mod.rs +++ b/src/quirks/mod.rs @@ -76,6 +76,7 @@ impl Quirks { #[cfg(test)] mod tests { use super::*; + use pretty_assertions::assert_eq; #[test] fn test_twitter_request() { diff --git a/src/uri.rs b/src/uri.rs index 7fa34ea..ae576b1 100644 --- a/src/uri.rs +++ b/src/uri.rs @@ -6,7 +6,7 @@ use std::{convert::TryFrom, fmt::Display}; use url::Url; /// Lychee's own representation of a URI, which encapsulates all support formats -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, PartialOrd, Ord, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum Uri { /// Website URL Website(Url), @@ -78,6 +78,7 @@ mod test { use crate::test_utils::website; use super::*; + use pretty_assertions::assert_eq; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; #[test] diff --git a/tests/cli.rs b/tests/cli.rs index c9c0bf3..c4ec718 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1,5 +1,7 @@ #[cfg(test)] mod cli { + use pretty_assertions::assert_eq; + use anyhow::Result; use assert_cmd::Command; use lychee::test_utils; diff --git a/tests/usage.rs b/tests/usage.rs index 8080b93..f1eb32b 100644 --- a/tests/usage.rs +++ b/tests/usage.rs @@ -1,5 +1,7 @@ #[cfg(test)] mod readme { + use pretty_assertions::assert_eq; + use assert_cmd::Command; use std::fs::File; use std::io::{BufReader, Read};