diff --git a/lychee-bin/src/commands/dump.rs b/lychee-bin/src/commands/dump.rs index 33343a3..ffffab4 100644 --- a/lychee-bin/src/commands/dump.rs +++ b/lychee-bin/src/commands/dump.rs @@ -1,6 +1,7 @@ use log::error; use lychee_lib::Request; use lychee_lib::Result; +use lychee_lib::filter::PathExcludes; use std::fs; use std::io::{self, Write}; use std::path::PathBuf; @@ -75,7 +76,7 @@ where pub(crate) async fn dump_inputs( sources: S, output: Option<&PathBuf>, - excluded_paths: &[PathBuf], + excluded_paths: &PathExcludes, ) -> Result where S: futures::Stream>, @@ -90,9 +91,8 @@ where while let Some(source) = sources.next().await { let source = source?; - let excluded = excluded_paths - .iter() - .any(|path| source.starts_with(path.to_string_lossy().as_ref())); + // TODO: is this working? check for test coverage, create test if necessary + let excluded = excluded_paths.is_match(&source); if excluded { continue; } @@ -141,6 +141,7 @@ fn write_out(writer: &mut Box, out_str: &str) -> io::Result<()> { mod tests { use super::*; use futures::stream; + use regex::RegexSet; use tempfile::NamedTempFile; #[tokio::test] @@ -158,7 +159,7 @@ mod tests { let stream = stream::iter(inputs); // Run dump_inputs - let result = dump_inputs(stream, Some(&output_path), &[]).await?; + let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await?; assert_eq!(result, ExitCode::Success); // Verify output @@ -179,8 +180,10 @@ mod tests { ]; let stream = stream::iter(inputs); - let excluded = vec![PathBuf::from("excluded")]; - let result = dump_inputs(stream, Some(&output_path), &excluded).await?; + let excluded = &PathExcludes { + regex: RegexSet::new(["excluded"]).unwrap(), + }; + let result = dump_inputs(stream, Some(&output_path), excluded).await?; assert_eq!(result, ExitCode::Success); let contents = fs::read_to_string(&output_path)?; @@ -194,7 +197,7 @@ mod tests { let output_path = temp_file.path().to_path_buf(); let stream = stream::iter::>>(vec![]); - let result 
= dump_inputs(stream, Some(&output_path), &[]).await?; + let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await?; assert_eq!(result, ExitCode::Success); let contents = fs::read_to_string(&output_path)?; @@ -214,7 +217,7 @@ mod tests { ]; let stream = stream::iter(inputs); - let result = dump_inputs(stream, Some(&output_path), &[]).await; + let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await; assert!(result.is_err()); Ok(()) } @@ -225,7 +228,7 @@ mod tests { let inputs = vec![Ok(String::from("test/path1"))]; let stream = stream::iter(inputs); - let result = dump_inputs(stream, None, &[]).await?; + let result = dump_inputs(stream, None, &PathExcludes::empty()).await?; assert_eq!(result, ExitCode::Success); Ok(()) } diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index fb9ef5a..33197d3 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -70,10 +70,12 @@ use formatters::{get_stats_formatter, log::init_logging}; use http::HeaderMap; use log::{error, info, warn}; +use lychee_lib::filter::PathExcludes; #[cfg(feature = "native-tls")] use openssl_sys as _; // required for vendored-openssl feature use options::{HeaderMapExt, LYCHEE_CONFIG_FILE}; +use regex::RegexSet; use ring as _; // required for apple silicon use lychee_lib::BasicAuthExtractor; @@ -329,7 +331,9 @@ async fn run(opts: &LycheeOptions) -> Result { let exit_code = commands::dump_inputs( sources, opts.config.output.as_ref(), - &opts.config.exclude_path, + &PathExcludes { + regex: RegexSet::new(&opts.config.exclude_path)?, + }, ) .await?; diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index e53639f..64fd772 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -13,6 +13,7 @@ use lychee_lib::{ DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions, FileType, Input, StatusCodeExcluder, StatusCodeSelector, archive::Archive, }; +use regex::RegexSet; use 
reqwest::tls; use secrecy::{ExposeSecret, SecretString}; use serde::{Deserialize, Deserializer}; @@ -338,7 +339,7 @@ impl LycheeOptions { s, None, self.config.glob_ignore_case, - self.config.exclude_path.clone(), + RegexSet::new(&self.config.exclude_path)?.into(), ) }) .collect::>() @@ -503,7 +504,8 @@ and 501." #[serde(default)] pub(crate) include: Vec, - /// Exclude URLs and mail addresses from checking (supports regex) + /// Exclude URLs and mail addresses from checking. + /// The value is treated as regular expression. #[arg(long)] #[serde(default)] pub(crate) exclude: Vec, @@ -513,10 +515,11 @@ and 501." #[serde(default)] pub(crate) exclude_file: Vec, - /// Exclude file path from getting checked. + /// Exclude paths from getting checked. + /// The value is treated as regular expression. #[arg(long)] #[serde(default)] - pub(crate) exclude_path: Vec, + pub(crate) exclude_path: Vec, /// Exclude all private IPs from checking. /// Equivalent to `--exclude-private --exclude-link-local --exclude-loopback` @@ -751,7 +754,7 @@ impl Config { exclude_file: Vec::::new(); // deprecated exclude_link_local: false; exclude_loopback: false; - exclude_path: Vec::::new(); + exclude_path: Vec::::new(); exclude_private: false; exclude: Vec::::new(); extensions: FileType::default_extensions(); diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index c3db7ff..f380f80 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -231,7 +231,9 @@ mod tests { use super::*; use crate::{ - Result, Uri, mock_server, + Result, Uri, + filter::PathExcludes, + mock_server, test_utils::{load_fixture, mail, path, website}, types::{FileType, Input, InputSource}, }; @@ -278,7 +280,7 @@ mod tests { &file_path.as_path().display().to_string(), None, true, - vec![], + PathExcludes::empty(), )?; let contents: Vec<_> = input .get_contents( @@ -298,7 +300,7 @@ mod tests { #[tokio::test] async fn test_url_without_extension_is_html() -> Result<()> { - let input = 
Input::new("https://example.com/", None, true, vec![])?; + let input = Input::new("https://example.com/", None, true, PathExcludes::empty())?; let contents: Vec<_> = input .get_contents( true, @@ -372,7 +374,7 @@ mod tests { let input = Input { source: InputSource::String("This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)".to_string()), file_type_hint: Some(FileType::Markdown), - excluded_paths: vec![], + excluded_paths: PathExcludes::empty(), }; let links = collect(vec![input], None, Some(base)).await.ok().unwrap(); @@ -398,7 +400,7 @@ mod tests { .to_string(), ), file_type_hint: Some(FileType::Html), - excluded_paths: vec![], + excluded_paths: PathExcludes::empty(), }; let links = collect(vec![input], None, Some(base)).await.ok().unwrap(); @@ -427,7 +429,7 @@ mod tests { .to_string(), ), file_type_hint: Some(FileType::Html), - excluded_paths: vec![], + excluded_paths: PathExcludes::empty(), }; let links = collect(vec![input], None, Some(base)).await.ok().unwrap(); @@ -453,7 +455,7 @@ mod tests { .to_string(), ), file_type_hint: Some(FileType::Markdown), - excluded_paths: vec![], + excluded_paths: PathExcludes::empty(), }; let links = collect(vec![input], None, Some(base)).await.ok().unwrap(); @@ -476,7 +478,7 @@ mod tests { let input = Input { source: InputSource::String(input), file_type_hint: Some(FileType::Html), - excluded_paths: vec![], + excluded_paths: PathExcludes::empty(), }; let links = collect(vec![input], None, Some(base)).await.ok().unwrap(); @@ -551,7 +553,7 @@ mod tests { .unwrap(), )), file_type_hint: Some(FileType::Html), - excluded_paths: vec![], + excluded_paths: PathExcludes::empty(), }, Input { source: InputSource::RemoteUrl(Box::new( @@ -562,7 +564,7 @@ mod tests { .unwrap(), )), file_type_hint: Some(FileType::Html), - excluded_paths: vec![], + excluded_paths: PathExcludes::empty(), }, ]; @@ -597,7 +599,7 @@ mod tests { .into(), ), file_type_hint: Some(FileType::Html), - excluded_paths: 
vec![], + excluded_paths: PathExcludes::empty(), }; let links = collect(vec![input], None, Some(base)).await.ok().unwrap(); diff --git a/lychee-lib/src/filter/excludes.rs b/lychee-lib/src/filter/excludes.rs deleted file mode 100644 index 418f292..0000000 --- a/lychee-lib/src/filter/excludes.rs +++ /dev/null @@ -1,26 +0,0 @@ -use regex::RegexSet; - -/// Exclude configuration for the link checker. -/// You can ignore links based on regex patterns. -#[derive(Clone, Debug)] -pub struct Excludes { - /// User-defined set of excluded regex patterns - pub(crate) regex: RegexSet, -} - -impl Excludes { - #[inline] - #[must_use] - /// Returns `true` if the given input string matches the regex set - /// and should hence be excluded from checking - pub fn is_match(&self, input: &str) -> bool { - self.regex.is_match(input) - } - - #[inline] - #[must_use] - /// Whether there were no regular expressions defined to be excluded - pub fn is_empty(&self) -> bool { - self.regex.is_empty() - } -} diff --git a/lychee-lib/src/filter/includes.rs b/lychee-lib/src/filter/includes.rs deleted file mode 100644 index ef274b5..0000000 --- a/lychee-lib/src/filter/includes.rs +++ /dev/null @@ -1,26 +0,0 @@ -use regex::RegexSet; - -/// Include configuration for the link checker. 
-/// You can include links based on regex patterns -#[derive(Clone, Debug)] -pub struct Includes { - /// User-defined set of included regex patterns - pub regex: RegexSet, -} - -impl Includes { - #[inline] - #[must_use] - /// Returns `true` if the given input string matches the regex set - /// and should hence be included and checked - pub fn is_match(&self, input: &str) -> bool { - self.regex.is_match(input) - } - - #[inline] - #[must_use] - /// Whether there were no regular expressions defined for inclusion - pub fn is_empty(&self) -> bool { - self.regex.is_empty() - } -} diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index e30395b..39bb3a0 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -1,12 +1,19 @@ -mod excludes; -mod includes; +mod regex_filter; use regex::RegexSet; use std::collections::HashSet; use std::sync::LazyLock; -pub use excludes::Excludes; -pub use includes::Includes; +/// Include configuration for the link checker. +/// You can include links based on regex patterns. +pub type Includes = regex_filter::RegexFilter; + +/// Exclude configuration for the link checker. +/// You can ignore links based on regex patterns. +pub type Excludes = regex_filter::RegexFilter; + +/// You can exclude paths and files based on regex patterns. +pub type PathExcludes = regex_filter::RegexFilter; use crate::Uri; diff --git a/lychee-lib/src/filter/regex_filter.rs b/lychee-lib/src/filter/regex_filter.rs new file mode 100644 index 0000000..700cff7 --- /dev/null +++ b/lychee-lib/src/filter/regex_filter.rs @@ -0,0 +1,50 @@ +use regex::RegexSet; + +/// Filter configuration for the link checker.
+/// You can include and exclude links and paths based on regex patterns. +#[derive(Clone, Debug)] +pub struct RegexFilter { + /// User-defined set of regex patterns + pub regex: RegexSet, +} + +impl RegexFilter { + #[inline] + #[must_use] + /// Returns `true` if the given input string matches the regex set + /// and should hence be included or excluded + pub fn is_match(&self, input: &str) -> bool { + self.regex.is_match(input) + } + + #[inline] + #[must_use] + /// Whether there were no regular expressions defined + pub fn is_empty(&self) -> bool { + self.regex.is_empty() + } + + /// Create a filter without any patterns; it matches no input. + #[must_use] + pub fn empty() -> Self { + Self { + regex: RegexSet::empty(), + } + } +} + +impl PartialEq for RegexFilter { + fn eq(&self, other: &Self) -> bool { + // Workaround, see https://github.com/rust-lang/regex/issues/364 + self.regex.patterns() == other.regex.patterns() + } +} + +// Comparing pattern lists is reflexive, so `Eq` is sound; it lets `Input` keep deriving `Eq`. +impl Eq for RegexFilter {} + +impl From<RegexSet> for RegexFilter { + fn from(regex: RegexSet) -> Self { + Self { regex } + } +} diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 42ea761..8631c75 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -1,11 +1,13 @@ use super::file::FileExtensions; use super::resolver::UrlContentResolver; +use crate::filter::PathExcludes; use crate::types::FileType; use crate::{ErrorKind, Result, utils}; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; use ignore::WalkBuilder; +use regex::RegexSet; use reqwest::Url; use serde::{Deserialize, Serialize}; use shellexpand::tilde; @@ -101,14 +103,14 @@ impl Display for InputSource { } /// Lychee Input with optional file hint for parsing #[derive(Clone, Debug, PartialEq, Eq)] pub struct Input { /// Origin of input pub source: InputSource, /// Hint to indicate which extractor to use pub file_type_hint: Option, /// Excluded paths that will be skipped when reading content - pub excluded_paths: Vec, + pub excluded_paths:
PathExcludes, } impl Input { @@ -124,7 +126,7 @@ impl Input { value: &str, file_type_hint: Option, glob_ignore_case: bool, - excluded_paths: Vec, + excluded_paths: PathExcludes, ) -> Result { let source = if value == STDIN { InputSource::Stdin @@ -200,7 +202,7 @@ impl Input { /// Returns an error if the input does not exist (i.e. invalid path) /// and the input cannot be parsed as a URL. pub fn from_value(value: &str) -> Result { - Self::new(value, None, false, vec![]) + Self::new(value, None, false, RegexSet::empty().into()) } /// Convenience constructor @@ -208,7 +210,7 @@ impl Input { Self { source, file_type_hint: None, - excluded_paths: vec![], + excluded_paths: RegexSet::empty().into(), } } @@ -422,13 +424,8 @@ impl TryFrom<&str> for Input { /// Function for path exclusion tests /// /// This is a standalone function to allow for easier testing -fn is_excluded_path(excluded_paths: &[PathBuf], path: &PathBuf) -> bool { - for excluded in excluded_paths { - if let Ok(true) = utils::path::contains(excluded, path) { - return true; - } - } - false +fn is_excluded_path(excluded_paths: &PathExcludes, path: &PathBuf) -> bool { + excluded_paths.is_match(&path.to_string_lossy()) } #[cfg(test)] @@ -443,7 +440,7 @@ mod tests { assert!(path.exists()); assert!(path.is_relative()); - let input = Input::new(test_file, None, false, vec![]); + let input = Input::new(test_file, None, false, PathExcludes::empty()); assert!(input.is_ok()); assert!(matches!( input, @@ -471,14 +468,20 @@ mod tests { #[test] fn test_no_exclusions() { let dir = tempfile::tempdir().unwrap(); - assert!(!is_excluded_path(&[], &dir.path().to_path_buf())); + assert!(!is_excluded_path( + &PathExcludes::empty(), + &dir.path().to_path_buf() + )); } #[test] fn test_excluded() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().to_path_buf(); - assert!(is_excluded_path(&[path.clone()], &path)); + let excludes = PathExcludes { + regex: RegexSet::new([regex::escape(&path.to_string_lossy())]).unwrap(), + }; +
assert!(is_excluded_path(&excludes, &path)); } #[test] @@ -487,10 +490,11 @@ mod tests { let parent = parent_dir.path(); let child_dir = tempfile::tempdir_in(parent).unwrap(); let child = child_dir.path(); - assert!(is_excluded_path( - &[parent.to_path_buf()], - &child.to_path_buf() - )); + + let excludes = PathExcludes { + regex: RegexSet::new([regex::escape(&parent.to_string_lossy())]).unwrap(), + }; + assert!(is_excluded_path(&excludes, &child.to_path_buf())); } #[test]