Replace Vec<PathBuf> with dedicated PathExcludes type

This commit is contained in:
Thomas Zahner 2025-07-10 18:05:51 +02:00
parent 1de218a78a
commit 002fa49f29
9 changed files with 119 additions and 102 deletions

View file

@ -1,6 +1,7 @@
use log::error;
use lychee_lib::Request;
use lychee_lib::Result;
use lychee_lib::filter::PathExcludes;
use std::fs;
use std::io::{self, Write};
use std::path::PathBuf;
@ -75,7 +76,7 @@ where
pub(crate) async fn dump_inputs<S>(
sources: S,
output: Option<&PathBuf>,
excluded_paths: &[PathBuf],
excluded_paths: &PathExcludes,
) -> Result<ExitCode>
where
S: futures::Stream<Item = Result<String>>,
@ -90,9 +91,8 @@ where
while let Some(source) = sources.next().await {
let source = source?;
let excluded = excluded_paths
.iter()
.any(|path| source.starts_with(path.to_string_lossy().as_ref()));
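// TODO: confirm this behaves as intended and is covered by tests; add a test if it is not.
// NOTE: unlike the previous `starts_with` prefix check, `is_match` is a regex search,
// so a pattern matches anywhere in `source` unless it is anchored (e.g. with `^`).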
let excluded = excluded_paths.is_match(&source);
if excluded {
continue;
}
@ -141,6 +141,7 @@ fn write_out(writer: &mut Box<dyn Write>, out_str: &str) -> io::Result<()> {
mod tests {
use super::*;
use futures::stream;
use regex::RegexSet;
use tempfile::NamedTempFile;
#[tokio::test]
@ -158,7 +159,7 @@ mod tests {
let stream = stream::iter(inputs);
// Run dump_inputs
let result = dump_inputs(stream, Some(&output_path), &[]).await?;
let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await?;
assert_eq!(result, ExitCode::Success);
// Verify output
@ -179,8 +180,10 @@ mod tests {
];
let stream = stream::iter(inputs);
let excluded = vec![PathBuf::from("excluded")];
let result = dump_inputs(stream, Some(&output_path), &excluded).await?;
let excluded = &PathExcludes {
regex: RegexSet::new(["excluded"]).unwrap(),
};
let result = dump_inputs(stream, Some(&output_path), excluded).await?;
assert_eq!(result, ExitCode::Success);
let contents = fs::read_to_string(&output_path)?;
@ -194,7 +197,7 @@ mod tests {
let output_path = temp_file.path().to_path_buf();
let stream = stream::iter::<Vec<Result<String>>>(vec![]);
let result = dump_inputs(stream, Some(&output_path), &[]).await?;
let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await?;
assert_eq!(result, ExitCode::Success);
let contents = fs::read_to_string(&output_path)?;
@ -214,7 +217,7 @@ mod tests {
];
let stream = stream::iter(inputs);
let result = dump_inputs(stream, Some(&output_path), &[]).await;
let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await;
assert!(result.is_err());
Ok(())
}
@ -225,7 +228,7 @@ mod tests {
let inputs = vec![Ok(String::from("test/path1"))];
let stream = stream::iter(inputs);
let result = dump_inputs(stream, None, &[]).await?;
let result = dump_inputs(stream, None, &PathExcludes::empty()).await?;
assert_eq!(result, ExitCode::Success);
Ok(())
}

View file

@ -70,10 +70,12 @@ use formatters::{get_stats_formatter, log::init_logging};
use http::HeaderMap;
use log::{error, info, warn};
use lychee_lib::filter::PathExcludes;
#[cfg(feature = "native-tls")]
use openssl_sys as _; // required for vendored-openssl feature
use options::{HeaderMapExt, LYCHEE_CONFIG_FILE};
use regex::RegexSet;
use ring as _; // required for apple silicon
use lychee_lib::BasicAuthExtractor;
@ -329,7 +331,9 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
let exit_code = commands::dump_inputs(
sources,
opts.config.output.as_ref(),
&opts.config.exclude_path,
&PathExcludes {
regex: RegexSet::new(&opts.config.exclude_path)?,
},
)
.await?;

View file

@ -13,6 +13,7 @@ use lychee_lib::{
DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions,
FileType, Input, StatusCodeExcluder, StatusCodeSelector, archive::Archive,
};
use regex::RegexSet;
use reqwest::tls;
use secrecy::{ExposeSecret, SecretString};
use serde::{Deserialize, Deserializer};
@ -338,7 +339,7 @@ impl LycheeOptions {
s,
None,
self.config.glob_ignore_case,
self.config.exclude_path.clone(),
RegexSet::new(&self.config.exclude_path)?.into(),
)
})
.collect::<Result<_, _>>()
@ -503,7 +504,8 @@ and 501."
#[serde(default)]
pub(crate) include: Vec<String>,
/// Exclude URLs and mail addresses from checking (supports regex)
/// Exclude URLs and mail addresses from checking.
/// The value is treated as regular expression.
#[arg(long)]
#[serde(default)]
pub(crate) exclude: Vec<String>,
@ -513,10 +515,11 @@ and 501."
#[serde(default)]
pub(crate) exclude_file: Vec<String>,
/// Exclude file path from getting checked.
/// Exclude paths from getting checked.
/// The value is treated as regular expression.
#[arg(long)]
#[serde(default)]
pub(crate) exclude_path: Vec<PathBuf>,
pub(crate) exclude_path: Vec<String>,
/// Exclude all private IPs from checking.
/// Equivalent to `--exclude-private --exclude-link-local --exclude-loopback`
@ -751,7 +754,7 @@ impl Config {
exclude_file: Vec::<String>::new(); // deprecated
exclude_link_local: false;
exclude_loopback: false;
exclude_path: Vec::<PathBuf>::new();
exclude_path: Vec::<String>::new();
exclude_private: false;
exclude: Vec::<String>::new();
extensions: FileType::default_extensions();

View file

@ -231,7 +231,9 @@ mod tests {
use super::*;
use crate::{
Result, Uri, mock_server,
Result, Uri,
filter::PathExcludes,
mock_server,
test_utils::{load_fixture, mail, path, website},
types::{FileType, Input, InputSource},
};
@ -278,7 +280,7 @@ mod tests {
&file_path.as_path().display().to_string(),
None,
true,
vec![],
PathExcludes::empty(),
)?;
let contents: Vec<_> = input
.get_contents(
@ -298,7 +300,7 @@ mod tests {
#[tokio::test]
async fn test_url_without_extension_is_html() -> Result<()> {
let input = Input::new("https://example.com/", None, true, vec![])?;
let input = Input::new("https://example.com/", None, true, PathExcludes::empty())?;
let contents: Vec<_> = input
.get_contents(
true,
@ -372,7 +374,7 @@ mod tests {
let input = Input {
source: InputSource::String("This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)".to_string()),
file_type_hint: Some(FileType::Markdown),
excluded_paths: vec![],
excluded_paths: PathExcludes::empty(),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
@ -398,7 +400,7 @@ mod tests {
.to_string(),
),
file_type_hint: Some(FileType::Html),
excluded_paths: vec![],
excluded_paths: PathExcludes::empty(),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
@ -427,7 +429,7 @@ mod tests {
.to_string(),
),
file_type_hint: Some(FileType::Html),
excluded_paths: vec![],
excluded_paths: PathExcludes::empty(),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
@ -453,7 +455,7 @@ mod tests {
.to_string(),
),
file_type_hint: Some(FileType::Markdown),
excluded_paths: vec![],
excluded_paths: PathExcludes::empty(),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
@ -476,7 +478,7 @@ mod tests {
let input = Input {
source: InputSource::String(input),
file_type_hint: Some(FileType::Html),
excluded_paths: vec![],
excluded_paths: PathExcludes::empty(),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
@ -551,7 +553,7 @@ mod tests {
.unwrap(),
)),
file_type_hint: Some(FileType::Html),
excluded_paths: vec![],
excluded_paths: PathExcludes::empty(),
},
Input {
source: InputSource::RemoteUrl(Box::new(
@ -562,7 +564,7 @@ mod tests {
.unwrap(),
)),
file_type_hint: Some(FileType::Html),
excluded_paths: vec![],
excluded_paths: PathExcludes::empty(),
},
];
@ -597,7 +599,7 @@ mod tests {
.into(),
),
file_type_hint: Some(FileType::Html),
excluded_paths: vec![],
excluded_paths: PathExcludes::empty(),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();

View file

@ -1,26 +0,0 @@
use regex::RegexSet;
/// Exclude configuration for the link checker.
///
/// You can ignore links based on regex patterns.
#[derive(Clone, Debug)]
pub struct Excludes {
/// User-defined set of excluded regex patterns.
pub(crate) regex: RegexSet,
}
impl Excludes {
/// Returns `true` if the given input string matches the regex set
/// and should hence be excluded from checking.
///
/// This is a thin wrapper around `RegexSet::is_match` on the stored set.
#[inline]
#[must_use]
pub fn is_match(&self, input: &str) -> bool {
self.regex.is_match(input)
}
/// Returns `true` if no regular expressions were defined to be excluded.
#[inline]
#[must_use]
pub fn is_empty(&self) -> bool {
self.regex.is_empty()
}
}

View file

@ -1,26 +0,0 @@
use regex::RegexSet;
/// Include configuration for the link checker.
///
/// You can include links based on regex patterns.
#[derive(Clone, Debug)]
pub struct Includes {
/// User-defined set of included regex patterns.
pub regex: RegexSet,
}
impl Includes {
/// Returns `true` if the given input string matches the regex set
/// and should hence be included and checked.
///
/// This is a thin wrapper around `RegexSet::is_match` on the stored set.
#[inline]
#[must_use]
pub fn is_match(&self, input: &str) -> bool {
self.regex.is_match(input)
}
/// Returns `true` if no regular expressions were defined for inclusion.
#[inline]
#[must_use]
pub fn is_empty(&self) -> bool {
self.regex.is_empty()
}
}

View file

@ -1,12 +1,19 @@
mod excludes;
mod includes;
mod regex_filter;
use regex::RegexSet;
use std::collections::HashSet;
use std::sync::LazyLock;
pub use excludes::Excludes;
pub use includes::Includes;
// NOTE: the three names below are aliases of the very same `RegexFilter` type.
// They document intent at the use site only; the type checker treats them as
// interchangeable.
/// Include configuration for the link checker.
/// You can include links based on regex patterns.
pub type Includes = regex_filter::RegexFilter;
/// Exclude configuration for the link checker.
/// You can ignore links based on regex patterns.
pub type Excludes = regex_filter::RegexFilter;
/// Path exclude configuration for the link checker.
/// You can exclude paths and files based on regex patterns.
pub type PathExcludes = regex_filter::RegexFilter;
use crate::Uri;

View file

@ -0,0 +1,46 @@
use regex::RegexSet;
/// Filter configuration for the link checker.
///
/// You can include and exclude links and paths based on regex patterns.
#[derive(Clone, Debug)]
pub struct RegexFilter {
/// User-defined set of regex patterns.
pub regex: RegexSet,
}
impl RegexFilter {
    /// Returns `true` if the given input string matches the regex set
    /// and should hence be included or excluded.
    ///
    /// This is a thin wrapper around [`RegexSet::is_match`]. Patterns are
    /// searched for anywhere in `input`; anchor a pattern (e.g. with `^`)
    /// to restrict where it may match.
    #[inline]
    #[must_use]
    pub fn is_match(&self, input: &str) -> bool {
        self.regex.is_match(input)
    }

    /// Returns `true` if no regular expressions were defined.
    #[inline]
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.regex.is_empty()
    }

    /// Creates a filter that holds no patterns.
    ///
    /// The filter wraps [`RegexSet::empty`], so [`Self::is_empty`] reports
    /// `true` for it.
    // `#[must_use]`: a constructor whose result is dropped does nothing, and
    // the sibling methods above carry the same attribute.
    #[inline]
    #[must_use]
    pub fn empty() -> Self {
        Self {
            regex: RegexSet::empty(),
        }
    }
}
impl PartialEq for RegexFilter {
    /// Two filters compare equal when their pattern lists are equal,
    /// element by element and in the same order.
    fn eq(&self, rhs: &Self) -> bool {
        // Workaround: compare the source patterns instead of the compiled sets,
        // see https://github.com/rust-lang/regex/issues/364
        let ours = self.regex.patterns();
        let theirs = rhs.regex.patterns();
        ours == theirs
    }
}
impl From<RegexSet> for RegexFilter {
fn from(regex: RegexSet) -> Self {
Self { regex }
}
}

View file

@ -1,11 +1,13 @@
use super::file::FileExtensions;
use super::resolver::UrlContentResolver;
use crate::filter::PathExcludes;
use crate::types::FileType;
use crate::{ErrorKind, Result, utils};
use async_stream::try_stream;
use futures::stream::Stream;
use glob::glob_with;
use ignore::WalkBuilder;
use regex::RegexSet;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use shellexpand::tilde;
@ -101,14 +103,14 @@ impl Display for InputSource {
}
/// Lychee Input with optional file hint for parsing
#[derive(Clone, Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq)]
pub struct Input {
/// Origin of input
pub source: InputSource,
/// Hint to indicate which extractor to use
pub file_type_hint: Option<FileType>,
/// Excluded paths that will be skipped when reading content
pub excluded_paths: Vec<PathBuf>,
pub excluded_paths: PathExcludes,
}
impl Input {
@ -124,7 +126,7 @@ impl Input {
value: &str,
file_type_hint: Option<FileType>,
glob_ignore_case: bool,
excluded_paths: Vec<PathBuf>,
excluded_paths: PathExcludes,
) -> Result<Self> {
let source = if value == STDIN {
InputSource::Stdin
@ -200,7 +202,7 @@ impl Input {
/// Returns an error if the input does not exist (i.e. invalid path)
/// and the input cannot be parsed as a URL.
pub fn from_value(value: &str) -> Result<Self> {
Self::new(value, None, false, vec![])
Self::new(value, None, false, RegexSet::empty().into())
}
/// Convenience constructor
@ -208,7 +210,7 @@ impl Input {
Self {
source,
file_type_hint: None,
excluded_paths: vec![],
excluded_paths: RegexSet::empty().into(),
}
}
@ -422,13 +424,8 @@ impl TryFrom<&str> for Input {
/// Function for path exclusion tests
///
/// This is a standalone function to allow for easier testing
fn is_excluded_path(excluded_paths: &[PathBuf], path: &PathBuf) -> bool {
for excluded in excluded_paths {
if let Ok(true) = utils::path::contains(excluded, path) {
return true;
}
}
false
fn is_excluded_path(excluded_paths: &PathExcludes, path: &PathBuf) -> bool {
excluded_paths.is_match(&path.to_string_lossy())
}
#[cfg(test)]
@ -443,7 +440,7 @@ mod tests {
assert!(path.exists());
assert!(path.is_relative());
let input = Input::new(test_file, None, false, vec![]);
let input = Input::new(test_file, None, false, PathExcludes::empty());
assert!(input.is_ok());
assert!(matches!(
input,
@ -471,14 +468,20 @@ mod tests {
#[test]
fn test_no_exclusions() {
let dir = tempfile::tempdir().unwrap();
assert!(!is_excluded_path(&[], &dir.path().to_path_buf()));
assert!(!is_excluded_path(
&PathExcludes::empty(),
&dir.path().to_path_buf()
));
}
#[test]
fn test_excluded() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().to_path_buf();
assert!(is_excluded_path(&[path.clone()], &path));
let excludes = PathExcludes {
regex: RegexSet::new([path.to_string_lossy()]).unwrap(),
};
assert!(is_excluded_path(&excludes, &path));
}
#[test]
@ -487,10 +490,11 @@ mod tests {
let parent = parent_dir.path();
let child_dir = tempfile::tempdir_in(parent).unwrap();
let child = child_dir.path();
assert!(is_excluded_path(
&[parent.to_path_buf()],
&child.to_path_buf()
));
let excludes = PathExcludes {
regex: RegexSet::new([parent.to_path_buf().to_string_lossy()]).unwrap(),
};
assert!(is_excluded_path(&excludes, &child.to_path_buf()));
}
#[test]