From f53619a455dd8ffa7991ec5662b46f775a2fcbfd Mon Sep 17 00:00:00 2001 From: Techassi Date: Sun, 16 Jul 2023 18:08:14 +0200 Subject: [PATCH] feat: Add support for --dump-inputs (#1159) * Add support for --dump-inputs * Add integration tests * Fix usage guide in README --- README.md | 3 + fixtures/dump_inputs/markdown.md | 0 fixtures/dump_inputs/some_file.txt | 0 fixtures/dump_inputs/subfolder/example.bin | 0 fixtures/dump_inputs/subfolder/file2.md | 0 fixtures/dump_inputs/subfolder/test.html | 0 lychee-bin/src/commands/dump.rs | 28 ++++++++- lychee-bin/src/commands/mod.rs | 1 + lychee-bin/src/main.rs | 7 +++ lychee-bin/src/options.rs | 6 ++ lychee-bin/tests/cli.rs | 70 ++++++++++++++++++++++ lychee-lib/src/collector.rs | 8 +++ lychee-lib/src/types/input.rs | 39 +++++++++++- 13 files changed, 158 insertions(+), 4 deletions(-) create mode 100644 fixtures/dump_inputs/markdown.md create mode 100644 fixtures/dump_inputs/some_file.txt create mode 100644 fixtures/dump_inputs/subfolder/example.bin create mode 100644 fixtures/dump_inputs/subfolder/file2.md create mode 100644 fixtures/dump_inputs/subfolder/test.html diff --git a/README.md b/README.md index 7890bc2..737907a 100644 --- a/README.md +++ b/README.md @@ -303,6 +303,9 @@ Options: --dump Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked + --dump-inputs + Don't perform any link extraction and checking. Instead, dump all input sources from which links would be collected + --archive Specify the use of a specific web archive. 
Can be used in combination with `--suggest` diff --git a/fixtures/dump_inputs/markdown.md b/fixtures/dump_inputs/markdown.md new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/dump_inputs/some_file.txt b/fixtures/dump_inputs/some_file.txt new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/dump_inputs/subfolder/example.bin b/fixtures/dump_inputs/subfolder/example.bin new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/dump_inputs/subfolder/file2.md b/fixtures/dump_inputs/subfolder/file2.md new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/dump_inputs/subfolder/test.html b/fixtures/dump_inputs/subfolder/test.html new file mode 100644 index 0000000..e69de29 diff --git a/lychee-bin/src/commands/dump.rs b/lychee-bin/src/commands/dump.rs index d16f725..580cdb9 100644 --- a/lychee-bin/src/commands/dump.rs +++ b/lychee-bin/src/commands/dump.rs @@ -37,8 +37,8 @@ where let requests = params.requests; tokio::pin!(requests); - if let Some(outfile) = &params.cfg.output { - fs::File::create(outfile)?; + if let Some(out_file) = &params.cfg.output { + fs::File::create(out_file)?; } let mut writer = create_writer(params.cfg.output)?; @@ -70,6 +70,30 @@ where Ok(ExitCode::Success) } +/// Dump all input sources to stdout without extracting any links and checking +/// them. 
+pub(crate) async fn dump_inputs<S>(sources: S, output: Option<&PathBuf>) -> Result<ExitCode> +where + S: futures::Stream<Item = Result<String>>, +{ + let sources = sources; + tokio::pin!(sources); + + if let Some(out_file) = output { + fs::File::create(out_file)?; + } + + let mut writer = create_writer(output.cloned())?; + + while let Some(source) = sources.next().await { + let source = source?; + + writeln!(writer, "{source}")?; + } + + Ok(ExitCode::Success) +} + /// Dump request to stdout fn write( writer: &mut Box<dyn Write>, diff --git a/lychee-bin/src/commands/mod.rs b/lychee-bin/src/commands/mod.rs index f5648bf..248a9a9 100644 --- a/lychee-bin/src/commands/mod.rs +++ b/lychee-bin/src/commands/mod.rs @@ -3,6 +3,7 @@ pub(crate) mod dump; pub(crate) use check::check; pub(crate) use dump::dump; +pub(crate) use dump::dump_inputs; use std::sync::Arc; diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 04961cf..353a959 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -293,6 +293,13 @@ async fn run(opts: &LycheeOptions) -> Result<i32> { // File a bug if you rely on this envvar! It's going to go away eventually. .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1")); + if opts.config.dump_inputs { + let sources = collector.collect_sources(inputs).await; + let exit_code = commands::dump_inputs(sources, opts.config.output.as_ref()).await?; + + return Ok(exit_code as i32); + } + collector = if let Some(ref basic_auth) = opts.config.basic_auth { collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?) } else { diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 7b212d5..e2abb9e 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -181,6 +181,12 @@ pub(crate) struct Config { #[serde(default)] pub(crate) dump: bool, + /// Don't perform any link extraction and checking. 
+ /// Instead, dump all input sources from which links would be collected + #[arg(long)] + #[serde(default)] + pub(crate) dump_inputs: bool, + /// Specify the use of a specific web archive. /// Can be used in combination with `--suggest` #[arg(long, value_parser = clap::builder::PossibleValuesParser::new(Archive::VARIANTS).map(|s| s.parse::<Archive>().unwrap()))] diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index a039eb4..1702401 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1347,4 +1347,74 @@ mod cli { Ok(()) } + + #[test] + fn test_dump_inputs_glob_md() -> Result<()> { + let pattern = fixtures_path().join("**/*.md"); + + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg(pattern) + .assert() + .success() + .stdout(contains("fixtures/dump_inputs/subfolder/file2.md")) + .stdout(contains("fixtures/dump_inputs/markdown.md")); + + Ok(()) + } + + #[test] + fn test_dump_inputs_glob_all() -> Result<()> { + let pattern = fixtures_path().join("**/*"); + + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg(pattern) + .assert() + .success() + .stdout(contains("fixtures/dump_inputs/subfolder/test.html")) + .stdout(contains("fixtures/dump_inputs/subfolder/file2.md")) + .stdout(contains("fixtures/dump_inputs/subfolder")) + .stdout(contains("fixtures/dump_inputs/markdown.md")) + .stdout(contains("fixtures/dump_inputs/subfolder/example.bin")) + .stdout(contains("fixtures/dump_inputs/some_file.txt")); + + Ok(()) + } + + #[test] + fn test_dump_inputs_url() -> Result<()> { + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg("https://example.com") + .assert() + .success() + .stdout(contains("https://example.com")); + + Ok(()) + } + + #[test] + fn test_dump_inputs_path() -> Result<()> { + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg("fixtures") + .assert() + .success() + .stdout(contains("fixtures")); + + Ok(()) + } + + #[test] + fn test_dump_inputs_stdin() -> Result<()> { + let mut cmd = 
main_command(); + cmd.arg("--dump-inputs") + .arg("-") + .assert() + .success() + .stdout(contains("Stdin")); + + Ok(()) + } } diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index de4e421..9d94eb1 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -63,6 +63,14 @@ impl Collector { self } + /// Collect all sources from a list of [`Input`]s. For further details, + /// see also [`Input::get_sources`](crate::Input#method.get_sources). + pub async fn collect_sources(self, inputs: Vec<Input>) -> impl Stream<Item = Result<String>> { + stream::iter(inputs) + .par_then_unordered(None, move |input| async move { input.get_sources().await }) + .flatten() + } + /// Fetch all unique links from inputs /// All relative URLs get prefixed with `base` (if given). /// (This can be a directory or a base URL) diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 25052ee..3549f2c 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -260,6 +260,41 @@ impl Input { } } + /// Retrieve all sources from this input. The output depends on the type of + /// input: + /// + /// - Remote URLs are returned as is, in their full form + /// - Filepath Glob Patterns are expanded and each matched entry is returned + /// - Absolute or relative filepaths are returned as is + /// - All other input types are not returned + /// + /// # Errors + /// + /// Returns an error if the globbing fails with the expanded pattern. + pub async fn get_sources(self) -> impl Stream<Item = Result<String>> { + try_stream! { + match self.source { + InputSource::RemoteUrl(url) => yield url.to_string(), + InputSource::FsGlob { pattern, ignore_case } => { + let glob_expanded = tilde(&pattern).to_string(); + let mut match_opts = glob::MatchOptions::new(); + + match_opts.case_sensitive = !ignore_case; + + for entry in glob_with(&glob_expanded, match_opts)? 
{ + match entry { + Ok(path) => yield path.to_string_lossy().to_string(), + Err(e) => eprintln!("{e:?}") + } + } + }, + InputSource::FsPath(path) => yield path.to_string_lossy().to_string(), + InputSource::Stdin => yield "Stdin".into(), + InputSource::String(_) => yield "Raw String".into(), + } + } + } + async fn url_contents(url: &Url) -> Result { // Assume HTML for default paths let file_type = if url.path().is_empty() || url.path() == "/" { @@ -282,10 +317,10 @@ impl Input { async fn glob_contents( &self, - path_glob: &str, + pattern: &str, ignore_case: bool, ) -> impl Stream> + '_ { - let glob_expanded = tilde(&path_glob).to_string(); + let glob_expanded = tilde(&pattern).to_string(); let mut match_opts = glob::MatchOptions::new(); match_opts.case_sensitive = !ignore_case;