feat: Add support for --dump-inputs (#1159)

* Add support for --dump-inputs
* Add integration tests
* Fix usage guide in README
This commit is contained in:
Techassi 2023-07-16 18:08:14 +02:00 committed by GitHub
parent f1817ead5e
commit f53619a455
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 158 additions and 4 deletions

View file

@ -303,6 +303,9 @@ Options:
--dump
Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked
--dump-inputs
Don't perform any link extraction and checking. Instead, dump all input sources from which links would be collected
--archive <ARCHIVE>
Specify the use of a specific web archive. Can be used in combination with `--suggest`

0
fixtures/dump_inputs/markdown.md vendored Normal file
View file

0
fixtures/dump_inputs/some_file.txt vendored Normal file
View file

View file

View file

View file

View file

@ -37,8 +37,8 @@ where
let requests = params.requests;
tokio::pin!(requests);
if let Some(outfile) = &params.cfg.output {
fs::File::create(outfile)?;
if let Some(out_file) = &params.cfg.output {
fs::File::create(out_file)?;
}
let mut writer = create_writer(params.cfg.output)?;
@ -70,6 +70,30 @@ where
Ok(ExitCode::Success)
}
/// Dump all input sources without extracting or checking any links.
///
/// Every item yielded by `sources` is written on its own line. When `output`
/// names a file, that file is created (truncating existing content) up front
/// and the same path is handed to `create_writer`; with `None` the writer
/// presumably targets stdout (see `create_writer` to confirm).
///
/// # Errors
///
/// Returns an error if the output file cannot be created, if the writer
/// cannot be set up, if the stream yields an `Err` item, or if writing a
/// line fails. Processing stops at the first error.
pub(crate) async fn dump_inputs<S>(sources: S, output: Option<&PathBuf>) -> Result<ExitCode>
where
    S: futures::Stream<Item = Result<String>>,
{
    // `tokio::pin!` shadows the binding itself, so the parameter can be
    // pinned directly without rebinding it first.
    tokio::pin!(sources);

    if let Some(out_file) = output {
        fs::File::create(out_file)?;
    }

    let mut writer = create_writer(output.cloned())?;

    while let Some(source) = sources.next().await {
        let source = source?;
        writeln!(writer, "{source}")?;
    }

    Ok(ExitCode::Success)
}
/// Dump request to stdout
fn write(
writer: &mut Box<dyn Write>,

View file

@ -3,6 +3,7 @@ pub(crate) mod dump;
pub(crate) use check::check;
pub(crate) use dump::dump;
pub(crate) use dump::dump_inputs;
use std::sync::Arc;

View file

@ -293,6 +293,13 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"));
if opts.config.dump_inputs {
let sources = collector.collect_sources(inputs).await;
let exit_code = commands::dump_inputs(sources, opts.config.output.as_ref()).await?;
return Ok(exit_code as i32);
}
collector = if let Some(ref basic_auth) = opts.config.basic_auth {
collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?)
} else {

View file

@ -181,6 +181,12 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) dump: bool,
/// Don't perform any link extraction and checking.
/// Instead, dump all input sources from which links would be collected
#[arg(long)]
#[serde(default)]
pub(crate) dump_inputs: bool,
/// Specify the use of a specific web archive.
/// Can be used in combination with `--suggest`
#[arg(long, value_parser = clap::builder::PossibleValuesParser::new(Archive::VARIANTS).map(|s| s.parse::<Archive>().unwrap()))]

View file

@ -1347,4 +1347,74 @@ mod cli {
Ok(())
}
/// `--dump-inputs` with a `**/*.md` glob under the fixtures directory must
/// list the Markdown fixtures in stdout.
#[test]
fn test_dump_inputs_glob_md() -> Result<()> {
    let md_glob = fixtures_path().join("**/*.md");
    let mut cmd = main_command();

    // Run the command first, then check the captured output step by step.
    let run = cmd.arg("--dump-inputs").arg(md_glob).assert().success();
    let run = run.stdout(contains("fixtures/dump_inputs/subfolder/file2.md"));
    run.stdout(contains("fixtures/dump_inputs/markdown.md"));

    Ok(())
}
/// `--dump-inputs` with a catch-all `**/*` glob under the fixtures directory
/// must list every expected fixture entry (files and the subfolder) in stdout.
#[test]
fn test_dump_inputs_glob_all() -> Result<()> {
    // Substrings that must each appear somewhere in the command's stdout.
    let expected_entries = [
        "fixtures/dump_inputs/subfolder/test.html",
        "fixtures/dump_inputs/subfolder/file2.md",
        "fixtures/dump_inputs/subfolder",
        "fixtures/dump_inputs/markdown.md",
        "fixtures/dump_inputs/subfolder/example.bin",
        "fixtures/dump_inputs/some_file.txt",
    ];

    let all_glob = fixtures_path().join("**/*");
    let mut cmd = main_command();
    let mut run = cmd.arg("--dump-inputs").arg(all_glob).assert().success();

    for entry in expected_entries {
        run = run.stdout(contains(entry));
    }

    Ok(())
}
/// `--dump-inputs` with a remote URL must echo that URL in stdout.
#[test]
fn test_dump_inputs_url() -> Result<()> {
    let mut cmd = main_command();

    let run = cmd
        .arg("--dump-inputs")
        .arg("https://example.com")
        .assert();
    run.success().stdout(contains("https://example.com"));

    Ok(())
}
/// `--dump-inputs` with a plain directory path must show that path in stdout.
#[test]
fn test_dump_inputs_path() -> Result<()> {
    // Resolve the directory through `fixtures_path()` like the glob tests do,
    // rather than a bare relative "fixtures" whose meaning depends on the
    // working directory the test runner happens to use.
    let mut cmd = main_command();
    cmd.arg("--dump-inputs")
        .arg(fixtures_path())
        .assert()
        .success()
        .stdout(contains("fixtures"));

    Ok(())
}
/// `--dump-inputs` with `-` as the input must report the `Stdin` placeholder
/// in stdout.
#[test]
fn test_dump_inputs_stdin() -> Result<()> {
    let mut cmd = main_command();

    let run = cmd.arg("--dump-inputs").arg("-").assert();
    run.success().stdout(contains("Stdin"));

    Ok(())
}
}

View file

@ -63,6 +63,14 @@ impl Collector {
self
}
/// Turn a list of [`Input`]s into a single stream of their sources.
///
/// Each input is expanded through
/// [`Input::get_sources`](crate::Input#method.get_sources) and the resulting
/// per-input streams are flattened into one stream of source strings.
pub async fn collect_sources(self, inputs: Vec<Input>) -> impl Stream<Item = Result<String>> {
    // One inner stream of sources per input.
    let per_input_sources = stream::iter(inputs)
        .par_then_unordered(None, move |item| async move { item.get_sources().await });

    per_input_sources.flatten()
}
/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)

View file

@ -260,6 +260,41 @@ impl Input {
}
}
/// Retrieve all sources from this input. The output depends on the type of
/// input:
///
/// - Remote URLs are returned as is, in their full form
/// - Filepath glob patterns are tilde-expanded and each matched entry is
///   returned (paths are converted to strings lossily)
/// - Absolute or relative filepaths are returned as is
/// - Standard input is reported as the placeholder `Stdin`
/// - Raw string input is reported as the placeholder `Raw String`
///
/// Glob entries that cannot be read are printed to stderr and skipped; they
/// do not end the stream.
///
/// # Errors
///
/// Returns an error if the globbing fails with the expanded pattern.
pub async fn get_sources(self) -> impl Stream<Item = Result<String>> {
    try_stream! {
        match self.source {
            InputSource::RemoteUrl(url) => yield url.to_string(),
            InputSource::FsGlob { pattern, ignore_case } => {
                let glob_expanded = tilde(&pattern).into_owned();
                let mut match_opts = glob::MatchOptions::new();
                match_opts.case_sensitive = !ignore_case;

                for entry in glob_with(&glob_expanded, match_opts)? {
                    match entry {
                        Ok(path) => yield path.to_string_lossy().into_owned(),
                        // Best effort: report the unreadable entry and keep going.
                        Err(e) => eprintln!("{e:?}"),
                    }
                }
            },
            InputSource::FsPath(path) => yield path.to_string_lossy().into_owned(),
            InputSource::Stdin => yield "Stdin".into(),
            InputSource::String(_) => yield "Raw String".into(),
        }
    }
}
async fn url_contents(url: &Url) -> Result<InputContent> {
// Assume HTML for default paths
let file_type = if url.path().is_empty() || url.path() == "/" {
@ -282,10 +317,10 @@ impl Input {
async fn glob_contents(
&self,
path_glob: &str,
pattern: &str,
ignore_case: bool,
) -> impl Stream<Item = Result<InputContent>> + '_ {
let glob_expanded = tilde(&path_glob).to_string();
let glob_expanded = tilde(&pattern).to_string();
let mut match_opts = glob::MatchOptions::new();
match_opts.case_sensitive = !ignore_case;