Add support for reading from stdin and make input handling more robust (closes #26)

* Adds a `skip_missing` flag
* Adds an `Input` enum to handle different types of inputs
This commit is contained in:
Paweł Romanowski 2020-12-02 23:28:37 +01:00 committed by GitHub
parent e197012e7a
commit 1f787613d4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 736 additions and 149 deletions

80
Cargo.lock generated
View file

@ -119,6 +119,12 @@ version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d25d88fd6b8041580a654f9d0c581a047baee2b3efee13275f2fc392fc75034"
[[package]]
name = "arrayref"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544"
[[package]]
name = "arrayvec"
version = "0.5.1"
@ -476,6 +482,17 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "blake2b_simd"
version = "0.5.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afa748e348ad3be8263be728124b24a24f268266f6f5d58af9d75f6a40b5c587"
dependencies = [
"arrayref",
"arrayvec",
"constant_time_eq",
]
[[package]]
name = "block-buffer"
version = "0.7.3"
@ -712,6 +729,12 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce90df4c658c62f12d78f7508cf92f9173e5184a539c10bfe54a3107b3ffd0f2"
[[package]]
name = "constant_time_eq"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
[[package]]
name = "cookie"
version = "0.14.2"
@ -943,6 +966,27 @@ dependencies = [
"generic-array 0.14.4",
]
[[package]]
name = "dirs"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13aea89a5c93364a98e9b37b2fa237effbb694d5cfe01c5b70941f7eb087d5e3"
dependencies = [
"cfg-if 0.1.10",
"dirs-sys",
]
[[package]]
name = "dirs-sys"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e93d7f5705de3e49895a2b5e0b8855a1c27f080192ae9c32a6432d50741a57a"
dependencies = [
"libc",
"redox_users",
"winapi 0.3.9",
]
[[package]]
name = "discard"
version = "1.0.4"
@ -1790,6 +1834,7 @@ dependencies = [
"http",
"hubcaps",
"indicatif",
"lazy_static",
"linkify",
"log",
"predicates",
@ -1799,10 +1844,13 @@ dependencies = [
"regex",
"reqwest",
"serde",
"shellexpand",
"structopt",
"tempfile",
"tokio 0.2.22",
"toml",
"url",
"uuid",
"wiremock",
]
@ -2421,6 +2469,17 @@ version = "0.1.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
[[package]]
name = "redox_users"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de0737333e7a9502c789a36d7c7fa6092a49895d4faa31ca5df163857ded2e9d"
dependencies = [
"getrandom",
"redox_syscall",
"rust-argon2",
]
[[package]]
name = "regex"
version = "1.4.2"
@ -2522,6 +2581,18 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "rust-argon2"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b18820d944b33caa75a71378964ac46f58517c92b6ae5f762636247c09e78fb"
dependencies = [
"base64 0.13.0",
"blake2b_simd",
"constant_time_eq",
"crossbeam-utils 0.8.0",
]
[[package]]
name = "rustc-demangle"
version = "0.1.16"
@ -2698,6 +2769,15 @@ dependencies = [
"lazy_static",
]
[[package]]
name = "shellexpand"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2b22262a9aaf9464d356f656fea420634f78c881c5eebd5ef5e66d8b9bc603"
dependencies = [
"dirs",
]
[[package]]
name = "signal-hook-registry"
version = "1.2.1"

View file

@ -30,6 +30,8 @@ quick-xml = "0.20.0"
headers = "0.3.2"
derive_builder = "0.9.0"
deadpool = "0.6.0"
shellexpand = "2.0"
lazy_static = "1.1"
[dependencies.reqwest]
features = ["gzip"]
@ -43,3 +45,5 @@ version = "0.2"
wiremock = "0.3"
assert_cmd = "1.0"
predicates = "1.0"
uuid = { version = "0.8", features = ["v4"] }
tempfile = "3.1"

View file

@ -80,10 +80,32 @@ cargo install lychee
## Usage
Run it inside a repository with a `README.md`, or specify a file with
Run it inside a repository with a `README.md`:
```
lychee <yourfile>
lychee
```
You can also specify various types of inputs:
```
# check links on a website:
lychee https://endler.dev/
# check links in a remote file:
lychee https://raw.githubusercontent.com/lycheeverse/lychee/master/README.md
# check links in local file(s):
lychee README.md
lychee test.html info.txt
# check links in local files (by shell glob):
lychee ~/projects/*/README.md
# check links in local files (lychee supports advanced globbing and ~ expansion):
lychee "~/projects/big_project/**/README.*"
# ignore case when globbing, displaying progress and check result for each link:
lychee --glob-ignore-case --progress --verbose "~/projects/**/[r]eadme.*"
```
Optional (to avoid getting rate-limited): set an environment variable with your GitHub token
@ -93,7 +115,7 @@ config file.
### CLI exit codes
- `0` for success (all links checked successfully or excluded/skipped as configured)
- `1` for any unexpected runtime failures or config errors
- `1` for missing inputs and any unexpected runtime failures or config errors
- `2` for link check failures (if any non-excluded link failed the check)
## Troubleshooting and workarounds

View file

@ -1,3 +0,0 @@
Test file: this link should be a valid link but return a HTTP 404 when followed.
http://httpbin.org/status/404

View file

@ -1,63 +1,283 @@
use crate::extract::{extract_links, FileType};
use crate::types::Uri;
use anyhow::Result;
use glob::glob;
use anyhow::{anyhow, Context, Result};
use glob::glob_with;
use reqwest::Url;
use std::{collections::HashSet, fs};
use std::{ffi::OsStr, path::Path};
use shellexpand::tilde;
use std::collections::HashSet;
use std::path::Path;
use std::path::PathBuf;
use tokio::fs::read_to_string;
use tokio::io::{stdin, AsyncReadExt};
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
fn resolve_file_type_by_path<P: AsRef<Path>>(path: P) -> FileType {
match path.as_ref().extension().and_then(OsStr::to_str) {
Some("md") => FileType::Markdown,
Some("html") => FileType::HTML,
_ => FileType::Plaintext,
const STDIN: &str = "-";
/// A source of links to check.
///
/// Constructed from a raw CLI argument via `Input::new`; the contents
/// behind each variant are fetched by `Input::get_contents`.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub(crate) enum Input {
    /// A URL to download and scan (e.g. `https://example.com/README.md`).
    RemoteUrl(Url),
    /// A filesystem glob pattern (e.g. `~/projects/*/README.md`).
    FsGlob { pattern: String, ignore_case: bool },
    /// A single local file path.
    FsPath(PathBuf),
    /// Read content from standard input (`-` on the command line).
    Stdin,
    /// Raw string content supplied directly.
    String(String),
}
/// A piece of content to scan for links, together with its origin and the
/// file type used to pick the link-extraction strategy.
#[derive(Debug)]
pub(crate) struct InputContent {
    /// Where the content came from (used e.g. for error reporting)
    pub input: Input,
    /// How to parse the content (Markdown, HTML, or plaintext)
    pub file_type: FileType,
    /// The raw text to extract links from
    pub content: String,
}
impl InputContent {
    /// Build an `InputContent` directly from an in-memory string.
    ///
    /// The string is stored twice: once inside the `Input::String`
    /// provenance marker and once as the content to scan.
    // TODO: consider using Cow (to avoid one .clone() for String types)
    pub fn from_string(s: &str, file_type: FileType) -> Self {
        let owned = s.to_owned();
        Self {
            input: Input::String(owned.clone()),
            file_type,
            content: owned,
        }
    }
}
/// Fetch all unique links from a vector of inputs
impl Input {
    /// Parse a raw CLI value into a concrete `Input` variant.
    ///
    /// Resolution order: the literal `-` means stdin; anything that parses
    /// as a URL is treated as remote; otherwise the value is a filesystem
    /// glob (if it contains glob metacharacters) or a plain path.
    pub(crate) fn new(value: &str, glob_ignore_case: bool) -> Self {
        if value == STDIN {
            Self::Stdin
        } else {
            match Url::parse(&value) {
                Ok(url) => Self::RemoteUrl(url),
                Err(_) => {
                    // this seems to be the only way to determine if this is a glob pattern
                    let is_glob = glob::Pattern::escape(value) != value;
                    if is_glob {
                        Self::FsGlob {
                            pattern: value.to_owned(),
                            ignore_case: glob_ignore_case,
                        }
                    } else {
                        Self::FsPath(value.into())
                    }
                }
            }
        }
    }

    /// Fetch the contents behind this input.
    ///
    /// Returns one `InputContent` per resolved source (a glob may expand to
    /// several files). `file_type_hint` is used for sources with no file
    /// extension (stdin and raw strings). When `skip_missing` is set, a
    /// missing filesystem path yields an empty vector instead of an error.
    pub async fn get_contents(
        &self,
        file_type_hint: Option<FileType>,
        skip_missing: bool,
    ) -> Result<Vec<InputContent>> {
        use Input::*;
        match self {
            // TODO: should skip_missing also affect URLs?
            RemoteUrl(url) => Ok(vec![Self::url_contents(url).await?]),
            FsGlob {
                pattern,
                ignore_case,
            } => Ok(Self::glob_contents(pattern, *ignore_case).await?),
            FsPath(path) => {
                let content = Self::path_content(&path).await.with_context(|| {
                    format!(
                        "Failed to read file: `{}`",
                        path.to_str().unwrap_or("<MALFORMED PATH>")
                    )
                });
                match content {
                    Ok(input_content) => Ok(vec![input_content]),
                    // `skip_missing` turns a failed read into "no content"
                    Err(_) if skip_missing => Ok(vec![]),
                    Err(arg) => Err(anyhow!(arg)),
                }
            }
            Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
            String(s) => Ok(vec![Self::string_content(s, file_type_hint)?]),
        }
    }

    /// Download a remote URL and wrap the body; the file type is guessed
    /// from the URL string itself.
    async fn url_contents(url: &Url) -> Result<InputContent> {
        let res = reqwest::get(url.clone()).await?;
        let content = res.text().await?;
        let input_content = InputContent {
            input: Input::RemoteUrl(url.clone()),
            file_type: FileType::from(url.as_str()),
            content,
        };

        Ok(input_content)
    }

    /// Expand a glob pattern (after `~` expansion) and read every match.
    async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
        let mut contents = vec![];
        let glob_expanded = tilde(&path_glob);
        let mut match_opts = glob::MatchOptions::new();

        match_opts.case_sensitive = !ignore_case;

        for entry in glob_with(&glob_expanded, match_opts)? {
            match entry {
                Ok(path) => {
                    let content = Self::path_content(&path).await?;
                    contents.push(content);
                }
                // NOTE(review): per-entry glob errors are printed to stdout
                // and otherwise ignored; consider eprintln!/log instead.
                Err(e) => println!("{:?}", e),
            }
        }

        Ok(contents)
    }

    /// Read a single file from disk; the file type is derived from its
    /// extension.
    async fn path_content<P: Into<PathBuf> + AsRef<Path>>(path: P) -> Result<InputContent> {
        let input_content = InputContent {
            file_type: FileType::from(path.as_ref()),
            content: read_to_string(&path).await?,
            input: Input::FsPath(path.into()),
        };

        Ok(input_content)
    }

    /// Read everything from standard input until EOF.
    async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
        let mut content = String::new();
        let mut stdin = stdin();
        stdin.read_to_string(&mut content).await?;

        let input_content = InputContent {
            input: Input::Stdin,
            file_type: file_type_hint.unwrap_or_default(),
            content,
        };

        Ok(input_content)
    }

    /// Wrap an in-memory string as input content.
    fn string_content(s: &str, file_type_hint: Option<FileType>) -> Result<InputContent> {
        Ok(InputContent::from_string(
            s,
            file_type_hint.unwrap_or_default(),
        ))
    }
}
/// Render the input back to the form it was given on the command line.
///
/// Implemented as `Display` (rather than a direct `ToString` impl) so the
/// blanket `impl<T: Display> ToString for T` keeps `.to_string()` working
/// for existing callers while also enabling `format!("{}", input)`.
impl std::fmt::Display for Input {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::RemoteUrl(url) => write!(f, "{}", url),
            Self::FsGlob { pattern, .. } => f.write_str(pattern),
            // A path with invalid UTF-8 renders as an empty string,
            // matching the previous `to_str().unwrap_or_default()` behavior.
            Self::FsPath(p) => f.write_str(p.to_str().unwrap_or_default()),
            Self::Stdin => f.write_str(STDIN),
            Self::String(s) => f.write_str(s),
        }
    }
}
/// Fetch all unique links from a slice of inputs
/// All relative URLs get prefixed with `base_url` if given.
pub(crate) async fn collect_links(
inputs: Vec<String>,
inputs: &[Input],
base_url: Option<String>,
skip_missing_inputs: bool,
max_concurrency: usize,
) -> Result<HashSet<Uri>> {
let base_url = match base_url {
Some(url) => Some(Url::parse(&url)?),
_ => None,
};
let mut links = HashSet::new();
let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(max_concurrency);
for input in inputs {
match Url::parse(&input) {
Ok(url) => {
let path = String::from(url.path());
let res = reqwest::get(url).await?;
let content = res.text().await?;
// extract input contents
for input in inputs.iter().cloned() {
let mut sender = contents_tx.clone();
links.extend(extract_links(
resolve_file_type_by_path(path),
&content,
base_url.clone(),
));
}
Err(_) => {
// Assume we got a single file or a glob on our hands
for entry in glob(&input)? {
match entry {
Ok(path) => {
let content = fs::read_to_string(&path)?;
links.extend(extract_links(
resolve_file_type_by_path(&path),
&content,
base_url.clone(),
));
}
Err(e) => println!("Error handling file pattern {}: {:?}", input, e),
}
}
}
};
tokio::spawn(async move {
let contents = input.get_contents(None, skip_missing_inputs).await;
sender.send(contents).await
});
}
// receiver will get None once all tasks are done
drop(contents_tx);
// extract links from input contents
let mut extract_links_handles = vec![];
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
let base_url = base_url.clone();
let handle =
tokio::task::spawn_blocking(move || extract_links(&input_content, base_url));
extract_links_handles.push(handle);
}
}
// Note: we could dispatch links to be checked as soon as we get them,
// instead of building a HashSet with all links.
// This optimization would speed up cases where there's
// a lot of inputs and/or the inputs are large (e.g. big files).
let mut collected_links = HashSet::new();
for handle in extract_links_handles {
let links = handle.await?;
collected_links.extend(links);
}
Ok(collected_links)
}
#[cfg(test)]
mod test {
use super::*;
use crate::test_utils::get_mock_server_with_content;
use std::fs::File;
use std::io::Write;
use std::str::FromStr;
const TEST_STRING: &str = "http://test-string.com";
const TEST_URL: &str = "https://test-url.org";
const TEST_FILE: &str = "https://test-file.io";
const TEST_GLOB_1: &str = "https://test-glob-1.io";
const TEST_GLOB_2_MAIL: &str = "test@glob-2.io";
#[tokio::test]
async fn test_collect_links() -> Result<()> {
    // Exercise one input of each kind: raw string, remote URL (mocked),
    // single file, and a glob matching two files.
    let dir = tempfile::tempdir()?;
    let file_path = dir.path().join("f");
    let file_glob_1_path = dir.path().join("glob-1");
    let file_glob_2_path = dir.path().join("glob-2");

    let mut file = File::create(&file_path)?;
    let mut file_glob_1 = File::create(file_glob_1_path)?;
    let mut file_glob_2 = File::create(file_glob_2_path)?;

    writeln!(file, "{}", TEST_FILE)?;
    writeln!(file_glob_1, "{}", TEST_GLOB_1)?;
    writeln!(file_glob_2, "{}", TEST_GLOB_2_MAIL)?;

    let mock_server = get_mock_server_with_content(http::StatusCode::OK, Some(TEST_URL)).await;

    let inputs = vec![
        Input::String(TEST_STRING.to_string()),
        Input::RemoteUrl(Url::from_str(&mock_server.uri())?),
        Input::FsPath(file_path),
        Input::FsGlob {
            pattern: dir.path().join("glob*").to_str().unwrap().to_string(),
            ignore_case: true,
        },
    ];

    let links = collect_links(&inputs, None, false, 8).await?;

    // One link per input source; the glob contributes two (one per file).
    let mut expected_links = HashSet::new();
    expected_links.insert(Uri::Website(Url::from_str(TEST_STRING)?));
    expected_links.insert(Uri::Website(Url::from_str(TEST_URL)?));
    expected_links.insert(Uri::Website(Url::from_str(TEST_FILE)?));
    expected_links.insert(Uri::Website(Url::from_str(TEST_GLOB_1)?));
    expected_links.insert(Uri::Mail(TEST_GLOB_2_MAIL.to_string()));

    assert_eq!(links, expected_links);

    Ok(())
}
Ok(links)
}

View file

@ -1,3 +1,4 @@
use crate::collector::InputContent;
use crate::types::Uri;
use linkify::LinkFinder;
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
@ -13,6 +14,27 @@ pub(crate) enum FileType {
Plaintext,
}
impl Default for FileType {
fn default() -> Self {
Self::Plaintext
}
}
impl<P: AsRef<Path>> From<P> for FileType {
    /// Detect if the given path points to a Markdown, HTML, or plaintext file.
    ///
    /// Unknown, missing, or non-UTF-8 extensions fall back to plaintext,
    /// matching the behavior of the extension comparison this replaces.
    fn from(p: P) -> FileType {
        match p.as_ref().extension().and_then(|ext| ext.to_str()) {
            Some("md") => FileType::Markdown,
            Some("htm") | Some("html") => FileType::HTML,
            _ => FileType::Plaintext,
        }
    }
}
// Use LinkFinder here to offload the actual link searching
fn find_links(input: &str) -> Vec<linkify::Link> {
let finder = LinkFinder::new();
@ -105,15 +127,11 @@ fn extract_links_from_plaintext(input: &str) -> Vec<String> {
.collect()
}
pub(crate) fn extract_links(
file_type: FileType,
input: &str,
base_url: Option<Url>,
) -> HashSet<Uri> {
let links = match file_type {
FileType::Markdown => extract_links_from_markdown(input),
FileType::HTML => extract_links_from_html(input),
FileType::Plaintext => extract_links_from_plaintext(input),
pub(crate) fn extract_links(input_content: &InputContent, base_url: Option<Url>) -> HashSet<Uri> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::HTML => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
// Only keep legit URLs. This sorts out things like anchors.
@ -149,8 +167,7 @@ mod test {
fn test_extract_markdown_links() {
let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";
let links = extract_links(
FileType::Markdown,
input,
&InputContent::from_string(input, FileType::Markdown),
Some(Url::parse("https://github.com/hello-rust/lychee/").unwrap()),
);
assert_eq!(
@ -178,8 +195,7 @@ mod test {
</html>"#;
let links = extract_links(
FileType::HTML,
input,
&InputContent::from_string(input, FileType::HTML),
Some(Url::parse("https://github.com/hello-rust/").unwrap()),
);
@ -196,14 +212,14 @@ mod test {
#[test]
fn test_skip_markdown_anchors() {
let input = "This is [a test](#lol).";
let links = extract_links(FileType::Markdown, input, None);
let links = extract_links(&InputContent::from_string(input, FileType::Markdown), None);
assert_eq!(links, HashSet::new())
}
#[test]
fn test_skip_markdown_internal_urls() {
let input = "This is [a test](./internal).";
let links = extract_links(FileType::Markdown, input, None);
let links = extract_links(&InputContent::from_string(input, FileType::Markdown), None);
assert_eq!(links, HashSet::new())
}
@ -211,7 +227,7 @@ mod test {
fn test_non_markdown_links() {
let input =
"https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
let links = extract_links(FileType::Plaintext, input, None);
let links = extract_links(&InputContent::from_string(input, FileType::Plaintext), None);
let expected = HashSet::from_iter(
[
Uri::Website(Url::parse("https://endler.dev").unwrap()),

View file

@ -19,12 +19,15 @@ mod options;
mod stats;
mod types;
#[cfg(test)]
mod test_utils;
use client::ClientBuilder;
use client_pool::ClientPool;
use collector::Input;
use options::{Config, LycheeOptions};
use stats::ResponseStats;
use types::Response;
use types::{Excludes, Status};
use types::{Excludes, Response, Status};
/// A C-like enum that can be cast to `i32` and used as process exit code.
enum ExitCode {
@ -39,14 +42,13 @@ enum ExitCode {
fn main() -> Result<()> {
pretty_env_logger::init();
let opts = LycheeOptions::from_args();
let mut opts = LycheeOptions::from_args();
// Load a potentially existing config file and merge it into the config from the CLI
let cfg = if let Some(c) = Config::load_from_file(&opts.config_file)? {
if let Some(c) = Config::load_from_file(&opts.config_file)? {
opts.config.merge(c)
} else {
opts.config
};
}
let cfg = &opts.config;
let mut runtime = match cfg.threads {
Some(threads) => {
@ -59,7 +61,7 @@ fn main() -> Result<()> {
}
None => tokio::runtime::Runtime::new()?,
};
let errorcode = runtime.block_on(run(cfg, opts.inputs))?;
let errorcode = runtime.block_on(run(cfg, opts.inputs()))?;
std::process::exit(errorcode);
}
@ -76,7 +78,7 @@ fn show_progress(progress_bar: &Option<ProgressBar>, response: &Response, verbos
};
}
async fn run(cfg: Config, inputs: Vec<String>) -> Result<i32> {
async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
let mut headers = parse_headers(&cfg.headers)?;
if let Some(auth) = &cfg.basic_auth {
let auth_header = parse_basic_auth(&auth)?;
@ -84,8 +86,8 @@ async fn run(cfg: Config, inputs: Vec<String>) -> Result<i32> {
}
let accepted = cfg.accept.clone().and_then(|a| parse_statuscodes(&a).ok());
let timeout = parse_timeout(&cfg.timeout)?;
let max_concurrency = cfg.max_concurrency.parse()?;
let timeout = parse_timeout(cfg.timeout);
let max_concurrency = cfg.max_concurrency;
let method: reqwest::Method = reqwest::Method::from_str(&cfg.method.to_uppercase())?;
let includes = RegexSet::new(&cfg.include)?;
let excludes = Excludes::from_options(&cfg);
@ -94,18 +96,24 @@ async fn run(cfg: Config, inputs: Vec<String>) -> Result<i32> {
.includes(includes)
.excludes(excludes)
.max_redirects(cfg.max_redirects)
.user_agent(cfg.user_agent)
.user_agent(cfg.user_agent.clone())
.allow_insecure(cfg.insecure)
.custom_headers(headers)
.method(method)
.timeout(timeout)
.verbose(cfg.verbose)
.github_token(cfg.github_token)
.scheme(cfg.scheme)
.github_token(cfg.github_token.clone())
.scheme(cfg.scheme.clone())
.accepted(accepted)
.build()?;
let links = collector::collect_links(inputs, cfg.base_url.clone()).await?;
let links = collector::collect_links(
&inputs,
cfg.base_url.clone(),
cfg.skip_missing,
max_concurrency,
)
.await?;
let pb = if cfg.progress {
Some(
ProgressBar::new(links.len() as u64)
@ -173,8 +181,8 @@ fn read_header(input: &str) -> Result<(String, String)> {
Ok((elements[0].into(), elements[1].into()))
}
fn parse_timeout<S: AsRef<str>>(timeout: S) -> Result<Duration> {
Ok(Duration::from_secs(timeout.as_ref().parse::<u64>()?))
/// Convert a timeout given in whole seconds into a `Duration`.
fn parse_timeout(timeout: usize) -> Duration {
    // usize -> u64 never truncates on supported (<= 64-bit) targets
    let secs = timeout as u64;
    Duration::from_secs(secs)
}
fn parse_headers<T: AsRef<str>>(headers: &[T]) -> Result<HeaderMap> {

View file

@ -1,12 +1,24 @@
use crate::collector::Input;
use anyhow::{Error, Result};
use lazy_static::lazy_static;
use serde::Deserialize;
use std::{fs, io::ErrorKind};
use structopt::{clap::crate_version, StructOpt};
pub(crate) const USER_AGENT: &str = concat!("lychee/", crate_version!());
const METHOD: &str = "get";
const TIMEOUT: &str = "20";
const MAX_CONCURRENCY: &str = "128";
const TIMEOUT: usize = 20;
const MAX_CONCURRENCY: usize = 128;
const MAX_REDIRECTS: usize = 10;
// this exists because structopt requires `&str` type values for defaults
// (we can't use e.g. `TIMEOUT` or `timeout()` which gets created for serde)
// NOTE(review): once the MSRV allows it, `std::sync::LazyLock` could
// replace the `lazy_static` dependency here.
lazy_static! {
    static ref TIMEOUT_STR: String = TIMEOUT.to_string();
    static ref MAX_CONCURRENCY_STR: String = MAX_CONCURRENCY.to_string();
    static ref MAX_REDIRECTS_STR: String = MAX_REDIRECTS.to_string();
}
// Macro for generating default functions to be used by serde
macro_rules! default_function {
@ -19,6 +31,15 @@ macro_rules! default_function {
};
}
// Generate the functions for serde defaults
// Each entry expands to a free function (e.g. `fn timeout() -> usize`)
// referenced by the `#[serde(default = "...")]` attributes on `Config`.
default_function! {
    max_redirects: usize = MAX_REDIRECTS;
    max_concurrency: usize = MAX_CONCURRENCY;
    user_agent: String = USER_AGENT.to_string();
    timeout: usize = TIMEOUT;
    method: String = METHOD.to_string();
}
// Macro for merging configuration values
macro_rules! fold_in {
( $cli:ident , $toml:ident ; $( $key:ident : $default:expr; )* ) => {
@ -31,11 +52,16 @@ macro_rules! fold_in {
}
#[derive(Debug, StructOpt)]
#[structopt(name = "lychee", about = "A glorious link checker")]
#[structopt(
name = "lychee",
about = "A glorious link checker.\n\nProject home page: https://github.com/lycheeverse/lychee"
)]
pub(crate) struct LycheeOptions {
/// Input files
#[structopt(default_value = "README.md")]
pub inputs: Vec<String>,
/// The inputs (where to get links to check from).
/// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`),
/// remote URLs (e.g. `https://example.com/README.md`) or standard input (`-`).
#[structopt(name = "inputs", default_value = "README.md")]
raw_inputs: Vec<String>,
/// Configuration file to use
#[structopt(short, long = "config", default_value = "./lychee.toml")]
@ -45,6 +71,19 @@ pub(crate) struct LycheeOptions {
pub config: Config,
}
impl LycheeOptions {
    // This depends on config, which is why a method is required (we could
    // accept a `Vec<Input>` in `LycheeOptions` and do the conversion there,
    // but we'd get no access to `glob_ignore_case`).
    /// Get parsed inputs from options.
    pub(crate) fn inputs(&self) -> Vec<Input> {
        self.raw_inputs
            .iter()
            .map(|s| Input::new(s, self.config.glob_ignore_case))
            .collect()
    }
}
#[derive(Debug, Deserialize, StructOpt)]
pub struct Config {
/// Verbose program output
@ -58,14 +97,14 @@ pub struct Config {
pub progress: bool,
/// Maximum number of allowed redirects
#[structopt(short, long, default_value = "10")]
#[serde(default)]
#[structopt(short, long, default_value = &MAX_REDIRECTS_STR)]
#[serde(default = "max_redirects")]
pub max_redirects: usize,
/// Maximum number of concurrent network requests
#[structopt(long, default_value = MAX_CONCURRENCY)]
#[serde(default)]
pub max_concurrency: String,
#[structopt(long, default_value = &MAX_CONCURRENCY_STR)]
#[serde(default = "max_concurrency")]
pub max_concurrency: usize,
/// Number of threads to utilize.
/// Defaults to number of cores available to the system
@ -130,9 +169,9 @@ pub struct Config {
pub accept: Option<String>,
/// Website timeout from connect to response finished
#[structopt(short, long, default_value = TIMEOUT)]
#[structopt(short, long, default_value = &TIMEOUT_STR)]
#[serde(default = "timeout")]
pub timeout: String,
pub timeout: usize,
/// Request method
// Using `-X` as a short param similar to curl
@ -140,21 +179,30 @@ pub struct Config {
#[serde(default = "method")]
pub method: String,
#[structopt(short, long, help = "Base URL to check relative URls")]
/// Base URL to check relative URLs
#[structopt(short, long)]
#[serde(default)]
pub base_url: Option<String>,
#[structopt(long, help = "Basic authentication support. Ex 'username:password'")]
/// Basic authentication support. E.g. `username:password`
#[structopt(long)]
#[serde(default)]
pub basic_auth: Option<String>,
#[structopt(
long,
help = "GitHub API token to use when checking github.com links, to avoid rate limiting",
env = "GITHUB_TOKEN"
)]
/// GitHub API token to use when checking github.com links, to avoid rate limiting
#[structopt(long, env = "GITHUB_TOKEN")]
#[serde(default)]
pub github_token: Option<String>,
/// Skip missing input files (default is to error if they don't exist)
#[structopt(long)]
#[serde(default)]
pub skip_missing: bool,
/// Ignore case when expanding filesystem path glob inputs
#[structopt(long)]
#[serde(default)]
pub glob_ignore_case: bool,
}
impl Config {
@ -178,7 +226,7 @@ impl Config {
}
/// Merge the configuration from TOML into the CLI configuration
pub(crate) fn merge(mut self, toml: Config) -> Config {
pub(crate) fn merge(&mut self, toml: Config) {
fold_in! {
// Destination and source configs
self, toml;
@ -186,7 +234,7 @@ impl Config {
// Keys with defaults to assign
verbose: false;
progress: false;
max_redirects: 10;
max_redirects: MAX_REDIRECTS;
max_concurrency: MAX_CONCURRENCY;
threads: None;
user_agent: USER_AGENT;
@ -205,15 +253,8 @@ impl Config {
base_url: None;
basic_auth: None;
github_token: None;
skip_missing: false;
glob_ignore_case: false;
}
self
}
}
// Generate the functions for serde defaults
default_function! {
user_agent: String = USER_AGENT.to_string();
timeout: String = TIMEOUT.to_string();
method: String = METHOD.to_string();
}

38
src/test_utils.rs Normal file
View file

@ -0,0 +1,38 @@
#![cfg(test)]
use http::StatusCode;
use wiremock::matchers::path;
use wiremock::{Mock, MockServer, ResponseTemplate};
// TODO: used in cli tests (as duplicate)
/// Shorthand for a mock server answering `/` with the given status code
/// and an empty body.
#[allow(unused)]
pub(crate) async fn get_mock_server<S>(response_code: S) -> MockServer
where
    S: Into<StatusCode>,
{
    get_mock_server_with_content(response_code, None).await
}
/// Start a local `MockServer` whose root path (`/`) answers with
/// `response_code` and, when given, the `content` string as the body.
pub(crate) async fn get_mock_server_with_content<S>(
    response_code: S,
    content: Option<&str>,
) -> MockServer
where
    S: Into<StatusCode>,
{
    let mock_server = MockServer::start().await;

    let mut template = ResponseTemplate::new(response_code.into());
    if let Some(body) = content {
        template = template.set_body_string(body);
    }

    Mock::given(path("/"))
        .respond_with(template)
        .mount(&mock_server)
        .await;

    mock_server
}

View file

@ -119,7 +119,7 @@ impl From<reqwest::Error> for Status {
}
/// Exclude configuration for the link checker.
/// You can ignore links based on
/// You can ignore links based on regex patterns or pre-defined IP ranges.
#[derive(Clone, Debug)]
pub struct Excludes {
pub regex: Option<RegexSet>,

View file

@ -1,20 +1,58 @@
#[cfg(test)]
mod cli {
use anyhow::Result;
use assert_cmd::Command;
use http::StatusCode;
use predicates::str::contains;
use std::path::Path;
use std::fs::File;
use std::io::Write;
use std::path::{Path, PathBuf};
use wiremock::matchers::path;
use wiremock::{Mock, MockServer, ResponseTemplate};
/// Build an `assert_cmd::Command` for the crate's main binary.
fn main_command() -> Command {
    // this gets the "main" binary name (e.g. `lychee`)
    Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name")
}
/// Path to the test fixtures directory.
// NOTE(review): `module_path!()` yields the Rust module path, not a file
// path; this resolves relative to the working directory — confirm it
// matches where the fixtures actually live.
fn fixtures_path() -> PathBuf {
    Path::new(module_path!()).parent().unwrap().join("fixtures")
}
// TODO: duplicate of test_utils
/// Mock server answering `/` with the given status code and no body.
async fn get_mock_server<S>(response_code: S) -> MockServer
where
    S: Into<StatusCode>,
{
    get_mock_server_with_content(response_code, None).await
}
/// Mock server answering `/` with the given status code and optional body.
async fn get_mock_server_with_content<S>(response_code: S, content: Option<&str>) -> MockServer
where
    S: Into<StatusCode>,
{
    let mock_server = MockServer::start().await;
    let template = ResponseTemplate::new(response_code.into());
    // Only attach a body when content was supplied
    let template = if let Some(s) = content {
        template.set_body_string(s)
    } else {
        template
    };

    Mock::given(path("/"))
        .respond_with(template)
        .mount(&mock_server)
        .await;

    mock_server
}
#[test]
fn test_exclude_all_private() {
// this gets the "main" binary name (e.g. `lychee`)
let mut cmd =
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name");
let mut cmd = main_command();
let test_all_private_path = Path::new(module_path!())
.parent()
.unwrap()
.join("fixtures")
.join("TEST_ALL_PRIVATE.md");
let test_all_private_path = fixtures_path().join("TEST_ALL_PRIVATE.md");
// assert that the command runs OK, and that it excluded all the links
cmd.arg("--exclude-all-private")
@ -31,14 +69,8 @@ mod cli {
/// Test that a GitHub link can be checked without specifying the token.
#[test]
fn test_check_github_no_token() {
let mut cmd =
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name");
let test_github_path = Path::new(module_path!())
.parent()
.unwrap()
.join("fixtures")
.join("TEST_GITHUB.md");
let mut cmd = main_command();
let test_github_path = fixtures_path().join("TEST_GITHUB.md");
cmd.arg("--verbose")
.arg(test_github_path)
@ -50,30 +82,27 @@ mod cli {
.stdout(contains("Errors: 0"));
}
#[test]
fn test_failure_404_link() {
let mut cmd =
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name");
#[tokio::test]
async fn test_failure_404_link() {
let mut cmd = main_command();
let mock_server = get_mock_server(http::StatusCode::NOT_FOUND).await;
let dir = tempfile::tempdir().expect("Failed to create tempdir");
let file_path = dir.path().join("test.txt");
let mut file = File::create(&file_path).expect("Failed to create tempfile");
let test_404_path = Path::new(module_path!())
.parent()
.unwrap()
.join("fixtures")
.join("TEST_404.md");
writeln!(file, "{}", mock_server.uri()).expect("Failed to write to file");
cmd.arg(test_404_path).assert().failure().code(2);
cmd.arg(file_path)
.write_stdin(mock_server.uri())
.assert()
.failure()
.code(2);
}
#[test]
fn test_failure_github_404_no_token() {
let mut cmd =
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name");
let test_github_404_path = Path::new(module_path!())
.parent()
.unwrap()
.join("fixtures")
.join("TEST_GITHUB_404.md");
let mut cmd = main_command();
let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md");
cmd.arg(test_github_404_path)
.env_clear()
@ -83,4 +112,136 @@ mod cli {
.stdout(contains("https://github.com/mre/idiomatic-rust-doesnt-exist-man \
(GitHub token not specified. To check GitHub links reliably, use `--github-token` flag / `GITHUB_TOKEN` env var.)"));
}
#[tokio::test]
async fn test_stdin_input() {
    let mut cmd = main_command();
    let mock_server = get_mock_server(http::StatusCode::OK).await;

    // `-` as the input argument makes lychee read link sources from stdin
    cmd.arg("-")
        .write_stdin(mock_server.uri())
        .assert()
        .success();
}
#[tokio::test]
async fn test_stdin_input_failure() {
    let mut cmd = main_command();
    // a 500 response must surface as a link-check failure (exit code 2)
    let mock_server = get_mock_server(http::StatusCode::INTERNAL_SERVER_ERROR).await;

    cmd.arg("-")
        .write_stdin(mock_server.uri())
        .assert()
        .failure()
        .code(2);
}
#[tokio::test]
async fn test_stdin_input_multiple() {
    let mut cmd = main_command();
    let mock_server_a = get_mock_server(http::StatusCode::OK).await;
    let mock_server_b = get_mock_server(http::StatusCode::OK).await;

    // this behavior (treating multiple `-` as separate inputs) is the same as most CLI tools
    // that accept `-` as stdin, e.g. `cat`, `bat`, `grep` etc.
    // NOTE(review): the second `write_stdin` call may replace the first
    // buffer rather than append — confirm against assert_cmd's docs.
    cmd.arg("-")
        .arg("-")
        .write_stdin(mock_server_a.uri())
        .write_stdin(mock_server_b.uri())
        .assert()
        .success();
}
#[test]
fn test_missing_file_error() {
    let mut cmd = main_command();
    // random filename so the test can never collide with a real file
    let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string());

    // a missing input is a config/runtime error: exit code 1, message on stderr
    cmd.arg(&filename)
        .assert()
        .failure()
        .code(1)
        .stderr(contains(format!(
            "Error: Failed to read file: `{}`",
            filename
        )));
}
#[test]
fn test_missing_file_ok_if_skip_missing() {
    let mut cmd = main_command();
    let filename = format!("non-existing-file-{}", uuid::Uuid::new_v4().to_string());

    // `--skip-missing` downgrades a missing input from an error to a no-op
    cmd.arg(&filename).arg("--skip-missing").assert().success();
}
#[tokio::test]
async fn test_glob() -> Result<()> {
    // using Result to be able to use `?`
    let mut cmd = main_command();

    // two files, one (mocked) link each; `*.md` must pick up both
    let dir = tempfile::tempdir()?;
    let mock_server_a = get_mock_server(http::StatusCode::OK).await;
    let mock_server_b = get_mock_server(http::StatusCode::OK).await;
    let mut file_a = File::create(dir.path().join("a.md"))?;
    let mut file_b = File::create(dir.path().join("b.md"))?;

    writeln!(file_a, "{}", mock_server_a.uri().as_str())?;
    writeln!(file_b, "{}", mock_server_b.uri().as_str())?;

    cmd.arg(dir.path().join("*.md"))
        .arg("--verbose")
        .assert()
        .success()
        .stdout(contains("Total: 2"));

    Ok(())
}
#[cfg(target_os = "linux")] // MacOS and Windows have case-insensitive filesystems
#[tokio::test]
async fn test_glob_ignore_case() -> Result<()> {
    let mut cmd = main_command();

    // `README.md` and `readme.md` differ only in case; with
    // `--glob-ignore-case` the `[r]eadme.md` pattern must match both
    let dir = tempfile::tempdir()?;
    let mock_server_a = get_mock_server(http::StatusCode::OK).await;
    let mock_server_b = get_mock_server(http::StatusCode::OK).await;
    let mut file_a = File::create(dir.path().join("README.md"))?;
    let mut file_b = File::create(dir.path().join("readme.md"))?;

    writeln!(file_a, "{}", mock_server_a.uri().as_str())?;
    writeln!(file_b, "{}", mock_server_b.uri().as_str())?;

    cmd.arg(dir.path().join("[r]eadme.md"))
        .arg("--verbose")
        .arg("--glob-ignore-case")
        .assert()
        .success()
        .stdout(contains("Total: 2"));

    Ok(())
}
#[tokio::test]
async fn test_glob_recursive() -> Result<()> {
    let mut cmd = main_command();

    // file lives two directories deep; only a recursive glob can reach it
    let dir = tempfile::tempdir()?;
    let subdir_level_1 = tempfile::tempdir_in(&dir)?;
    let subdir_level_2 = tempfile::tempdir_in(&subdir_level_1)?;

    let mock_server = get_mock_server(http::StatusCode::OK).await;
    let mut file = File::create(subdir_level_2.path().join("test.md"))?;

    writeln!(file, "{}", mock_server.uri().as_str())?;

    // ** should be a recursive glob
    cmd.arg(dir.path().join("**/*.md"))
        .arg("--verbose")
        .assert()
        .success()
        .stdout(contains("Total: 1"));

    Ok(())
}
}