diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bb1b445..c56d92f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -43,17 +43,17 @@ jobs: fail-fast: false steps: - name: Install musl tools - if: contains(matrix.target, 'musl') + if: ${{ contains(matrix.target, 'musl') }} run: sudo apt-get install -y musl-tools - name: Install arm tools - if: contains(matrix.target, 'arm') + if: ${{ contains(matrix.target, 'arm') }} run: | echo "GNU_PREFIX=arm-linux-gnueabihf-" >> $GITHUB_ENV sudo apt-get install -y binutils-arm-linux-gnueabihf - name: Install aarch64 tools - if: contains(matrix.target, 'aarch64') + if: ${{ contains(matrix.target, 'aarch64') }} run: | echo "GNU_PREFIX=aarch64-linux-gnu-" >> $GITHUB_ENV sudo apt-get install -y binutils-aarch64-linux-gnu diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7404bc1..4b7369f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -56,7 +56,7 @@ jobs: - run: cargo-publish-all --dry-run publish: - if: startsWith(github.ref, 'refs/tags/') + if: ${{ startsWith(github.ref, 'refs/tags/') }} needs: - test - lint diff --git a/Cargo.lock b/Cargo.lock index 9f4a90c..d13478f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1406,8 +1406,11 @@ dependencies = [ "http", "hubcaps", "linkify", + "log", "markup5ever_rcdom", "openssl-sys", + "path-clean", + "percent-encoding", "pretty_assertions", "pulldown-cmark", "regex", @@ -1732,6 +1735,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "path-clean" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd" + [[package]] name = "pem" version = "0.8.3" diff --git a/Makefile b/Makefile index f794614..a0b985d 100644 --- a/Makefile +++ b/Makefile @@ -18,10 +18,14 @@ docker-run: ## Run Docker image docker-push: ## Push image to Docker Hub docker push $(IMAGE_NAME) -.PHONY: 
build-local +.PHONY: build build: ## Build Rust code locally cargo build +.PHONY: install +install: ## Install project locally + cargo install --path lychee-bin + .PHONY: run run: ## Run Rust code locally cargo run diff --git a/README.md b/README.md index b15bc58..587ae39 100644 --- a/README.md +++ b/README.md @@ -161,11 +161,15 @@ lychee ~/projects/*/README.md # check links in local files (lychee supports advanced globbing and ~ expansion): lychee "~/projects/big_project/**/README.*" + # ignore case when globbing and check result for each link: lychee --glob-ignore-case --verbose "~/projects/**/[r]eadme.*" # check links from epub file (requires atool: https://www.nongnu.org/atool) acat -F zip {file.epub} "*.xhtml" "*.html" | lychee - + +# check links in directory; block network requests +lychee --offline path/to/directory ``` ### GitHub token @@ -202,6 +206,7 @@ FLAGS: -i, --insecure Proceed for server connections considered insecure (invalid TLS) -n, --no-progress Do not show progress bar. This is recommended for non-interactive shells (e.g. for continuous integration) + --offline Only check local files and block network requests --require-https When HTTPS is available, treat HTTP links as errors --skip-missing Skip missing input files (default is to error if they don't exist) -V, --version Prints version information @@ -209,7 +214,8 @@ FLAGS: OPTIONS: -a, --accept Comma-separated list of accepted status codes for valid links - -b, --base-url Base URL to check relative URLs + -b, --base Base URL or website root directory to check relative URLs e.g. + https://example.org or `/path/to/public` --basic-auth Basic authentication support. E.g. `username:password` -c, --config Configuration file to use [default: ./lychee.toml] --exclude ... 
Exclude URLs from checking (supports regex) @@ -310,7 +316,8 @@ Try one of these links to get started: - [good first issues](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) - [help wanted](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) -Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started. Begin my making sure the following commands succeed without errors. +Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started. +Begin by making sure the following commands succeed without errors. ```bash cargo test # runs tests diff --git a/examples/collect_links/collect_links.rs b/examples/collect_links/collect_links.rs index fbff5f5..60c37f9 100644 --- a/examples/collect_links/collect_links.rs +++ b/examples/collect_links/collect_links.rs @@ -14,12 +14,12 @@ async fn main() -> Result<()> { ]; let links = Collector::new( - None, // base - false, // don't skip missing inputs - 10, // max concurrency - ) - .collect_links( - inputs, // base_url - ) - .await?; 
-![Logo](awesome.png) +Check file link +![Logo](../assets/banner.svg) ![Anchors should be ignored](#awesome) diff --git a/fixtures/TEST_SCHEMES.txt b/fixtures/TEST_SCHEMES.txt index 29ab3b5..47a061e 100644 --- a/fixtures/TEST_SCHEMES.txt +++ b/fixtures/TEST_SCHEMES.txt @@ -1,3 +1,3 @@ slack://channel?id=123 -file://foo/bar +file:///test_folder/test_file https://example.org diff --git a/fixtures/offline/404.html/.gitkeep b/fixtures/offline/404.html/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/offline/about/index.html b/fixtures/offline/about/index.html new file mode 100644 index 0000000..1121b0b --- /dev/null +++ b/fixtures/offline/about/index.html @@ -0,0 +1,21 @@ + + + About + + +

About

+

+

+

+ + \ No newline at end of file diff --git a/fixtures/offline/another page/.gitkeep b/fixtures/offline/another page/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/offline/blog/post1/index.html b/fixtures/offline/blog/post1/index.html new file mode 100644 index 0000000..9112922 --- /dev/null +++ b/fixtures/offline/blog/post1/index.html @@ -0,0 +1,21 @@ + + + Post 2 + + +

Post 2 Title

+

+

+

+ + \ No newline at end of file diff --git a/fixtures/offline/blog/post2/index.html b/fixtures/offline/blog/post2/index.html new file mode 100644 index 0000000..514ac4e --- /dev/null +++ b/fixtures/offline/blog/post2/index.html @@ -0,0 +1,18 @@ + + + Post 1 + + +

Post 1 Title

+

+

+

+ + \ No newline at end of file diff --git a/fixtures/offline/index.html b/fixtures/offline/index.html new file mode 100644 index 0000000..8594d3f --- /dev/null +++ b/fixtures/offline/index.html @@ -0,0 +1,27 @@ + + + Index + + +

Index Title

+

+

+

+ + \ No newline at end of file diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index f417538..6c115e3 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -70,10 +70,7 @@ use anyhow::{anyhow, Context, Result}; use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName}; use http::StatusCode; use indicatif::{ProgressBar, ProgressStyle}; -use lychee_lib::{ - collector::{Collector, Input}, - ClientBuilder, ClientPool, Response, -}; +use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Response}; use openssl_sys as _; // required for vendored-openssl feature use regex::RegexSet; use ring as _; // required for apple silicon @@ -178,6 +175,13 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { let include = RegexSet::new(&cfg.include)?; let exclude = RegexSet::new(&cfg.exclude)?; + // Offline mode overrides the scheme + let schemes = if cfg.offline { + vec!["file".to_string()] + } else { + cfg.scheme.clone() + }; + let client = ClientBuilder::builder() .includes(include) .excludes(exclude) @@ -193,14 +197,14 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { .method(method) .timeout(timeout) .github_token(cfg.github_token.clone()) - .schemes(HashSet::from_iter(cfg.scheme.clone())) + .schemes(HashSet::from_iter(schemes)) .accepted(accepted) .require_https(cfg.require_https) .build() .client() .map_err(|e| anyhow!(e))?; - let links = Collector::new(cfg.base_url.clone(), cfg.skip_missing, max_concurrency) + let links = Collector::new(cfg.base.clone(), cfg.skip_missing, max_concurrency) .collect_links(&inputs) .await .map_err(|e| anyhow!(e))?; diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ce8d32d..547e2bb 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -1,9 +1,8 @@ -use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr}; +use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr}; use anyhow::{anyhow, Error, Result}; use 
lazy_static::lazy_static; -use lychee_lib::collector::Input; -use reqwest::Url; +use lychee_lib::{Base, Input}; use serde::Deserialize; use structopt::{clap::crate_version, StructOpt}; @@ -76,6 +75,10 @@ macro_rules! fold_in { }; } +fn parse_base(src: &str) -> Result { + Base::try_from(src) +} + #[derive(Debug, StructOpt)] #[structopt( name = "lychee", @@ -161,6 +164,11 @@ pub(crate) struct Config { #[serde(default)] pub(crate) scheme: Vec, + /// Only check local files and block network requests. + #[structopt(long)] + #[serde(default)] + pub(crate) offline: bool, + /// URLs to check (supports regex). Has preference over all excludes. #[structopt(long)] #[serde(default)] @@ -223,10 +231,11 @@ pub(crate) struct Config { #[serde(default = "method")] pub(crate) method: String, - /// Base URL to check relative URLs - #[structopt(short, long, parse(try_from_str))] + /// Base URL or website root directory to check relative URLs + /// e.g. https://example.org or `/path/to/public` + #[structopt(short, long, parse(try_from_str = parse_base))] #[serde(default)] - pub(crate) base_url: Option, + pub(crate) base: Option, /// Basic authentication support. E.g. `username:password` #[structopt(long)] @@ -311,7 +320,7 @@ impl Config { accept: None; timeout: TIMEOUT; method: METHOD; - base_url: None; + base: None; basic_auth: None; github_token: None; skip_missing: false; diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 7cf4633..06d6062 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -133,15 +133,38 @@ mod cli { /// Test unsupported URI schemes #[test] - fn test_unsupported_uri_schemes() -> Result<()> { - test_json_output!( - "TEST_SCHEMES.txt", - MockResponseStats { - total: 1, - successful: 1, - ..MockResponseStats::default() - } - ) + fn test_unsupported_uri_schemes() { + let mut cmd = main_command(); + let test_schemes_path = fixtures_path().join("TEST_SCHEMES.txt"); + + // Exclude file link because it doesn't exist on the filesystem. 
+ // (File URIs are absolute paths, which we don't have.) + // Nevertheless, the `file` scheme should be recognized. + cmd.arg(test_schemes_path) + .arg("--exclude") + .arg("file://") + .env_clear() + .assert() + .success() + .stdout(contains("Total............2")) + .stdout(contains("Successful.......1")) + .stdout(contains("Excluded.........1")); + } + + #[test] + fn test_resolve_paths() { + let mut cmd = main_command(); + let offline_dir = fixtures_path().join("offline"); + + cmd.arg("--offline") + .arg("--base") + .arg(&offline_dir) + .arg(&offline_dir.join("index.html")) + .env_clear() + .assert() + .success() + .stdout(contains("Total............3")) + .stdout(contains("Successful.......3")); } #[test] @@ -367,7 +390,7 @@ mod cli { .assert() .success(); - let expected = r#"{"total":10,"successful":10,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#; + let expected = r#"{"total":11,"successful":11,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#; let output = fs::read_to_string(&outfile)?; assert_eq!(output.split_whitespace().collect::(), expected); fs::remove_file(outfile)?; @@ -385,7 +408,7 @@ mod cli { .arg(".*") .assert() .success() - .stdout(contains("Excluded........10")); + .stdout(contains("Excluded........11")); Ok(()) } diff --git a/lychee-bin/tests/local_files.rs b/lychee-bin/tests/local_files.rs new file mode 100644 index 0000000..11574e1 --- /dev/null +++ b/lychee-bin/tests/local_files.rs @@ -0,0 +1,36 @@ +#[cfg(test)] +mod cli { + use std::{fs::File, io::Write}; + + use assert_cmd::Command; + use lychee_lib::Result; + use predicates::str::contains; + + fn main_command() -> Command { + // this gets the "main" binary name (e.g. 
`lychee`) + Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name") + } + + #[tokio::test] + async fn test_local_file() -> Result<()> { + let dir = tempfile::tempdir()?; + let index_path = dir.path().join("index.html"); + let mut index = File::create(&index_path)?; + writeln!(index, r#"Foo"#)?; + + let foo_path = dir.path().join("foo.html"); + File::create(&foo_path)?; + + let mut cmd = main_command(); + cmd.arg(index_path) + .arg("--no-progress") + .arg("--verbose") + .env_clear() + .assert() + .success() + .stdout(contains("Total............1")) + .stdout(contains("foo.html")); + + Ok(()) + } +} diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index cc5dd6d..e31bd4a 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -40,6 +40,9 @@ shellexpand = "2.1.0" tokio = { version = "1.6.0", features = ["full"] } typed-builder = "0.9.1" url = { version = "2.2.2", features = ["serde"] } +log = "0.4.14" +path-clean = "0.1.0" +percent-encoding = "2.1.0" [dev-dependencies] doc-comment = "0.3.3" diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 5580854..4c7d8ec 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -20,8 +20,7 @@ use typed_builder::TypedBuilder; use crate::{ filter::{Excludes, Filter, Includes}, quirks::Quirks, - uri::Uri, - ErrorKind, Request, Response, Result, Status, + ErrorKind, Request, Response, Result, Status, Uri, }; const DEFAULT_MAX_REDIRECTS: usize = 5; @@ -178,6 +177,8 @@ impl Client { let Request { uri, source } = Request::try_from(request)?; let status = if self.filter.is_excluded(&uri) { Status::Excluded + } else if uri.is_file() { + self.check_file(&uri).await } else if uri.is_mail() { self.check_mail(&uri).await } else { @@ -255,6 +256,15 @@ impl Client { } } + pub async fn check_file(&self, uri: &Uri) -> Status { + if let Ok(path) = uri.url.to_file_path() { + if path.exists() { + return Status::Ok(StatusCode::OK); + } + } + 
ErrorKind::InvalidFilePath(uri.clone()).into() + } + pub async fn check_mail(&self, uri: &Uri) -> Status { let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]); let result = &(check_email(&input).await)[0]; @@ -284,11 +294,13 @@ where mod test { use std::{ convert::TryInto, + fs::File, time::{Duration, Instant}, }; use http::{header::HeaderMap, StatusCode}; use reqwest::header; + use tempfile::tempdir; use super::ClientBuilder; use crate::{mock_server, test_utils::get_mock_client_response, Uri}; @@ -373,6 +385,17 @@ mod test { assert!(res.status().is_success()); } + #[tokio::test] + async fn test_file() { + let dir = tempdir().unwrap(); + let file = dir.path().join("temp"); + File::create(file).unwrap(); + let uri = format!("file://{}", dir.path().join("temp").to_str().unwrap()); + + let res = get_mock_client_response(uri).await; + assert!(res.status().is_success()); + } + #[tokio::test] async fn test_custom_headers() { // See https://github.com/rust-lang/crates.io/issues/788 diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 67fb090..b5e69d9 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,223 +1,10 @@ -use std::{ - collections::HashSet, - fmt::Display, - path::{Path, PathBuf}, -}; - -use glob::glob_with; -use reqwest::Url; -use serde::Serialize; -use shellexpand::tilde; -use tokio::{ - fs::read_to_string, - io::{stdin, AsyncReadExt}, -}; - -use crate::{ - extract::{extract_links, FileType}, - uri::Uri, - Request, Result, -}; - -const STDIN: &str = "-"; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[non_exhaustive] -/// An exhaustive list of input sources, which lychee accepts -pub enum Input { - /// URL (of HTTP/HTTPS scheme). - RemoteUrl(Box), - /// Unix shell-style glob pattern. - FsGlob { - /// The glob pattern matching all input files - pattern: String, - /// Don't be case sensitive when matching files against a glob - ignore_case: bool, - }, - /// File path. 
- FsPath(PathBuf), - /// Standard Input. - Stdin, - /// Raw string input. - String(String), -} - -impl Serialize for Input { - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - serializer.collect_str(self) - } -} - -impl Display for Input { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(match self { - Input::RemoteUrl(url) => url.as_str(), - Input::FsGlob { pattern, .. } => pattern, - Input::FsPath(path) => path.to_str().unwrap_or_default(), - Input::Stdin => "stdin", - Input::String(_) => "raw input string", - }) - } -} - -#[derive(Debug)] -/// Encapsulates the content for a given input -pub struct InputContent { - /// Input source - pub input: Input, - /// File type of given input - pub file_type: FileType, - /// Raw UTF-8 string content - pub content: String, -} - -impl InputContent { - #[must_use] - /// Create an instance of `InputContent` from an input string - pub fn from_string(s: &str, file_type: FileType) -> Self { - // TODO: consider using Cow (to avoid one .clone() for String types) - Self { - input: Input::String(s.to_owned()), - file_type, - content: s.to_owned(), - } - } -} - -impl Input { - #[must_use] - /// Construct a new `Input` source. 
In case the input is a `glob` pattern, - /// `glob_ignore_case` decides whether matching files against the `glob` is - /// case-insensitive or not - pub fn new(value: &str, glob_ignore_case: bool) -> Self { - if value == STDIN { - Self::Stdin - } else if let Ok(url) = Url::parse(value) { - Self::RemoteUrl(Box::new(url)) - } else { - // this seems to be the only way to determine if this is a glob pattern - let is_glob = glob::Pattern::escape(value) != value; - - if is_glob { - Self::FsGlob { - pattern: value.to_owned(), - ignore_case: glob_ignore_case, - } - } else { - Self::FsPath(value.into()) - } - } - } - - #[allow(clippy::missing_panics_doc)] - /// Retrieve the contents from the input - /// - /// # Errors - /// - /// Returns an error if the contents can not be retrieved - /// because of an underlying I/O error (e.g. an error while making a - /// network request or retrieving the contents from the file system) - pub async fn get_contents( - &self, - file_type_hint: Option, - skip_missing: bool, - ) -> Result> { - match *self { - // TODO: should skip_missing also affect URLs? 
- Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]), - Input::FsGlob { - ref pattern, - ignore_case, - } => Ok(Self::glob_contents(pattern, ignore_case).await?), - Input::FsPath(ref path) => { - let content = Self::path_content(path).await; - match content { - Ok(input_content) => Ok(vec![input_content]), - Err(_) if skip_missing => Ok(vec![]), - Err(e) => Err(e), - } - } - Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]), - Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]), - } - } - - async fn url_contents(url: &Url) -> Result { - // Assume HTML for default paths - let file_type = if url.path().is_empty() || url.path() == "/" { - FileType::Html - } else { - FileType::from(url.as_str()) - }; - - let res = reqwest::get(url.clone()).await?; - let input_content = InputContent { - input: Input::RemoteUrl(Box::new(url.clone())), - file_type, - content: res.text().await?, - }; - - Ok(input_content) - } - - async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result> { - let mut contents = vec![]; - let glob_expanded = tilde(&path_glob); - let mut match_opts = glob::MatchOptions::new(); - - match_opts.case_sensitive = !ignore_case; - - for entry in glob_with(&glob_expanded, match_opts)? 
{ - match entry { - Ok(path) => { - let content = Self::path_content(&path).await?; - contents.push(content); - } - Err(e) => println!("{:?}", e), - } - } - - Ok(contents) - } - - async fn path_content + AsRef + Clone>(path: P) -> Result { - let content = read_to_string(&path) - .await - .map_err(|e| (path.clone().into(), e))?; - let input_content = InputContent { - file_type: FileType::from(path.as_ref()), - content, - input: Input::FsPath(path.into()), - }; - - Ok(input_content) - } - - async fn stdin_content(file_type_hint: Option) -> Result { - let mut content = String::new(); - let mut stdin = stdin(); - stdin.read_to_string(&mut content).await?; - - let input_content = InputContent { - input: Input::Stdin, - file_type: file_type_hint.unwrap_or_default(), - content, - }; - - Ok(input_content) - } - - fn string_content(s: &str, file_type_hint: Option) -> InputContent { - InputContent::from_string(s, file_type_hint.unwrap_or_default()) - } -} +use crate::{extract::extract_links, Base, Input, Request, Result, Uri}; +use std::collections::HashSet; /// Collector keeps the state of link collection #[derive(Debug, Clone)] pub struct Collector { - base_url: Option, + base: Option, skip_missing_inputs: bool, max_concurrency: usize, cache: HashSet, @@ -226,9 +13,9 @@ pub struct Collector { impl Collector { /// Create a new collector with an empty cache #[must_use] - pub fn new(base_url: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { + pub fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { Collector { - base_url, + base, skip_missing_inputs, max_concurrency, cache: HashSet::new(), @@ -236,7 +23,8 @@ impl Collector { } /// Fetch all unique links from a slice of inputs - /// All relative URLs get prefixed with `base_url` if given. + /// All relative URLs get prefixed with `base` if given. 
+ /// (This can be a directory or a base URL) /// /// # Errors /// @@ -263,9 +51,9 @@ impl Collector { while let Some(result) = contents_rx.recv().await { for input_content in result? { - let base_url = self.base_url.clone(); + let base = self.base.clone(); let handle = - tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url)); + tokio::task::spawn_blocking(move || extract_links(&input_content, &base)); extract_links_handles.push(handle); } } @@ -278,7 +66,7 @@ impl Collector { for handle in extract_links_handles { let new_links = handle.await?; - links.extend(new_links); + links.extend(new_links?); } // Filter out already cached links (duplicates) @@ -304,9 +92,9 @@ mod test { use super::*; use crate::{ - extract::FileType, mock_server, test_utils::{mail, website}, + types::{FileType, Input}, Result, Uri, }; diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 32f2444..fcb4bcc 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -1,66 +1,62 @@ -use std::{collections::HashSet, convert::TryFrom, path::Path}; +use std::{collections::HashSet, convert::TryFrom, path::Path, path::PathBuf}; use html5ever::{ parse_document, tendril::{StrTendril, TendrilSink}, }; -use linkify::LinkFinder; +use log::info; use markup5ever_rcdom::{Handle, NodeData, RcDom}; +use percent_encoding::percent_decode_str; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; -use url::Url; +use reqwest::Url; -use crate::{collector::InputContent, Request, Uri}; +use crate::{ + helpers::{path, url}, + types::{FileType, InputContent}, + Base, ErrorKind, Input, Request, Result, Uri, +}; -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -/// `FileType` defines which file types lychee can handle -pub enum FileType { - /// File in HTML format - Html, - /// File in Markdown format - Markdown, - /// Generic text file without syntax-specific parsing - Plaintext, -} +/// Main entrypoint for extracting links from various sources +/// (Markdown, HTML, and 
plaintext) +pub(crate) fn extract_links( + input_content: &InputContent, + base: &Option, +) -> Result> { + let links = match input_content.file_type { + FileType::Markdown => extract_links_from_markdown(&input_content.content), + FileType::Html => extract_links_from_html(&input_content.content), + FileType::Plaintext => extract_links_from_plaintext(&input_content.content), + }; -impl Default for FileType { - fn default() -> Self { - Self::Plaintext + // Only keep legit URLs. For example this filters out anchors. + let mut requests: HashSet = HashSet::new(); + for link in links { + let req = if let Ok(uri) = Uri::try_from(link.as_str()) { + Request::new(uri, input_content.input.clone()) + } else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) { + Request::new(Uri { url }, input_content.input.clone()) + } else if let Input::FsPath(root) = &input_content.input { + if url::is_anchor(&link) { + // Silently ignore anchor links for now + continue; + } + let url = create_uri_from_path(root, base, &link)?; + Request::new(Uri { url }, input_content.input.clone()) + } else { + info!("Handling of {} not implemented yet", &link); + continue; + }; + requests.insert(req); } + Ok(requests) } -impl> From

for FileType { - /// Detect if the given path points to a Markdown, HTML, or plaintext file. - fn from(p: P) -> FileType { - let path = p.as_ref(); - // Assume HTML in case of no extension. - // Note: this is only reasonable for URLs; not paths on disk. - // For example, `README` without an extension is more likely to be a plaintext file. - // A better solution would be to also implement `From for FileType`. - // Unfortunately that's not possible without refactoring, as - // `AsRef` could be implemented for `Url` in the future, which is why - // `From for FileType` is not allowed. - match path.extension().and_then(std::ffi::OsStr::to_str) { - Some("md" | "markdown") => FileType::Markdown, - Some("htm" | "html") | None => FileType::Html, - Some(_) => FileType::Plaintext, - } - } -} - -// Use LinkFinder here to offload the actual link searching in plaintext. -fn find_links(input: &str) -> Vec { - let finder = LinkFinder::new(); - finder.links(input).collect() -} - -/// Extract unparsed URL strings from a markdown string. +/// Extract unparsed URL strings from a Markdown string. fn extract_links_from_markdown(input: &str) -> Vec { let parser = Parser::new(input); parser .flat_map(|event| match event { - MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => { - vec![url.to_string()] - } + MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => vec![url.to_string()], MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()), MDEvent::Html(html) => extract_links_from_html(&html.to_string()), _ => vec![], @@ -68,15 +64,15 @@ fn extract_links_from_markdown(input: &str) -> Vec { .collect() } -/// Extract unparsed URL strings from a HTML string. +/// Extract unparsed URL strings from an HTML string. 
/// Extract unparsed URL strings from an HTML string.
///
/// Parses the input as a full HTML document via html5ever and walks the
/// resulting DOM, collecting candidate URLs from link-bearing attributes.
fn extract_links_from_html(input: &str) -> Vec<String> {
    let tendril = StrTendril::from(input);
    let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);

    let mut urls = Vec::new();

    // We pass mutable URL references here to avoid
    // extra allocations in each recursive descent
    walk_html_links(&mut urls, &rc_dom.document);
    urls
}
/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
    url::find_links(input)
        .iter()
        .map(|l| String::from(l.as_str()))
        .collect()
}

/// Build a `file://` URL for a destination `dst` that was linked to from
/// within the file `src`, resolving relative/absolute paths against `base`.
///
/// # Errors
///
/// Returns an error if the destination is not valid UTF-8 after percent
/// decoding, if path resolution fails, or if the resolved path cannot be
/// turned into a URL.
fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Url> {
    let dst = url::remove_get_params_and_fragment(dst);
    // Avoid double-encoding already encoded destination paths by removing any
    // potential encoding (e.g. `web%20site` becomes `web site`).
    // That's because Url::from_file_path will encode the full URL in the end.
    // This behavior cannot be configured.
    // See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
    // TODO: This is not a perfect solution.
    // Ideally, only `src` and `base` should be URL encoded (as is done by
    // `from_file_path` at the moment) while `dst` is left untouched and simply
    // appended to the end.
    let decoded = percent_decode_str(dst).decode_utf8()?.to_string();
    let path = path::resolve(src, &PathBuf::from(decoded), base)?;
    Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidUrl(path))
}
a/lychee-lib/src/helpers/mod.rs b/lychee-lib/src/helpers/mod.rs new file mode 100644 index 0000000..94f2d21 --- /dev/null +++ b/lychee-lib/src/helpers/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod path; +pub(crate) mod url; diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs new file mode 100644 index 0000000..b31d522 --- /dev/null +++ b/lychee-lib/src/helpers/path.rs @@ -0,0 +1,141 @@ +use crate::{Base, ErrorKind, Result}; +use path_clean::PathClean; +use std::env; +use std::path::{Path, PathBuf}; + +// Returns the base if it is a valid `PathBuf` +fn get_base_dir(base: &Option) -> Option { + base.as_ref().and_then(Base::dir) +} + +// https://stackoverflow.com/a/54817755/270334 +pub(crate) fn absolute_path(path: impl AsRef) -> Result { + let path = path.as_ref(); + + let absolute_path = if path.is_absolute() { + path.to_path_buf() + } else { + env::current_dir()?.join(path) + } + .clean(); + + Ok(absolute_path) +} + +// Get the parent directory of a given `Path`. +fn dirname(src: &Path) -> PathBuf { + if src.is_file() { + src.to_path_buf() + .parent() + .map_or(PathBuf::new(), Path::to_path_buf) + } else { + src.to_path_buf() + } +} + +// Resolve `dst` that was linked to from within `src` +pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result { + if dst.is_relative() { + // Find `dst` in the parent directory of `src` + if let Some(parent) = src.parent() { + let rel_path = parent.join(dst.to_path_buf()); + return absolute_path(&rel_path); + } + } + if dst.is_absolute() { + // Absolute local links (leading slash) require the `base_url` to + // define the document root. + let base = get_base_dir(base).ok_or_else(|| { + ErrorKind::InvalidBase( + "".to_string(), + format!("Found absolute local link {:?} but no base directory was set. 
Set with `--base`.", dst) + ) + })?; + let abs_path = join(dirname(&base), dst); + return absolute_path(&abs_path); + } + Err(ErrorKind::FileNotFound(dst.to_path_buf())) +} + +// A cumbersome way to concatenate paths without checking their +// existence on disk. See https://github.com/rust-lang/rust/issues/16507 +fn join(base: PathBuf, dst: &Path) -> PathBuf { + let mut abs = base.into_os_string(); + let target_str = dst.as_os_str(); + abs.push(target_str); + PathBuf::from(abs) +} + +#[cfg(test)] +mod test_path { + use super::*; + use crate::Result; + + // index.html + // ./foo.html + #[test] + fn test_resolve_relative() -> Result<()> { + let dummy = PathBuf::from("index.html"); + let abs_path = PathBuf::from("./foo.html"); + assert_eq!( + resolve(&dummy, &abs_path, &None)?, + env::current_dir()?.join("foo.html") + ); + Ok(()) + } + + // ./index.html + // ./foo.html + #[test] + fn test_resolve_relative_index() -> Result<()> { + let dummy = PathBuf::from("./index.html"); + let abs_path = PathBuf::from("./foo.html"); + assert_eq!( + resolve(&dummy, &abs_path, &None)?, + env::current_dir()?.join("foo.html") + ); + Ok(()) + } + + // /path/to/index.html + // ./foo.html + #[test] + fn test_resolve_from_absolute() -> Result<()> { + let abs_index = PathBuf::from("/path/to/index.html"); + let abs_path = PathBuf::from("./foo.html"); + assert_eq!( + resolve(&abs_index, &abs_path, &None)?, + PathBuf::from("/path/to/foo.html") + ); + Ok(()) + } + + // dummy + // foo.html + // valid base dir + #[test] + fn test_resolve_absolute_from_base_dir() -> Result<()> { + let dummy = PathBuf::new(); + let abs_path = PathBuf::from("/foo.html"); + let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir"))); + assert_eq!( + resolve(&dummy, &abs_path, &base)?, + PathBuf::from("/some/absolute/base/dir/foo.html") + ); + Ok(()) + } + + // /path/to/index.html + // /other/path/to/foo.html + #[test] + fn test_resolve_absolute_from_absolute() -> Result<()> { + let abs_index = 
/// Remove all GET parameters and the fragment from a URL.
/// The link is not a URL but a String as it may not have a base domain.
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
    // Strip the fragment first, then the query string — in that order, so
    // that `?` characters occurring inside a fragment are handled correctly.
    let without_fragment = url.split('#').next().unwrap_or(url);
    without_fragment.split('?').next().unwrap_or(without_fragment)
}
+pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool { + // See a comprehensive list of attributes that might contain URLs/URIs + // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes + matches!( + (attr_name, elem_name), + ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body") + ) +} + +// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs +pub(crate) fn is_anchor(url: &str) -> bool { + url.starts_with('#') +} + +// Use `LinkFinder` to offload the raw link searching in plaintext +pub(crate) fn find_links(input: &str) -> Vec { + let finder = LinkFinder::new(); + finder.links(input).collect() +} + +#[cfg(test)] +mod test_fs_tree { + use super::*; + + #[test] + fn test_is_anchor() { + assert!(is_anchor("#anchor")); + assert!(!is_anchor("notan#anchor")); + } + + #[test] + fn test_remove_get_params_and_fragment() { + assert_eq!(remove_get_params_and_fragment("/"), "/"); + assert_eq!( + remove_get_params_and_fragment("index.html?foo=bar"), + "index.html" + ); + assert_eq!( + remove_get_params_and_fragment("/index.html?foo=bar"), + "/index.html" + ); + assert_eq!( + remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), + "/index.html" + ); + assert_eq!( + remove_get_params_and_fragment("https://example.org/index.html?foo=bar"), + "https://example.org/index.html" + ); + assert_eq!( + remove_get_params_and_fragment("test.png?foo=bar"), + "test.png" + ); + + assert_eq!( + remove_get_params_and_fragment("https://example.org/index.html#anchor"), + "https://example.org/index.html" + ); + assert_eq!( + remove_get_params_and_fragment("https://example.org/index.html?foo=bar#anchor"), + "https://example.org/index.html" + ); + assert_eq!( + remove_get_params_and_fragment("test.png?foo=bar#anchor"), + "test.png" + ); + assert_eq!( + remove_get_params_and_fragment("test.png#anchor?anchor!?"), + "test.png" + ); + assert_eq!( + 
remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"), + "test.png" + ); + } +} diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 71ba9d6..22b76f8 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -41,18 +41,18 @@ )] #![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)] #![deny(missing_docs)] +#![allow(clippy::module_name_repetitions)] #[cfg(doctest)] doc_comment::doctest!("../../README.md"); mod client; mod client_pool; -mod quirks; -mod types; -mod uri; - /// A pool of clients, to handle concurrent checks pub mod collector; +mod helpers; +mod quirks; +mod types; /// Functionality to extract URIs from inputs pub mod extract; @@ -75,8 +75,7 @@ use ring as _; // required for apple silicon pub use crate::{ client::{check, ClientBuilder}, client_pool::ClientPool, - collector::{Collector, Input}, + collector::Collector, filter::{Excludes, Filter, Includes}, - types::{ErrorKind, Request, Response, ResponseBody, Result, Status}, - uri::Uri, + types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri}, }; diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs new file mode 100644 index 0000000..affeacc --- /dev/null +++ b/lychee-lib/src/types/base.rs @@ -0,0 +1,83 @@ +use reqwest::Url; +use serde::{Deserialize, Serialize}; +use std::{convert::TryFrom, path::PathBuf}; + +use crate::ErrorKind; + +/// When encountering links without a full domain in a document, +/// the base determines where this resource can be found. +/// Both, local and remote targets are supported. 
+#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] +#[allow(variant_size_differences)] +pub enum Base { + /// Local file path pointing to root directory + Local(PathBuf), + /// Remote URL pointing to a website homepage + Remote(Url), +} + +impl Base { + /// Join link with base url + #[must_use] + pub fn join(&self, link: &str) -> Option { + match self { + Self::Remote(url) => url.join(link).ok(), + Self::Local(_) => None, + } + } + + /// Return the directory if the base is local + #[must_use] + pub fn dir(&self) -> Option { + match self { + Self::Remote(_) => None, + Self::Local(d) => Some(d.clone()), + } + } +} + +impl TryFrom<&str> for Base { + type Error = ErrorKind; + + fn try_from(value: &str) -> Result { + if let Ok(url) = Url::parse(value) { + if url.cannot_be_a_base() { + return Err(ErrorKind::InvalidBase( + value.to_string(), + "The given URL cannot be a base".to_string(), + )); + } + return Ok(Self::Remote(url)); + } + Ok(Self::Local(PathBuf::from(value))) + } +} + +#[cfg(test)] +mod test_base { + use crate::Result; + + use super::*; + + #[test] + fn test_valid_remote() -> Result<()> { + let base = Base::try_from("https://endler.dev")?; + assert_eq!( + base, + Base::Remote(Url::parse("https://endler.dev").unwrap()) + ); + Ok(()) + } + + #[test] + fn test_invalid_url() { + assert!(Base::try_from("data:text/plain,Hello?World#").is_err()); + } + + #[test] + fn test_valid_local() -> Result<()> { + let dir = tempfile::tempdir()?; + Base::try_from(dir.as_ref().to_str().unwrap())?; + Ok(()) + } +} diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 0710f5e..4a76141 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -10,21 +10,32 @@ use crate::Uri; #[derive(Debug)] #[non_exhaustive] pub enum ErrorKind { - // TODO: maybe need to be splitted; currently first slot is Some only for reading files + // TODO: maybe needs to be split; currently first element is `Some` only for + // reading files /// 
Any form of I/O error occurred while reading from a given path. IoError(Option, std::io::Error), + /// Errors which can occur when attempting to interpret a sequence of u8 as a string + Utf8Error(std::str::Utf8Error), /// Network error when trying to connect to an endpoint via reqwest. ReqwestError(reqwest::Error), /// Network error when trying to connect to an endpoint via hubcaps. HubcapsError(hubcaps::Error), - /// The given string can not be parsed into a valid URL or e-mail address + /// The given string can not be parsed into a valid URL, e-mail address, or file path UrlParseError(String, (url::ParseError, Option)), + /// The given URI cannot be converted to a file path + InvalidFilePath(Uri), + /// The given path cannot be converted to a URI + InvalidUrl(PathBuf), /// The given mail address is unreachable UnreachableEmailAddress(Uri), /// The given header could not be parsed. /// A possible error when converting a `HeaderValue` from a string or byte /// slice. InvalidHeader(InvalidHeaderValue), + /// The given string can not be parsed into a valid base URL or base directory + InvalidBase(String, String), + /// Cannot find local file + FileNotFound(PathBuf), /// The given UNIX glob pattern is invalid InvalidGlobPattern(glob::PatternError), /// The Github API could not be called because of a missing Github token. 
@@ -63,8 +74,14 @@ impl Hash for ErrorKind { Self::IoError(p, e) => (p, e.kind()).hash(state), Self::ReqwestError(e) => e.to_string().hash(state), Self::HubcapsError(e) => e.to_string().hash(state), + Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), - Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state), + Self::InvalidUrl(p) => p.hash(state), + Self::Utf8Error(e) => e.to_string().hash(state), + Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => { + u.hash(state); + } + Self::InvalidBase(base, e) => (base, e).hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), Self::MissingGitHubToken => std::mem::discriminant(self).hash(state), @@ -84,6 +101,7 @@ impl Display for ErrorKind { Self::IoError(None, e) => e.fmt(f), Self::ReqwestError(e) => e.fmt(f), Self::HubcapsError(e) => e.fmt(f), + Self::FileNotFound(e) => write!(f, "{}", e.to_string_lossy()), Self::UrlParseError(s, (url_err, Some(mail_err))) => { write!( f, @@ -94,6 +112,8 @@ impl Display for ErrorKind { Self::UrlParseError(s, (url_err, None)) => { write!(f, "Cannot parse {} as website url ({})", s, url_err) } + Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u), + Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()), Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri), Self::InvalidHeader(e) => e.fmt(f), Self::InvalidGlobPattern(e) => e.fmt(f), @@ -106,6 +126,8 @@ impl Display for ErrorKind { "This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead", uri ), + Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e), + Self::Utf8Error(e) => e.fmt(f), } } } @@ -125,6 +147,12 @@ impl From<(PathBuf, std::io::Error)> for ErrorKind { } } +impl From for ErrorKind { + fn from(e: std::str::Utf8Error) -> Self { + 
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// `FileType` defines which file types lychee can handle
pub enum FileType {
    /// File in HTML format
    Html,
    /// File in Markdown format
    Markdown,
    /// Generic text file without syntax-specific parsing
    Plaintext,
}

impl Default for FileType {
    /// Plaintext is the most conservative choice when nothing is known
    /// about the input.
    fn default() -> Self {
        Self::Plaintext
    }
}

impl<P: AsRef<Path>> From<P> for FileType {
    /// Detect if the given path points to a Markdown, HTML, or plaintext file.
    ///
    /// Assume HTML in case of no extension.
    /// Note: this is only reasonable for URLs; not paths on disk.
    /// For example, `README` without an extension is more likely to be a plaintext file.
    /// A better solution would be to also implement `From<Url> for FileType`.
    /// Unfortunately that's not possible without refactoring, as
    /// `AsRef<Path>` could be implemented for `Url` in the future, which is why
    /// `From<Url> for FileType` is not allowed.
    fn from(p: P) -> FileType {
        match p.as_ref().extension().and_then(std::ffi::OsStr::to_str) {
            None => FileType::Html,
            Some(extension) => match extension {
                "md" | "markdown" => FileType::Markdown,
                "htm" | "html" => FileType::Html,
                _ => FileType::Plaintext,
            },
        }
    }
}
impl Serialize for Input {
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        // Serialize as the human-readable `Display` form (URL, path, glob, …)
        serializer.collect_str(self)
    }
}

impl Display for Input {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            Input::RemoteUrl(url) => url.as_str(),
            Input::FsGlob { pattern, .. } => pattern,
            // NOTE(review): non-UTF-8 paths display as an empty string here —
            // presumably acceptable, but confirm against callers
            Input::FsPath(path) => path.to_str().unwrap_or_default(),
            Input::Stdin => "stdin",
            Input::String(_) => "raw input string",
        })
    }
}

#[derive(Debug)]
/// Encapsulates the content for a given input
pub struct InputContent {
    /// Input source
    pub input: Input,
    /// File type of given input
    pub file_type: FileType,
    /// Raw UTF-8 string content
    pub content: String,
}

impl InputContent {
    #[must_use]
    /// Create an instance of `InputContent` from an input string
    pub fn from_string(s: &str, file_type: FileType) -> Self {
        // TODO: consider using Cow (to avoid one .clone() for String types)
        Self {
            input: Input::String(s.to_owned()),
            file_type,
            content: s.to_owned(),
        }
    }
}

impl Input {
    #[must_use]
    /// Construct a new `Input` source. In case the input is a `glob` pattern,
    /// `glob_ignore_case` decides whether matching files against the `glob` is
    /// case-insensitive or not
    pub fn new(value: &str, glob_ignore_case: bool) -> Self {
        if value == STDIN {
            Self::Stdin
        } else if let Ok(url) = Url::parse(value) {
            Self::RemoteUrl(Box::new(url))
        } else {
            // this seems to be the only way to determine if this is a glob pattern:
            // escaping changes the string if and only if it contains glob metacharacters
            let is_glob = glob::Pattern::escape(value) != value;

            if is_glob {
                Self::FsGlob {
                    pattern: value.to_owned(),
                    ignore_case: glob_ignore_case,
                }
            } else {
                Self::FsPath(value.into())
            }
        }
    }

    #[allow(clippy::missing_panics_doc)]
    /// Retrieve the contents from the input
    ///
    /// # Errors
    ///
    /// Returns an error if the contents can not be retrieved
    /// because of an underlying I/O error (e.g. an error while making a
    /// network request or retrieving the contents from the file system)
    pub async fn get_contents(
        &self,
        file_type_hint: Option<FileType>,
        skip_missing: bool,
    ) -> Result<Vec<InputContent>> {
        match *self {
            // TODO: should skip_missing also affect URLs?
            Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
            Input::FsGlob {
                ref pattern,
                ignore_case,
            } => Ok(Self::glob_contents(pattern, ignore_case).await?),
            Input::FsPath(ref path) => {
                let content = Self::path_content(path);
                match content {
                    Ok(input_content) => Ok(vec![input_content]),
                    // Unreadable files are tolerated when `skip_missing` is set
                    Err(_) if skip_missing => Ok(vec![]),
                    Err(e) => Err(e),
                }
            }
            Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
            Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
        }
    }

    // Fetch a remote URL and wrap the response body as `InputContent`.
    async fn url_contents(url: &Url) -> Result<InputContent> {
        // Assume HTML for default paths
        let file_type = if url.path().is_empty() || url.path() == "/" {
            FileType::Html
        } else {
            FileType::from(url.as_str())
        };

        let res = reqwest::get(url.clone()).await?;
        let input_content = InputContent {
            input: Input::RemoteUrl(Box::new(url.clone())),
            file_type,
            content: res.text().await?,
        };

        Ok(input_content)
    }

    // Expand the glob (with `~` tilde expansion) and read every matched file.
    async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
        let mut contents = vec![];
        let glob_expanded = tilde(&path_glob);
        let mut match_opts = glob::MatchOptions::new();

        match_opts.case_sensitive = !ignore_case;

        for entry in glob_with(&glob_expanded, match_opts)? {
            match entry {
                Ok(path) => {
                    if path.is_dir() {
                        // Directories can still have a suffix which looks like
                        // a file extension like `foo.html`. This can lead to
                        // unexpected behavior with glob patterns like
                        // `**/*.html`. Therefore filter these out.
                        // https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819
                        continue;
                    }
                    let content = Self::path_content(&path)?;
                    contents.push(content);
                }
                // Per-entry errors (e.g. permission denied) are reported but
                // do not abort the remaining matches
                Err(e) => println!("{:?}", e),
            }
        }

        Ok(contents)
    }

    /// Get the input content of a given path
    /// # Errors
    ///
    /// Will return `Err` if file contents can't be read
    pub fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
        let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?;
        let input_content = InputContent {
            file_type: FileType::from(path.as_ref()),
            content,
            input: Input::FsPath(path.into()),
        };

        Ok(input_content)
    }

    // Read all of stdin; the file type falls back to the hint or the default.
    async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
        let mut content = String::new();
        let mut stdin = stdin();
        stdin.read_to_string(&mut content).await?;

        let input_content = InputContent {
            input: Input::Stdin,
            file_type: file_type_hint.unwrap_or_default(),
            content,
        };

        Ok(input_content)
    }

    // Wrap a raw string as `InputContent`.
    fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
        InputContent::from_string(s, file_type_hint.unwrap_or_default())
    }
}
#[inline] + /// Check if the URI is a valid mail address pub(crate) fn is_mail(&self) -> bool { self.scheme() == "mailto" } + + #[inline] + /// Check if the URI is a file + pub(crate) fn is_file(&self) -> bool { + self.scheme() == "file" + } } impl AsRef for Uri {