+
+
+
\ No newline at end of file
diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs
index b03fc35..68bf3ca 100644
--- a/lychee-bin/src/client.rs
+++ b/lychee-bin/src/client.rs
@@ -55,6 +55,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -> Result<Client> {
ClientBuilder::builder()
.remaps(remaps)
+ .base(cfg.base.clone())
.includes(includes)
.excludes(excludes)
.exclude_all_private(cfg.exclude_all_private)
diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs
index fd65499..2ffc0ed 100644
--- a/lychee-bin/src/commands/check.rs
+++ b/lychee-bin/src/commands/check.rs
@@ -429,14 +429,12 @@ mod tests {
#[tokio::test]
async fn test_invalid_url() {
- // Run a normal request with an invalid Url
let client = ClientBuilder::builder().build().client().unwrap();
- let request = Request::try_from("http://\"").unwrap();
- let response = check_url(&client, request).await;
- assert!(response.status().is_error());
+ let uri = Uri::try_from("http://\"").unwrap();
+ let response = client.check_website(&uri, None).await.unwrap();
assert!(matches!(
- response.status(),
- Status::Error(ErrorKind::InvalidURI(_))
+ response,
+ Status::Unsupported(ErrorKind::BuildRequestClient(_))
));
}
diff --git a/lychee-bin/src/formatters/response/color.rs b/lychee-bin/src/formatters/response/color.rs
index 137c70a..9aa12df 100644
--- a/lychee-bin/src/formatters/response/color.rs
+++ b/lychee-bin/src/formatters/response/color.rs
@@ -65,12 +65,18 @@ mod tests {
}
}
+ #[cfg(test)]
+ /// Helper function to strip ANSI color codes for tests
+ fn strip_ansi_codes(s: &str) -> String {
+ console::strip_ansi_codes(s).to_string()
+ }
+
#[test]
fn test_format_response_with_ok_status() {
let formatter = ColorFormatter;
let body = mock_response_body(Status::Ok(StatusCode::OK), "https://example.com");
assert_eq!(
- formatter.format_response(&body),
+ strip_ansi_codes(&formatter.format_response(&body)),
" [200] https://example.com/"
);
}
@@ -83,7 +89,7 @@ mod tests {
"https://example.com/404",
);
assert_eq!(
- formatter.format_response(&body),
+ strip_ansi_codes(&formatter.format_response(&body)),
" [ERROR] https://example.com/404"
);
}
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index 2d23b9d..fcb7530 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -267,17 +267,17 @@ mod cli {
#[test]
fn test_resolve_paths() {
let mut cmd = main_command();
- let offline_dir = fixtures_path().join("offline");
+ let dir = fixtures_path().join("resolve_paths");
cmd.arg("--offline")
.arg("--base")
- .arg(&offline_dir)
- .arg(offline_dir.join("index.html"))
+ .arg(&dir)
+ .arg(dir.join("index.html"))
.env_clear()
.assert()
.success()
- .stdout(contains("4 Total"))
- .stdout(contains("4 OK"));
+ .stdout(contains("3 Total"))
+ .stdout(contains("3 OK"));
}
#[test]
@@ -944,13 +944,17 @@ mod cli {
// check content of cache file
let data = fs::read_to_string(&cache_file)?;
+
+ if data.is_empty() {
+ println!("Cache file is empty!");
+ }
+
assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri())));
assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri())));
// clear the cache file
fs::remove_file(&cache_file)?;
-
Ok(())
}
@@ -1216,8 +1220,9 @@ mod cli {
Ok(())
}
- /// If base-dir is not set, don't throw an error in case we encounter
- /// an absolute local link within a file (e.g. `/about`).
+ /// If `base-dir` is not set, don't throw an error in case we encounter
+ /// an absolute local link (e.g. `/about`) within a file.
+ /// Instead, simply ignore the link.
#[test]
fn test_ignore_absolute_local_links_without_base() -> Result<()> {
let mut cmd = main_command();
@@ -1409,9 +1414,7 @@ mod cli {
.arg("./NOT-A-REAL-TEST-FIXTURE.md")
.assert()
.failure()
- .stderr(contains(
- "Cannot find local file ./NOT-A-REAL-TEST-FIXTURE.md",
- ));
+ .stderr(contains("Invalid file path: ./NOT-A-REAL-TEST-FIXTURE.md"));
Ok(())
}
@@ -1667,4 +1670,46 @@ mod cli {
.success()
.stdout(contains("0 Errors"));
}
+
+ /// Test relative paths
+ ///
+ /// Imagine a web server hosting a site with the following structure:
+ /// root
+ /// └── test
+ /// ├── index.html
+ /// └── next.html
+ ///
+ /// where `root/test/index.html` contains `next`
+ /// When checking the link in `root/test/index.html` we should be able to
+ /// resolve the relative path to `root/test/next.html`
+ ///
+ /// Note that the relative path is not resolved to the root of the server
+ /// but relative to the file that contains the link.
+ #[tokio::test]
+ async fn test_resolve_relative_paths_in_subfolder() -> Result<()> {
+ let mock_server = wiremock::MockServer::start().await;
+
+ let body = r#"next"#;
+ wiremock::Mock::given(wiremock::matchers::method("GET"))
+ .and(wiremock::matchers::path("/test/index.html"))
+ .respond_with(wiremock::ResponseTemplate::new(200).set_body_string(body))
+ .mount(&mock_server)
+ .await;
+
+ wiremock::Mock::given(wiremock::matchers::method("GET"))
+ .and(wiremock::matchers::path("/test/next.html"))
+ .respond_with(wiremock::ResponseTemplate::new(200))
+ .mount(&mock_server)
+ .await;
+
+ let mut cmd = main_command();
+ cmd.arg("--verbose")
+ .arg(format!("{}/test/index.html", mock_server.uri()))
+ .assert()
+ .success()
+ .stdout(contains("1 Total"))
+ .stdout(contains("0 Errors"));
+
+ Ok(())
+ }
}
diff --git a/lychee-lib/src/checker.rs b/lychee-lib/src/checker.rs
deleted file mode 100644
index 428551a..0000000
--- a/lychee-lib/src/checker.rs
+++ /dev/null
@@ -1,80 +0,0 @@
-use crate::{
- chain::{ChainResult, Handler},
- retry::RetryExt,
- Status,
-};
-use async_trait::async_trait;
-use http::StatusCode;
-use reqwest::Request;
-use std::{collections::HashSet, time::Duration};
-
-#[derive(Debug, Clone)]
-pub(crate) struct Checker {
- retry_wait_time: Duration,
- max_retries: u64,
- reqwest_client: reqwest::Client,
- accepted: Option<HashSet<StatusCode>>,
-}
-
-impl Checker {
- pub(crate) const fn new(
- retry_wait_time: Duration,
- max_retries: u64,
- reqwest_client: reqwest::Client,
- accepted: Option<HashSet<StatusCode>>,
- ) -> Self {
- Self {
- retry_wait_time,
- max_retries,
- reqwest_client,
- accepted,
- }
- }
-
- /// Retry requests up to `max_retries` times
- /// with an exponential backoff.
- pub(crate) async fn retry_request(&self, request: Request) -> Status {
- let mut retries: u64 = 0;
- let mut wait_time = self.retry_wait_time;
-
- let mut status = self.check_default(clone_unwrap(&request)).await;
- while retries < self.max_retries {
- if status.is_success() || !status.should_retry() {
- return status;
- }
- retries += 1;
- tokio::time::sleep(wait_time).await;
- wait_time = wait_time.saturating_mul(2);
- status = self.check_default(clone_unwrap(&request)).await;
- }
- status
- }
-
- /// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
- async fn check_default(&self, request: Request) -> Status {
- match self.reqwest_client.execute(request).await {
- Ok(ref response) => Status::new(response, self.accepted.clone()),
- Err(e) => e.into(),
- }
- }
-}
-
-/// Clones a `reqwest::Request`.
-///
-/// # Safety
-///
-/// This panics if the request cannot be cloned. This should only happen if the
-/// request body is a `reqwest` stream. We disable the `stream` feature, so the
-/// body should never be a stream.
-///
-/// See <https://github.com/seanmonstar/reqwest/issues/1148>
-fn clone_unwrap(request: &Request) -> Request {
- request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
-}
-
-#[async_trait]
-impl Handler for Checker {
- async fn handle(&mut self, input: Request) -> ChainResult {
- ChainResult::Done(self.retry_request(input).await)
- }
-}
diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs
new file mode 100644
index 0000000..0060bfb
--- /dev/null
+++ b/lychee-lib/src/checker/file.rs
@@ -0,0 +1,179 @@
+use http::StatusCode;
+use log::warn;
+use std::path::{Path, PathBuf};
+
+use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri};
+
+/// A utility for checking the existence and validity of file-based URIs.
+///
+/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths.
+/// It supports base path resolution, fallback extensions for files without extensions,
+/// and optional fragment checking for HTML files.
+#[derive(Debug, Clone)]
+pub(crate) struct FileChecker {
+ /// Base path or URL used for resolving relative paths.
+ base: Option<Base>,
+ /// List of file extensions to try if the original path doesn't exist.
+ fallback_extensions: Vec<String>,
+ /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files.
+ include_fragments: bool,
+ /// Utility for performing fragment checks in HTML files.
+ fragment_checker: FragmentChecker,
+}
+
+impl FileChecker {
+ /// Creates a new `FileChecker` with the given configuration.
+ ///
+ /// # Arguments
+ ///
+ /// * `base` - Optional base path or URL for resolving relative paths.
+ /// * `fallback_extensions` - List of extensions to try if the original file is not found.
+ /// * `include_fragments` - Whether to check for fragment existence in HTML files.
+ pub(crate) fn new(
+ base: Option,
+ fallback_extensions: Vec,
+ include_fragments: bool,
+ ) -> Self {
+ Self {
+ base,
+ fallback_extensions,
+ include_fragments,
+ fragment_checker: FragmentChecker::new(),
+ }
+ }
+
+ /// Checks the given file URI for existence and validity.
+ ///
+ /// This method resolves the URI to a file path, checks if the file exists,
+ /// and optionally checks for the existence of fragments in HTML files.
+ ///
+ /// # Arguments
+ ///
+ /// * `uri` - The URI to check.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the check.
+ pub(crate) async fn check(&self, uri: &Uri) -> Status {
+ let Ok(path) = uri.url.to_file_path() else {
+ return ErrorKind::InvalidFilePath(uri.clone()).into();
+ };
+
+ let resolved_path = self.resolve_path(&path);
+ self.check_path(&resolved_path, uri).await
+ }
+
+ /// Resolves the given path using the base path, if one is set.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The path to resolve.
+ ///
+ /// # Returns
+ ///
+ /// Returns the resolved path as a `PathBuf`.
+ fn resolve_path(&self, path: &Path) -> PathBuf {
+ if let Some(Base::Local(base_path)) = &self.base {
+ if path.is_absolute() {
+ let absolute_base_path = if base_path.is_relative() {
+ std::env::current_dir().unwrap_or_default().join(base_path)
+ } else {
+ base_path.clone()
+ };
+
+ let stripped = path.strip_prefix("/").unwrap_or(path);
+ absolute_base_path.join(stripped)
+ } else {
+ base_path.join(path)
+ }
+ } else {
+ path.to_path_buf()
+ }
+ }
+
+ /// Checks if the given path exists and performs additional checks if necessary.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The path to check.
+ /// * `uri` - The original URI, used for error reporting.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the check.
+ async fn check_path(&self, path: &Path, uri: &Uri) -> Status {
+ if path.exists() {
+ return self.check_existing_path(path, uri).await;
+ }
+
+ self.check_with_fallback_extensions(path, uri).await
+ }
+
+ /// Checks an existing path, optionally verifying fragments for HTML files.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The path to check.
+ /// * `uri` - The original URI, used for error reporting.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the check.
+ async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status {
+ if self.include_fragments {
+ self.check_fragment(path, uri).await
+ } else {
+ Status::Ok(StatusCode::OK)
+ }
+ }
+
+ /// Attempts to find a file by trying different extensions specified in `fallback_extensions`.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The original path to check.
+ /// * `uri` - The original URI, used for error reporting.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the check.
+ async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status {
+ let mut path_buf = path.to_path_buf();
+
+ // If the path already has an extension, try it first
+ if path_buf.extension().is_some() && path_buf.exists() {
+ return self.check_existing_path(&path_buf, uri).await;
+ }
+
+ // Try fallback extensions
+ for ext in &self.fallback_extensions {
+ path_buf.set_extension(ext);
+ if path_buf.exists() {
+ return self.check_existing_path(&path_buf, uri).await;
+ }
+ }
+
+ ErrorKind::InvalidFilePath(uri.clone()).into()
+ }
+
+ /// Checks for the existence of a fragment in an HTML file.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The path to the HTML file.
+ /// * `uri` - The original URI, containing the fragment to check.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the fragment check.
+ async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
+ match self.fragment_checker.check(path, &uri.url).await {
+ Ok(true) => Status::Ok(StatusCode::OK),
+ Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
+ Err(err) => {
+ warn!("Skipping fragment check due to the following error: {err}");
+ Status::Ok(StatusCode::OK)
+ }
+ }
+ }
+}
diff --git a/lychee-lib/src/checker/mail.rs b/lychee-lib/src/checker/mail.rs
new file mode 100644
index 0000000..e7dcef7
--- /dev/null
+++ b/lychee-lib/src/checker/mail.rs
@@ -0,0 +1,57 @@
+#[cfg(all(feature = "email-check", feature = "native-tls"))]
+use http::StatusCode;
+
+#[cfg(all(feature = "email-check", feature = "native-tls"))]
+use crate::ErrorKind;
+
+use crate::{Status, Uri};
+
+#[cfg(all(feature = "email-check", feature = "native-tls"))]
+use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
+
+#[cfg(all(feature = "email-check", feature = "native-tls"))]
+use crate::types::mail;
+
+/// A utility for checking the validity of email addresses.
+///
+/// `EmailChecker` is responsible for validating email addresses,
+/// optionally performing reachability checks when the appropriate
+/// features are enabled.
+#[derive(Debug, Clone)]
+pub(crate) struct MailChecker {}
+
+impl MailChecker {
+ /// Creates a new `EmailChecker`.
+ pub(crate) const fn new() -> Self {
+ Self {}
+ }
+
+ /// Check a mail address, or equivalently a `mailto` URI.
+ ///
+ /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
+ /// which are ignored by this check. They are not part of the mail address
+ /// and instead passed to a mail client.
+ #[cfg(all(feature = "email-check", feature = "native-tls"))]
+ pub(crate) async fn check_mail(&self, uri: &Uri) -> Status {
+ self.perform_email_check(uri).await
+ }
+
+ /// Ignore the mail check if the `email-check` and `native-tls` features are not enabled.
+ #[cfg(not(all(feature = "email-check", feature = "native-tls")))]
+ pub(crate) async fn check_mail(&self, _uri: &Uri) -> Status {
+ Status::Excluded
+ }
+
+ #[cfg(all(feature = "email-check", feature = "native-tls"))]
+ async fn perform_email_check(&self, uri: &Uri) -> Status {
+ let address = uri.url.path().to_string();
+ let input = CheckEmailInput::new(address);
+ let result = &(check_email(&input).await);
+
+ if let Reachable::Invalid = result.is_reachable {
+ ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into()
+ } else {
+ Status::Ok(StatusCode::OK)
+ }
+ }
+}
diff --git a/lychee-lib/src/checker/mod.rs b/lychee-lib/src/checker/mod.rs
new file mode 100644
index 0000000..bfbef9d
--- /dev/null
+++ b/lychee-lib/src/checker/mod.rs
@@ -0,0 +1,7 @@
+//! Checker Module
+//!
+//! This module contains all checkers, which are responsible for checking the status of a URL.
+
+pub(crate) mod file;
+pub(crate) mod mail;
+pub(crate) mod website;
diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs
new file mode 100644
index 0000000..62cadaf
--- /dev/null
+++ b/lychee-lib/src/checker/website.rs
@@ -0,0 +1,226 @@
+use crate::{
+ chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
+ quirks::Quirks,
+ retry::RetryExt,
+ types::uri::github::GithubUri,
+ BasicAuthCredentials, ErrorKind, Status, Uri,
+};
+use async_trait::async_trait;
+use http::StatusCode;
+use octocrab::Octocrab;
+use reqwest::Request;
+use std::{collections::HashSet, time::Duration};
+
+#[derive(Debug, Clone)]
+pub(crate) struct WebsiteChecker {
+ /// Request method used for making requests.
+ method: reqwest::Method,
+
+ /// The HTTP client used for requests.
+ reqwest_client: reqwest::Client,
+
+ /// GitHub client used for requests.
+ github_client: Option<Octocrab>,
+
+ /// The chain of plugins to be executed on each request.
+ plugin_request_chain: RequestChain,
+
+ /// Maximum number of retries per request before returning an error.
+ max_retries: u64,
+
+ /// Initial wait time between retries of failed requests. This doubles after
+ /// each failure.
+ retry_wait_time: Duration,
+
+ /// Set of accepted return codes / status codes.
+ ///
+ /// Unmatched return codes/ status codes are deemed as errors.
+ accepted: Option<HashSet<StatusCode>>,
+
+ /// Requires using HTTPS when it's available.
+ ///
+ /// This would treat unencrypted links as errors when HTTPS is available.
+ require_https: bool,
+}
+
+impl WebsiteChecker {
+ #[allow(clippy::too_many_arguments)]
+ pub(crate) const fn new(
+ method: reqwest::Method,
+ retry_wait_time: Duration,
+ max_retries: u64,
+ reqwest_client: reqwest::Client,
+ accepted: Option<HashSet<StatusCode>>,
+ github_client: Option,
+ require_https: bool,
+ plugin_request_chain: RequestChain,
+ ) -> Self {
+ Self {
+ method,
+ reqwest_client,
+ github_client,
+ plugin_request_chain,
+ max_retries,
+ retry_wait_time,
+ accepted,
+ require_https,
+ }
+ }
+
+ /// Retry requests up to `max_retries` times
+ /// with an exponential backoff.
+ pub(crate) async fn retry_request(&self, request: Request) -> Status {
+ let mut retries: u64 = 0;
+ let mut wait_time = self.retry_wait_time;
+ let mut status = self.check_default(clone_unwrap(&request)).await;
+ while retries < self.max_retries {
+ if status.is_success() || !status.should_retry() {
+ return status;
+ }
+ retries += 1;
+ tokio::time::sleep(wait_time).await;
+ wait_time = wait_time.saturating_mul(2);
+ status = self.check_default(clone_unwrap(&request)).await;
+ }
+ status
+ }
+
+ /// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
+ async fn check_default(&self, request: Request) -> Status {
+ match self.reqwest_client.execute(request).await {
+ Ok(ref response) => Status::new(response, self.accepted.clone()),
+ Err(e) => e.into(),
+ }
+ }
+
+ /// Checks the given URI of a website.
+ ///
+ /// # Errors
+ ///
+ /// This returns an `Err` if
+ /// - The URI is invalid.
+ /// - The request failed.
+ /// - The response status code is not accepted.
+ /// - The URI cannot be converted to HTTPS.
+ pub(crate) async fn check_website(
+ &self,
+ uri: &Uri,
+ credentials: Option<BasicAuthCredentials>,
+ ) -> Result<Status, ErrorKind> {
+ let default_chain: RequestChain = Chain::new(vec![
+ Box::<Quirks>::default(),
+ Box::new(credentials),
+ Box::new(self.clone()),
+ ]);
+
+ match self.check_website_inner(uri, &default_chain).await {
+ Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
+ if self
+ .check_website_inner(&uri.to_https()?, &default_chain)
+ .await
+ .is_success()
+ {
+ Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
+ } else {
+ Ok(Status::Ok(code))
+ }
+ }
+ s => Ok(s),
+ }
+ }
+
+ /// Checks the given URI of a website.
+ ///
+ /// Unsupported schemes will be ignored
+ ///
+ /// Note: we use `inner` to improve compile times by avoiding monomorphization
+ ///
+ /// # Errors
+ ///
+ /// This returns an `Err` if
+ /// - The URI is invalid.
+ /// - The request failed.
+ /// - The response status code is not accepted.
+ async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
+ let request = self
+ .reqwest_client
+ .request(self.method.clone(), uri.as_str())
+ .build();
+
+ let request = match request {
+ Ok(r) => r,
+ Err(e) => return e.into(),
+ };
+
+ let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
+ .traverse(request)
+ .await;
+
+ self.handle_github(status, uri).await
+ }
+
+ // Pull out the heavy machinery in case of a failed normal request.
+ // This could be a GitHub URL and we ran into the rate limiter.
+ // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
+ async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
+ if status.is_success() {
+ return status;
+ }
+
+ if let Ok(github_uri) = GithubUri::try_from(uri) {
+ let status = self.check_github(github_uri).await;
+ if status.is_success() {
+ return status;
+ }
+ }
+
+ status
+ }
+
+ /// Check a `uri` hosted on `GitHub` via the GitHub API.
+ ///
+ /// # Caveats
+ ///
+ /// Files inside private repositories won't get checked and instead would
+ /// be reported as valid if the repository itself is reachable through the
+ /// API.
+ ///
+ /// A better approach would be to download the file through the API or
+ /// clone the repo, but we chose the pragmatic approach.
+ async fn check_github(&self, uri: GithubUri) -> Status {
+ let Some(client) = &self.github_client else {
+ return ErrorKind::MissingGitHubToken.into();
+ };
+ let repo = match client.repos(&uri.owner, &uri.repo).get().await {
+ Ok(repo) => repo,
+ Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
+ };
+ if let Some(true) = repo.private {
+ return Status::Ok(StatusCode::OK);
+ } else if let Some(endpoint) = uri.endpoint {
+ return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
+ .into();
+ }
+ Status::Ok(StatusCode::OK)
+ }
+}
+
+/// Clones a `reqwest::Request`.
+///
+/// # Safety
+///
+/// This panics if the request cannot be cloned. This should only happen if the
+/// request body is a `reqwest` stream. We disable the `stream` feature, so the
+/// body should never be a stream.
+///
+/// See <https://github.com/seanmonstar/reqwest/issues/1148>
+fn clone_unwrap(request: &Request) -> Request {
+ request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
+}
+
+#[async_trait]
+impl Handler for WebsiteChecker {
+ async fn handle(&mut self, input: Request) -> ChainResult {
+ ChainResult::Done(self.retry_request(input).await)
+ }
+}
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index 4ba97b2..f9566f0 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -15,8 +15,6 @@
)]
use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
-#[cfg(all(feature = "email-check", feature = "native-tls"))]
-use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
use http::{
header::{HeaderMap, HeaderValue},
StatusCode,
@@ -24,25 +22,21 @@ use http::{
use log::{debug, warn};
use octocrab::Octocrab;
use regex::RegexSet;
-use reqwest::{header, redirect, Url};
+use reqwest::{header, redirect};
use reqwest_cookie_store::CookieStoreMutex;
use secrecy::{ExposeSecret, SecretString};
use typed_builder::TypedBuilder;
use crate::{
- chain::{Chain, ClientRequestChains, RequestChain},
- checker::Checker,
+ chain::RequestChain,
+ checker::file::FileChecker,
+ checker::{mail::MailChecker, website::WebsiteChecker},
filter::{Excludes, Filter, Includes},
- quirks::Quirks,
remap::Remaps,
- types::uri::github::GithubUri,
utils::fragment_checker::FragmentChecker,
- ErrorKind, Request, Response, Result, Status, Uri,
+ Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
};
-#[cfg(all(feature = "email-check", feature = "native-tls"))]
-use crate::types::mail;
-
/// Default number of redirects before a request is deemed as failed, 5.
pub const DEFAULT_MAX_REDIRECTS: usize = 5;
/// Default number of retries before a request is deemed as failed, 3.
@@ -248,6 +242,12 @@ pub struct ClientBuilder {
/// Response timeout per request in seconds.
timeout: Option<Duration>,
+ /// Base for resolving paths.
+ ///
+ /// E.g. if the base is `/home/user/` and the path is `file.txt`, the
+ /// resolved path would be `/home/user/file.txt`.
+ base: Option<Base>,
+
/// Initial time between retries of failed requests.
///
/// Defaults to [`DEFAULT_RETRY_WAIT_TIME_SECS`].
@@ -383,20 +383,28 @@ impl ClientBuilder {
include_mail: self.include_mail,
};
- Ok(Client {
+ let website_checker = WebsiteChecker::new(
+ self.method,
+ self.retry_wait_time,
+ self.max_retries,
reqwest_client,
+ self.accepted,
github_client,
+ self.require_https,
+ self.plugin_request_chain,
+ );
+
+ Ok(Client {
remaps: self.remaps,
- fallback_extensions: self.fallback_extensions,
filter,
- max_retries: self.max_retries,
- retry_wait_time: self.retry_wait_time,
- method: self.method,
- accepted: self.accepted,
- require_https: self.require_https,
- include_fragments: self.include_fragments,
+ email_checker: MailChecker::new(),
+ website_checker,
+ file_checker: FileChecker::new(
+ self.base,
+ self.fallback_extensions,
+ self.include_fragments,
+ ),
fragment_checker: FragmentChecker::new(),
- plugin_request_chain: self.plugin_request_chain,
})
}
}
@@ -407,50 +415,23 @@ impl ClientBuilder {
/// options.
#[derive(Debug, Clone)]
pub struct Client {
- /// Underlying `reqwest` client instance that handles the HTTP requests.
- reqwest_client: reqwest::Client,
-
- /// Optional GitHub client that handles communications with GitHub.
- github_client: Option,
-
/// Optional remapping rules for URIs matching pattern.
remaps: Option,
- /// Automatically append file extensions to `file://` URIs as needed
- fallback_extensions: Vec,
-
/// Rules to decided whether each link should be checked or ignored.
filter: Filter,
- /// Maximum number of retries per request before returning an error.
- max_retries: u64,
+ /// A checker for website URLs.
+ website_checker: WebsiteChecker,
- /// Initial wait time between retries of failed requests. This doubles after
- /// each failure.
- retry_wait_time: Duration,
+ /// A checker for file URLs.
+ file_checker: FileChecker,
- /// HTTP method used for requests, e.g. `GET` or `HEAD`.
- ///
- /// The same method will be used for all links.
- method: reqwest::Method,
-
- /// Set of accepted return codes / status codes.
- ///
- /// Unmatched return codes/ status codes are deemed as errors.
- accepted: Option<HashSet<StatusCode>>,
-
- /// Requires using HTTPS when it's available.
- ///
- /// This would treat unencrypted links as errors when HTTPS is available.
- require_https: bool,
-
- /// Enable the checking of fragments in links.
- include_fragments: bool,
+ /// A checker for email URLs.
+ email_checker: MailChecker,
/// Caches Fragments
fragment_checker: FragmentChecker,
-
- plugin_request_chain: RequestChain,
}
impl Client {
@@ -463,7 +444,7 @@ impl Client {
///
/// Returns an `Err` if:
/// - `request` does not represent a valid URI.
- /// - Encrypted connection for a HTTP URL is available but unused. (Only
+ /// - Encrypted connection for a HTTP URL is available but unused. (Only
/// checked when `Client::require_https` is `true`.)
#[allow(clippy::missing_panics_doc)]
pub async fn check<T, E>(&self, request: T) -> Result<Response>
@@ -493,27 +474,22 @@ impl Client {
return Ok(Response::new(uri.clone(), Status::Excluded, source));
}
- let default_chain: RequestChain = Chain::new(vec![
- Box::::default(),
- Box::new(credentials),
- Box::new(Checker::new(
- self.retry_wait_time,
- self.max_retries,
- self.reqwest_client.clone(),
- self.accepted.clone(),
- )),
- ]);
-
let status = match uri.scheme() {
+ // We don't check tel: URIs
+ _ if uri.is_tel() => Status::Excluded,
_ if uri.is_file() => self.check_file(uri).await,
_ if uri.is_mail() => self.check_mail(uri).await,
- _ if uri.is_tel() => Status::Excluded,
- _ => self.check_website(uri, default_chain).await?,
+ _ => self.check_website(uri, credentials).await?,
};
Ok(Response::new(uri.clone(), status, source))
}
+ /// Check a single file using the file checker.
+ pub async fn check_file(&self, uri: &Uri) -> Status {
+ self.file_checker.check(uri).await
+ }
+
/// Remap `uri` using the client-defined remapping rules.
///
/// # Errors
@@ -541,151 +517,17 @@ impl Client {
/// - The request failed.
/// - The response status code is not accepted.
/// - The URI cannot be converted to HTTPS.
- pub async fn check_website(&self, uri: &Uri, default_chain: RequestChain) -> Result<Status> {
- match self.check_website_inner(uri, &default_chain).await {
- Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
- if self
- .check_website_inner(&uri.to_https()?, &default_chain)
- .await
- .is_success()
- {
- Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
- } else {
- // HTTPS is not available for this URI,
- // so the original HTTP URL is fine.
- Ok(Status::Ok(code))
- }
- }
- s => Ok(s),
- }
+ pub async fn check_website(
+ &self,
+ uri: &Uri,
+ credentials: Option<BasicAuthCredentials>,
+ ) -> Result<Status> {
+ self.website_checker.check_website(uri, credentials).await
}
- /// Checks the given URI of a website.
- ///
- /// Unsupported schemes will be ignored
- ///
- /// # Errors
- ///
- /// This returns an `Err` if
- /// - The URI is invalid.
- /// - The request failed.
- /// - The response status code is not accepted.
- pub async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
- // Workaround for upstream reqwest panic
- if validate_url(&uri.url) {
- if matches!(uri.scheme(), "http" | "https") {
- // This is a truly invalid URI with a known scheme.
- // If we pass that to reqwest it would panic.
- return Status::Error(ErrorKind::InvalidURI(uri.clone()));
- }
- // This is merely a URI with a scheme that is not supported by
- // reqwest yet. It would be safe to pass that to reqwest and it
- // wouldn't panic, but it's also unnecessary, because it would
- // simply return an error.
- return Status::Unsupported(ErrorKind::InvalidURI(uri.clone()));
- }
-
- let request = self
- .reqwest_client
- .request(self.method.clone(), uri.as_str())
- .build();
-
- let request = match request {
- Ok(r) => r,
- Err(e) => return e.into(),
- };
-
- let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
- .traverse(request)
- .await;
-
- self.handle_github(status, uri).await
- }
-
- // Pull out the heavy machinery in case of a failed normal request.
- // This could be a GitHub URL and we ran into the rate limiter.
- // TODO: We should first try to parse the URI as GitHub URI first (Lucius, Jan 2023)
- async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
- if status.is_success() {
- return status;
- }
-
- if let Ok(github_uri) = GithubUri::try_from(uri) {
- let status = self.check_github(github_uri).await;
- // Only return GitHub status in case of success
- // Otherwise return the original error, which has more information
- if status.is_success() {
- return status;
- }
- }
-
- status
- }
-
- /// Check a `uri` hosted on `GitHub` via the GitHub API.
- ///
- /// # Caveats
- ///
- /// Files inside private repositories won't get checked and instead would
- /// be reported as valid if the repository itself is reachable through the
- /// API.
- ///
- /// A better approach would be to download the file through the API or
- /// clone the repo, but we chose the pragmatic approach.
- async fn check_github(&self, uri: GithubUri) -> Status {
- let Some(client) = &self.github_client else {
- return ErrorKind::MissingGitHubToken.into();
- };
- let repo = match client.repos(&uri.owner, &uri.repo).get().await {
- Ok(repo) => repo,
- Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
- };
- if let Some(true) = repo.private {
- // The private repo exists. Assume a given endpoint exists as well
- // (e.g. `issues` in `github.com/org/private/issues`). This is not
- // always the case but simplifies the check.
- return Status::Ok(StatusCode::OK);
- } else if let Some(endpoint) = uri.endpoint {
- // The URI returned a non-200 status code from a normal request and
- // now we find that this public repo is reachable through the API,
- // so that must mean the full URI (which includes the additional
- // endpoint) must be invalid.
- return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
- .into();
- }
- // Found public repo without endpoint
- Status::Ok(StatusCode::OK)
- }
-
- /// Check a `file` URI.
- pub async fn check_file(&self, uri: &Uri) -> Status {
- let Ok(path) = uri.url.to_file_path() else {
- return ErrorKind::InvalidFilePath(uri.clone()).into();
- };
-
- if path.exists() {
- if self.include_fragments {
- return self.check_fragment(&path, uri).await;
- }
- return Status::Ok(StatusCode::OK);
- }
-
- if path.extension().is_some() {
- return ErrorKind::InvalidFilePath(uri.clone()).into();
- }
-
- // if the path has no file extension, try to append some
- let mut path_buf = path.clone();
- for ext in &self.fallback_extensions {
- path_buf.set_extension(ext);
- if path_buf.exists() {
- if self.include_fragments {
- return self.check_fragment(&path_buf, uri).await;
- }
- return Status::Ok(StatusCode::OK);
- }
- }
- ErrorKind::InvalidFilePath(uri.clone()).into()
+ /// Checks a `mailto` URI.
+ pub async fn check_mail(&self, uri: &Uri) -> Status {
+ self.email_checker.check_mail(uri).await
}
/// Checks a `file` URI's fragment.
@@ -699,43 +541,6 @@ impl Client {
}
}
}
-
- /// Check a mail address, or equivalently a `mailto` URI.
- ///
- /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
- /// which are ignored by this check. The are not part of the mail address
- /// and instead passed to a mail client.
- #[cfg(all(feature = "email-check", feature = "native-tls"))]
- pub async fn check_mail(&self, uri: &Uri) -> Status {
- let address = uri.url.path().to_string();
- let input = CheckEmailInput::new(address);
- let result = &(check_email(&input).await);
-
- if let Reachable::Invalid = result.is_reachable {
- ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into()
- } else {
- Status::Ok(StatusCode::OK)
- }
- }
-
- /// Check a mail address, or equivalently a `mailto` URI.
- ///
- /// This implementation simply excludes all email addresses.
- #[cfg(not(all(feature = "email-check", feature = "native-tls")))]
- #[allow(clippy::unused_async)]
- pub async fn check_mail(&self, _uri: &Uri) -> Status {
- Status::Excluded
- }
-}
-
-// Check if the given `Url` would cause `reqwest` to panic.
-// This is a workaround for https://github.com/lycheeverse/lychee/issues/539
-// and can be removed once https://github.com/seanmonstar/reqwest/pull/1399
-// got merged.
-// It is exactly the same check that reqwest runs internally, but unfortunately
-// it `unwrap`s (and panics!) instead of returning an error, which we could handle.
-fn validate_url(url: &Url) -> bool {
- http::Uri::try_from(url.as_str()).is_err()
}
/// A shorthand function to check a single URI.
@@ -777,7 +582,7 @@ mod tests {
chain::{ChainResult, Handler, RequestChain},
mock_server,
test_utils::get_mock_client_response,
- Request, Status, Uri,
+ ErrorKind, Request, Status, Uri,
};
#[tokio::test]
@@ -1026,9 +831,14 @@ mod tests {
#[tokio::test]
async fn test_avoid_reqwest_panic() {
let client = ClientBuilder::builder().build().client().unwrap();
- // This request will fail, but it won't panic
+ // This request will result in an Unsupported status, but it won't panic
let res = client.check("http://\"").await.unwrap();
- assert!(res.status().is_error());
+
+ assert!(matches!(
+ res.status(),
+ Status::Unsupported(ErrorKind::BuildRequestClient(_))
+ ));
+ assert!(res.status().is_unsupported());
}
#[tokio::test]
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index 341bb21..955bdd2 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -1,3 +1,4 @@
+use crate::InputSource;
use crate::{
basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request,
Base, Input, Request, Result,
@@ -99,27 +100,31 @@ impl Collector {
///
/// Will return `Err` if links cannot be extracted from an input
pub fn collect_links(self, inputs: Vec) -> impl Stream> {
- let base = self.base;
+ let skip_missing_inputs = self.skip_missing_inputs;
+ let skip_hidden = self.skip_hidden;
+ let skip_ignored = self.skip_ignored;
+ let global_base = self.base;
stream::iter(inputs)
- .par_then_unordered(None, move |input| async move {
- input.get_contents(
- self.skip_missing_inputs,
- self.skip_hidden,
- self.skip_ignored,
- )
+ .par_then_unordered(None, move |input| {
+ let default_base = global_base.clone();
+ async move {
+ let base = match &input.source {
+ InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(),
+ _ => default_base,
+ };
+ input
+ .get_contents(skip_missing_inputs, skip_hidden, skip_ignored)
+ .map(move |content| (content, base.clone()))
+ }
})
.flatten()
- .par_then_unordered(None, move |content| {
- // send to parallel worker
- let base = base.clone();
+ .par_then_unordered(None, move |(content, base)| {
let basic_auth_extractor = self.basic_auth_extractor.clone();
async move {
let content = content?;
-
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec = extractor.extract(&content);
-
- let requests = request::create(uris, &content, &base, &basic_auth_extractor)?;
+ let requests = request::create(uris, &content, &base, &basic_auth_extractor);
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
})
@@ -137,7 +142,7 @@ mod tests {
use super::*;
use crate::{
mock_server,
- test_utils::{load_fixture, mail, website},
+ test_utils::{load_fixture, mail, path, website},
types::{FileType, Input, InputSource},
Result, Uri,
};
@@ -426,4 +431,85 @@ mod tests {
assert_eq!(links, expected_links);
}
+
+ #[tokio::test]
+ async fn test_multiple_remote_urls() {
+ let mock_server_1 = mock_server!(
+ StatusCode::OK,
+ set_body_string(r#"Link"#)
+ );
+ let mock_server_2 = mock_server!(
+ StatusCode::OK,
+ set_body_string(r#"Link"#)
+ );
+
+ let inputs = vec![
+ Input {
+ source: InputSource::RemoteUrl(Box::new(
+ Url::parse(&format!(
+ "{}/foo/index.html",
+ mock_server_1.uri().trim_end_matches('/')
+ ))
+ .unwrap(),
+ )),
+ file_type_hint: Some(FileType::Html),
+ excluded_paths: None,
+ },
+ Input {
+ source: InputSource::RemoteUrl(Box::new(
+ Url::parse(&format!(
+ "{}/bar/index.html",
+ mock_server_2.uri().trim_end_matches('/')
+ ))
+ .unwrap(),
+ )),
+ file_type_hint: Some(FileType::Html),
+ excluded_paths: None,
+ },
+ ];
+
+ let links = collect(inputs, None).await;
+
+ let expected_links = HashSet::from_iter([
+ website(&format!(
+ "{}/foo/relative.html",
+ mock_server_1.uri().trim_end_matches('/')
+ )),
+ website(&format!(
+ "{}/bar/relative.html",
+ mock_server_2.uri().trim_end_matches('/')
+ )),
+ ]);
+
+ assert_eq!(links, expected_links);
+ }
+
+ #[tokio::test]
+ async fn test_file_path_with_base() {
+ let base = Base::try_from("/path/to/root").unwrap();
+ assert_eq!(base, Base::Local("/path/to/root".into()));
+
+ let input = Input {
+ source: InputSource::String(
+ r#"
+ Index
+ About
+ Another
+ "#
+ .into(),
+ ),
+ file_type_hint: Some(FileType::Html),
+ excluded_paths: None,
+ };
+
+ let links = collect(vec![input], Some(base)).await;
+
+ let expected_links = HashSet::from_iter([
+ path("/path/to/root/index.html"),
+ path("/path/to/root/about.html"),
+ path("/another.html"),
+ ]);
+
+ assert_eq!(links, expected_links);
+ }
}
diff --git a/lychee-lib/src/test_utils.rs b/lychee-lib/src/test_utils.rs
index 99fc6aa..ff7be8e 100644
--- a/lychee-lib/src/test_utils.rs
+++ b/lychee-lib/src/test_utils.rs
@@ -39,6 +39,16 @@ pub(crate) fn website(url: &str) -> Uri {
Uri::from(Url::parse(url).expect("Expected valid Website URI"))
}
+/// Helper method to convert a `std::path::Path `into a URI with the `file` scheme
+///
+/// # Panic
+///
+/// This panics if the given path is not absolute, so it should only be used for
+/// testing
+pub(crate) fn path>(path: P) -> Uri {
+ Uri::from(Url::from_file_path(path.as_ref()).expect("Expected valid File URI"))
+}
+
/// Creates a mail URI from a string
pub(crate) fn mail(address: &str) -> Uri {
if address.starts_with("mailto:") {
diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs
index 73a4097..b7b76c7 100644
--- a/lychee-lib/src/types/base.rs
+++ b/lychee-lib/src/types/base.rs
@@ -23,7 +23,10 @@ impl Base {
pub(crate) fn join(&self, link: &str) -> Option {
match self {
Self::Remote(url) => url.join(link).ok(),
- Self::Local(_) => None,
+ Self::Local(path) => {
+ let full_path = path.join(link);
+ Url::from_file_path(full_path).ok()
+ }
}
}
@@ -36,18 +39,17 @@ impl Base {
}
}
- pub(crate) fn from_source(source: &InputSource) -> Option {
+ pub(crate) fn from_source(source: &InputSource) -> Option {
match &source {
InputSource::RemoteUrl(url) => {
- // TODO: This should be refactored.
- // Cases like https://user:pass@example.com are not handled
- // We can probably use the original URL and just replace the
- // path component in the caller of this function
- if let Some(port) = url.port() {
- Url::parse(&format!("{}://{}:{port}", url.scheme(), url.host_str()?)).ok()
- } else {
- Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?)).ok()
- }
+ // Create a new URL with just the scheme, host, and port
+ let mut base_url = url.clone();
+ base_url.set_path("");
+ base_url.set_query(None);
+ base_url.set_fragment(None);
+
+ // We keep the username and password intact
+ Some(Base::Remote(*base_url))
}
// other inputs do not have a URL to extract a base
_ => None,
@@ -101,6 +103,16 @@ mod test_base {
assert!(Base::try_from("data:text/plain,Hello?World#").is_err());
}
+ #[test]
+ fn test_valid_local_path_string_as_base() -> Result<()> {
+ let cases = vec!["/tmp/lychee", "/tmp/lychee/", "tmp/lychee/"];
+
+ for case in cases {
+ assert_eq!(Base::try_from(case)?, Base::Local(PathBuf::from(case)));
+ }
+ Ok(())
+ }
+
#[test]
fn test_valid_local() -> Result<()> {
let dir = tempfile::tempdir().unwrap();
@@ -123,7 +135,7 @@ mod test_base {
let url = Url::parse(url).unwrap();
let source = InputSource::RemoteUrl(Box::new(url.clone()));
let base = Base::from_source(&source);
- let expected = Url::parse(expected).unwrap();
+ let expected = Base::Remote(Url::parse(expected).unwrap());
assert_eq!(base, Some(expected));
}
}
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 86faacb..bb1415f 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -86,12 +86,24 @@ pub enum ErrorKind {
#[error("Error with base dir `{0}` : {1}")]
InvalidBase(String, String),
+ /// Cannot join the given text with the base URL
+ #[error("Cannot join '{0}' with the base URL")]
+ InvalidBaseJoin(String),
+
+ /// Cannot convert the given path to a URI
+ #[error("Cannot convert path '{0}' to a URI")]
+ InvalidPathToUri(String),
+
+ /// The given URI type is not supported
+ #[error("Unsupported URI type: '{0}'")]
+ UnsupportedUriType(String),
+
/// The given input can not be parsed into a valid URI remapping
#[error("Error remapping URL: `{0}`")]
InvalidUrlRemap(String),
/// The given path does not resolve to a valid file
- #[error("Cannot find local file {0}")]
+ #[error("Invalid file path: {0}")]
InvalidFile(PathBuf),
/// Error while traversing an input directory
@@ -246,6 +258,13 @@ impl PartialEq for ErrorKind {
}
(Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2,
(Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2,
+ (Self::InvalidFilePath(u1), Self::InvalidFilePath(u2)) => u1 == u2,
+ (Self::InvalidFragment(u1), Self::InvalidFragment(u2)) => u1 == u2,
+ (Self::InvalidUrlFromPath(p1), Self::InvalidUrlFromPath(p2)) => p1 == p2,
+ (Self::InvalidBase(b1, e1), Self::InvalidBase(b2, e2)) => b1 == b2 && e1 == e2,
+ (Self::InvalidUrlRemap(r1), Self::InvalidUrlRemap(r2)) => r1 == r2,
+ (Self::EmptyUrl, Self::EmptyUrl) => true,
+
_ => false,
}
}
@@ -281,6 +300,9 @@ impl Hash for ErrorKind {
Self::UnreachableEmailAddress(u, ..) => u.hash(state),
Self::InsecureURL(u, ..) => u.hash(state),
Self::InvalidBase(base, e) => (base, e).hash(state),
+ Self::InvalidBaseJoin(s) => s.hash(state),
+ Self::InvalidPathToUri(s) => s.hash(state),
+ Self::UnsupportedUriType(s) => s.hash(state),
Self::InvalidUrlRemap(remap) => (remap).hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
diff --git a/lychee-lib/src/types/status_code/selector.rs b/lychee-lib/src/types/status_code/selector.rs
index 060e097..c93b662 100644
--- a/lychee-lib/src/types/status_code/selector.rs
+++ b/lychee-lib/src/types/status_code/selector.rs
@@ -5,7 +5,7 @@ use thiserror::Error;
use crate::{types::accept::AcceptRange, AcceptRangeError};
-#[derive(Debug, Error)]
+#[derive(Debug, Error, PartialEq)]
pub enum StatusCodeSelectorError {
#[error("invalid/empty input")]
InvalidInput,
diff --git a/lychee-lib/src/types/uri/raw.rs b/lychee-lib/src/types/uri/raw.rs
index f78b23c..3ad51f2 100644
--- a/lychee-lib/src/types/uri/raw.rs
+++ b/lychee-lib/src/types/uri/raw.rs
@@ -19,12 +19,6 @@ pub struct RawUri {
pub attribute: Option,
}
-impl RawUri {
- // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
- pub(crate) fn is_anchor(&self) -> bool {
- self.text.starts_with('#')
- }
-}
impl Display for RawUri {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} (Attribute: {:?})", self.text, self.attribute)
@@ -40,17 +34,3 @@ impl From<&str> for RawUri {
}
}
}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_is_anchor() {
- let raw_uri = RawUri::from("#anchor");
- assert!(raw_uri.is_anchor());
-
- let raw_uri = RawUri::from("notan#anchor");
- assert!(!raw_uri.is_anchor());
- }
-}
diff --git a/lychee-lib/src/types/uri/valid.rs b/lychee-lib/src/types/uri/valid.rs
index 573f685..522acc4 100644
--- a/lychee-lib/src/types/uri/valid.rs
+++ b/lychee-lib/src/types/uri/valid.rs
@@ -378,4 +378,9 @@ mod tests {
website("https://example.com")
);
}
+
+ #[test]
+ fn test_file_uri() {
+ assert!(Uri::try_from("file:///path/to/file").unwrap().is_file());
+ }
}
diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs
index 70f82ce..7867e50 100644
--- a/lychee-lib/src/utils/request.rs
+++ b/lychee-lib/src/utils/request.rs
@@ -1,4 +1,4 @@
-use log::info;
+use log::warn;
use percent_encoding::percent_decode_str;
use reqwest::Url;
use std::{
@@ -13,156 +13,374 @@ use crate::{
Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri,
};
-const MAX_TRUNCATED_STR_LEN: usize = 100;
-
/// Extract basic auth credentials for a given URL.
-fn credentials(extractor: &Option, uri: &Uri) -> Option {
+fn extract_credentials(
+ extractor: &Option,
+ uri: &Uri,
+) -> Option {
extractor.as_ref().and_then(|ext| ext.matches(uri))
}
+/// Create a request from a raw URI.
+fn create_request(
+ raw_uri: &RawUri,
+ source: &InputSource,
+ base: &Option,
+ extractor: &Option,
+) -> Result {
+ let uri = try_parse_into_uri(raw_uri, source, base)?;
+ let source = truncate_source(source);
+ let element = raw_uri.element.clone();
+ let attribute = raw_uri.attribute.clone();
+ let credentials = extract_credentials(extractor, &uri);
+
+ Ok(Request::new(uri, source, element, attribute, credentials))
+}
+
+/// Try to parse the raw URI into a `Uri`.
+///
+/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text.
+/// If the base URL is not available, create a URI from the file path.
+///
+/// # Errors
+///
+/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base
+/// to create a valid URI.
+/// - If a URI cannot be created from the file path.
+/// - If the source is not a file path (i.e. the URI type is not supported).
+fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option) -> Result {
+ let text = raw_uri.text.clone();
+ let uri = match Uri::try_from(raw_uri.clone()) {
+ Ok(uri) => uri,
+ Err(_) => match base {
+ Some(base_url) => match base_url.join(&text) {
+ Some(url) => Uri { url },
+ None => return Err(ErrorKind::InvalidBaseJoin(text.clone())),
+ },
+ None => match source {
+ InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?,
+ _ => return Err(ErrorKind::UnsupportedUriType(text)),
+ },
+ },
+ };
+ Ok(uri)
+}
+
+// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
+pub(crate) fn is_anchor(text: &str) -> bool {
+ text.starts_with('#')
+}
+
+/// Create a URI from a file path
+///
+/// # Errors
+///
+/// - If the link text is an anchor and the file name cannot be extracted from the file path.
+/// - If the path cannot be resolved.
+/// - If the resolved path cannot be converted to a URL.
+fn create_uri_from_file_path(
+ file_path: &Path,
+ link_text: &str,
+ base: &Option,
+) -> Result {
+ let target_path = if is_anchor(link_text) {
+ // For anchors, we need to append the anchor to the file name.
+ let file_name = file_path
+ .file_name()
+ .and_then(|name| name.to_str())
+ .ok_or_else(|| ErrorKind::InvalidFile(file_path.to_path_buf()))?;
+
+ format!("{file_name}{link_text}")
+ } else {
+ link_text.to_string()
+ };
+ let Ok(constructed_url) = resolve_and_create_url(file_path, &target_path, base) else {
+ return Err(ErrorKind::InvalidPathToUri(target_path));
+ };
+ Ok(Uri {
+ url: constructed_url,
+ })
+}
+
+/// Truncate the source in case it gets too long
+///
+/// This is only needed for string inputs.
+/// For other inputs, the source is simply a "label" (an enum variant).
+// TODO: This would not be necessary if we used `Cow` for the source.
+fn truncate_source(source: &InputSource) -> InputSource {
+ const MAX_TRUNCATED_STR_LEN: usize = 100;
+
+ match source {
+ InputSource::String(s) => {
+ InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
+ }
+ other => other.clone(),
+ }
+}
+
/// Create requests out of the collected URLs.
/// Only keeps "valid" URLs. This filters out anchors for example.
+///
+/// If a URLs is ignored (because of the current settings),
+/// it will not be added to the `HashSet`.
pub(crate) fn create(
uris: Vec,
input_content: &InputContent,
base: &Option,
extractor: &Option,
-) -> Result> {
- let base_url = Base::from_source(&input_content.source);
+) -> HashSet {
+ let base = base
+ .clone()
+ .or_else(|| Base::from_source(&input_content.source));
- let requests: Result>> = uris
- .into_iter()
- .map(|raw_uri| {
- let is_anchor = raw_uri.is_anchor();
- let text = raw_uri.text.clone();
- let element = raw_uri.element.clone();
- let attribute = raw_uri.attribute.clone();
-
- // Truncate the source in case it gets too long Ideally we should
- // avoid the initial String allocation for `source` altogether
- let source = match &input_content.source {
- InputSource::String(s) => {
- InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
+ uris.into_iter()
+ .filter_map(|raw_uri| {
+ match create_request(&raw_uri, &input_content.source, &base, extractor) {
+ Ok(request) => Some(request),
+ Err(e) => {
+ warn!("Error creating request: {:?}", e);
+ None
}
- // Cloning is cheap here
- c => c.clone(),
- };
-
- if let Ok(uri) = Uri::try_from(raw_uri) {
- let credentials = credentials(extractor, &uri);
-
- Ok(Some(Request::new(
- uri,
- source,
- element,
- attribute,
- credentials,
- )))
- } else if let Some(url) = base.as_ref().and_then(|u| u.join(&text)) {
- let uri = Uri { url };
- let credentials = credentials(extractor, &uri);
-
- Ok(Some(Request::new(
- uri,
- source,
- element,
- attribute,
- credentials,
- )))
- } else if let InputSource::FsPath(root) = &input_content.source {
- let path = if is_anchor {
- match root.file_name() {
- Some(file_name) => match file_name.to_str() {
- Some(valid_str) => valid_str.to_string() + &text,
- None => return Err(ErrorKind::InvalidFile(root.clone())),
- },
- None => return Err(ErrorKind::InvalidFile(root.clone())),
- }
- } else {
- text
- };
-
- if let Some(url) = create_uri_from_path(root, &path, base)? {
- let uri = Uri { url };
- let credentials = credentials(extractor, &uri);
-
- Ok(Some(Request::new(
- uri,
- source,
- element,
- attribute,
- credentials,
- )))
- } else {
- // In case we cannot create a URI from a path but we didn't receive an error,
- // it means that some preconditions were not met, e.g. the `base_url` wasn't set.
- Ok(None)
- }
- } else if let Some(url) = construct_url(&base_url, &text) {
- if base.is_some() {
- Ok(None)
- } else {
- let uri = Uri { url: url? };
- let credentials = credentials(extractor, &uri);
-
- Ok(Some(Request::new(
- uri,
- source,
- element,
- attribute,
- credentials,
- )))
- }
- } else {
- info!("Handling of `{}` not implemented yet", text);
- Ok(None)
}
})
- .collect();
- let requests: Vec = requests?.into_iter().flatten().collect();
- Ok(HashSet::from_iter(requests))
+ .collect()
}
-fn construct_url(base: &Option, text: &str) -> Option> {
- base.as_ref().map(|base| {
- base.join(text)
- .map_err(|e| ErrorKind::ParseUrl(e, format!("{base}{text}")))
- })
-}
+/// Create a URI from a path
+///
+/// `src_path` is the path of the source file.
+/// `dest_path` is the path being linked to.
+/// The optional `base_uri` specifies the base URI to resolve the destination path against.
+///
+/// # Errors
+///
+/// - If the percent-decoded destination path cannot be decoded as UTF-8.
+/// - The path cannot be resolved
+/// - The resolved path cannot be converted to a URL.
+fn resolve_and_create_url(
+ src_path: &Path,
+ dest_path: &str,
+ base_uri: &Option,
+) -> Result {
+ let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path);
-fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result