From 3094bbca335373156e5b73c89027a172e842a4e1 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sat, 26 Oct 2024 04:07:37 +0200 Subject: [PATCH] Add support for relative links (#1489) This commit introduces several improvements to the file checking process and URI handling: - Extract file checking logic into separate `Checker` structs (`FileChecker`, `WebsiteChecker`, `MailChecker`) - Improve handling of relative and absolute file paths - Enhance URI parsing and creation from file paths - Refactor `create_request` function for better clarity and error handling These changes provide better support for resolving relative links, handling different base URLs, and working with file paths. Fixes https://github.com/lycheeverse/lychee/issues/1296 and https://github.com/lycheeverse/lychee/issues/1480 --- fixtures/resolve_paths/about/index.html | 8 + .../resolve_paths/another page/index.html | 0 fixtures/resolve_paths/index.html | 21 + lychee-bin/src/client.rs | 1 + lychee-bin/src/commands/check.rs | 10 +- lychee-bin/src/formatters/response/color.rs | 10 +- lychee-bin/tests/cli.rs | 67 ++- lychee-lib/src/checker.rs | 80 --- lychee-lib/src/checker/file.rs | 179 +++++++ lychee-lib/src/checker/mail.rs | 57 +++ lychee-lib/src/checker/mod.rs | 7 + lychee-lib/src/checker/website.rs | 226 +++++++++ lychee-lib/src/client.rs | 310 +++--------- lychee-lib/src/collector.rs | 114 ++++- lychee-lib/src/test_utils.rs | 10 + lychee-lib/src/types/base.rs | 36 +- lychee-lib/src/types/error.rs | 24 +- lychee-lib/src/types/status_code/selector.rs | 2 +- lychee-lib/src/types/uri/raw.rs | 20 - lychee-lib/src/types/uri/valid.rs | 5 + lychee-lib/src/utils/request.rs | 468 +++++++++++++----- 21 files changed, 1133 insertions(+), 522 deletions(-) create mode 100644 fixtures/resolve_paths/about/index.html create mode 100644 fixtures/resolve_paths/another page/index.html create mode 100644 fixtures/resolve_paths/index.html delete mode 100644 lychee-lib/src/checker.rs create mode 100644 lychee-lib/src/checker/file.rs create mode 100644 lychee-lib/src/checker/mail.rs create mode 100644 lychee-lib/src/checker/mod.rs create mode 100644 lychee-lib/src/checker/website.rs diff --git a/fixtures/resolve_paths/about/index.html b/fixtures/resolve_paths/about/index.html new file mode 100644 index 0000000..3141b66 --- /dev/null +++ b/fixtures/resolve_paths/about/index.html @@ -0,0 +1,8 @@ + + + About + + +

About

+ + diff --git a/fixtures/resolve_paths/another page/index.html b/fixtures/resolve_paths/another page/index.html new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/resolve_paths/index.html b/fixtures/resolve_paths/index.html new file mode 100644 index 0000000..79d285a --- /dev/null +++ b/fixtures/resolve_paths/index.html @@ -0,0 +1,21 @@ + + + Index + + +

Index Title

+

+

+

+ + \ No newline at end of file diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index b03fc35..68bf3ca 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -55,6 +55,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - ClientBuilder::builder() .remaps(remaps) + .base(cfg.base.clone()) .includes(includes) .excludes(excludes) .exclude_all_private(cfg.exclude_all_private) diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index fd65499..2ffc0ed 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -429,14 +429,12 @@ mod tests { #[tokio::test] async fn test_invalid_url() { - // Run a normal request with an invalid Url let client = ClientBuilder::builder().build().client().unwrap(); - let request = Request::try_from("http://\"").unwrap(); - let response = check_url(&client, request).await; - assert!(response.status().is_error()); + let uri = Uri::try_from("http://\"").unwrap(); + let response = client.check_website(&uri, None).await.unwrap(); assert!(matches!( - response.status(), - Status::Error(ErrorKind::InvalidURI(_)) + response, + Status::Unsupported(ErrorKind::BuildRequestClient(_)) )); } diff --git a/lychee-bin/src/formatters/response/color.rs b/lychee-bin/src/formatters/response/color.rs index 137c70a..9aa12df 100644 --- a/lychee-bin/src/formatters/response/color.rs +++ b/lychee-bin/src/formatters/response/color.rs @@ -65,12 +65,18 @@ mod tests { } } + #[cfg(test)] + /// Helper function to strip ANSI color codes for tests + fn strip_ansi_codes(s: &str) -> String { + console::strip_ansi_codes(s).to_string() + } + #[test] fn test_format_response_with_ok_status() { let formatter = ColorFormatter; let body = mock_response_body(Status::Ok(StatusCode::OK), "https://example.com"); assert_eq!( - formatter.format_response(&body), + strip_ansi_codes(&formatter.format_response(&body)), " [200] https://example.com/" ); } @@ -83,7 +89,7 @@ mod tests { "https://example.com/404", ); assert_eq!( - formatter.format_response(&body), + strip_ansi_codes(&formatter.format_response(&body)), " [ERROR] https://example.com/404" ); } diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 2d23b9d..fcb7530 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -267,17 +267,17 @@ mod cli { #[test] fn test_resolve_paths() { let mut cmd = main_command(); - let offline_dir = fixtures_path().join("offline"); + let dir = fixtures_path().join("resolve_paths"); cmd.arg("--offline") .arg("--base") - .arg(&offline_dir) - .arg(offline_dir.join("index.html")) + .arg(&dir) + .arg(dir.join("index.html")) .env_clear() .assert() .success() - .stdout(contains("4 Total")) - .stdout(contains("4 OK")); + .stdout(contains("3 Total")) + .stdout(contains("3 OK")); } #[test] @@ -944,13 +944,17 @@ mod cli { // check content of cache file let data = fs::read_to_string(&cache_file)?; + + if data.is_empty() { + println!("Cache file is empty!"); + } + assert!(data.contains(&format!("{}/,200", mock_server_ok.uri()))); assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri()))); assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri()))); // clear the cache file fs::remove_file(&cache_file)?; - Ok(()) } @@ -1216,8 +1220,9 @@ mod cli { Ok(()) } - /// If base-dir is not set, don't throw an error in case we encounter - /// an absolute local link within a file (e.g. `/about`). + /// If `base-dir` is not set, don't throw an error in case we encounter + /// an absolute local link (e.g. `/about`) within a file. + /// Instead, simply ignore the link. #[test] fn test_ignore_absolute_local_links_without_base() -> Result<()> { let mut cmd = main_command(); @@ -1409,9 +1414,7 @@ mod cli { .arg("./NOT-A-REAL-TEST-FIXTURE.md") .assert() .failure() - .stderr(contains( - "Cannot find local file ./NOT-A-REAL-TEST-FIXTURE.md", - )); + .stderr(contains("Invalid file path: ./NOT-A-REAL-TEST-FIXTURE.md")); Ok(()) } @@ -1667,4 +1670,46 @@ mod cli { .success() .stdout(contains("0 Errors")); } + + /// Test relative paths + /// + /// Imagine a web server hosting a site with the following structure: + /// root + /// └── test + /// ├── index.html + /// └── next.html + /// + /// where `root/test/index.html` contains `next` + /// When checking the link in `root/test/index.html` we should be able to + /// resolve the relative path to `root/test/next.html` + /// + /// Note that the relative path is not resolved to the root of the server + /// but relative to the file that contains the link. + #[tokio::test] + async fn test_resolve_relative_paths_in_subfolder() -> Result<()> { + let mock_server = wiremock::MockServer::start().await; + + let body = r#"next"#; + wiremock::Mock::given(wiremock::matchers::method("GET")) + .and(wiremock::matchers::path("/test/index.html")) + .respond_with(wiremock::ResponseTemplate::new(200).set_body_string(body)) + .mount(&mock_server) + .await; + + wiremock::Mock::given(wiremock::matchers::method("GET")) + .and(wiremock::matchers::path("/test/next.html")) + .respond_with(wiremock::ResponseTemplate::new(200)) + .mount(&mock_server) + .await; + + let mut cmd = main_command(); + cmd.arg("--verbose") + .arg(format!("{}/test/index.html", mock_server.uri())) + .assert() + .success() + .stdout(contains("1 Total")) + .stdout(contains("0 Errors")); + + Ok(()) + } } diff --git a/lychee-lib/src/checker.rs b/lychee-lib/src/checker.rs deleted file mode 100644 index 428551a..0000000 --- a/lychee-lib/src/checker.rs +++ /dev/null @@ -1,80 +0,0 @@ -use crate::{ - chain::{ChainResult, Handler}, - retry::RetryExt, - Status, -}; -use async_trait::async_trait; -use http::StatusCode; -use reqwest::Request; -use std::{collections::HashSet, time::Duration}; - -#[derive(Debug, Clone)] -pub(crate) struct Checker { - retry_wait_time: Duration, - max_retries: u64, - reqwest_client: reqwest::Client, - accepted: Option>, -} - -impl Checker { - pub(crate) const fn new( - retry_wait_time: Duration, - max_retries: u64, - reqwest_client: reqwest::Client, - accepted: Option>, - ) -> Self { - Self { - retry_wait_time, - max_retries, - reqwest_client, - accepted, - } - } - - /// Retry requests up to `max_retries` times - /// with an exponential backoff. - pub(crate) async fn retry_request(&self, request: Request) -> Status { - let mut retries: u64 = 0; - let mut wait_time = self.retry_wait_time; - - let mut status = self.check_default(clone_unwrap(&request)).await; - while retries < self.max_retries { - if status.is_success() || !status.should_retry() { - return status; - } - retries += 1; - tokio::time::sleep(wait_time).await; - wait_time = wait_time.saturating_mul(2); - status = self.check_default(clone_unwrap(&request)).await; - } - status - } - - /// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest). - async fn check_default(&self, request: Request) -> Status { - match self.reqwest_client.execute(request).await { - Ok(ref response) => Status::new(response, self.accepted.clone()), - Err(e) => e.into(), - } - } -} - -/// Clones a `reqwest::Request`. -/// -/// # Safety -/// -/// This panics if the request cannot be cloned. This should only happen if the -/// request body is a `reqwest` stream. We disable the `stream` feature, so the -/// body should never be a stream. -/// -/// See -fn clone_unwrap(request: &Request) -> Request { - request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled") -} - -#[async_trait] -impl Handler for Checker { - async fn handle(&mut self, input: Request) -> ChainResult { - ChainResult::Done(self.retry_request(input).await) - } -} diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs new file mode 100644 index 0000000..0060bfb --- /dev/null +++ b/lychee-lib/src/checker/file.rs @@ -0,0 +1,179 @@ +use http::StatusCode; +use log::warn; +use std::path::{Path, PathBuf}; + +use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri}; + +/// A utility for checking the existence and validity of file-based URIs. +/// +/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths. +/// It supports base path resolution, fallback extensions for files without extensions, +/// and optional fragment checking for HTML files. +#[derive(Debug, Clone)] +pub(crate) struct FileChecker { + /// Base path or URL used for resolving relative paths. + base: Option, + /// List of file extensions to try if the original path doesn't exist. + fallback_extensions: Vec, + /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files. + include_fragments: bool, + /// Utility for performing fragment checks in HTML files. + fragment_checker: FragmentChecker, +} + +impl FileChecker { + /// Creates a new `FileChecker` with the given configuration. + /// + /// # Arguments + /// + /// * `base` - Optional base path or URL for resolving relative paths. + /// * `fallback_extensions` - List of extensions to try if the original file is not found. + /// * `include_fragments` - Whether to check for fragment existence in HTML files. + pub(crate) fn new( + base: Option, + fallback_extensions: Vec, + include_fragments: bool, + ) -> Self { + Self { + base, + fallback_extensions, + include_fragments, + fragment_checker: FragmentChecker::new(), + } + } + + /// Checks the given file URI for existence and validity. + /// + /// This method resolves the URI to a file path, checks if the file exists, + /// and optionally checks for the existence of fragments in HTML files. + /// + /// # Arguments + /// + /// * `uri` - The URI to check. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the check. + pub(crate) async fn check(&self, uri: &Uri) -> Status { + let Ok(path) = uri.url.to_file_path() else { + return ErrorKind::InvalidFilePath(uri.clone()).into(); + }; + + let resolved_path = self.resolve_path(&path); + self.check_path(&resolved_path, uri).await + } + + /// Resolves the given path using the base path, if one is set. + /// + /// # Arguments + /// + /// * `path` - The path to resolve. + /// + /// # Returns + /// + /// Returns the resolved path as a `PathBuf`. + fn resolve_path(&self, path: &Path) -> PathBuf { + if let Some(Base::Local(base_path)) = &self.base { + if path.is_absolute() { + let absolute_base_path = if base_path.is_relative() { + std::env::current_dir().unwrap_or_default().join(base_path) + } else { + base_path.clone() + }; + + let stripped = path.strip_prefix("/").unwrap_or(path); + absolute_base_path.join(stripped) + } else { + base_path.join(path) + } + } else { + path.to_path_buf() + } + } + + /// Checks if the given path exists and performs additional checks if necessary. + /// + /// # Arguments + /// + /// * `path` - The path to check. + /// * `uri` - The original URI, used for error reporting. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the check. + async fn check_path(&self, path: &Path, uri: &Uri) -> Status { + if path.exists() { + return self.check_existing_path(path, uri).await; + } + + self.check_with_fallback_extensions(path, uri).await + } + + /// Checks an existing path, optionally verifying fragments for HTML files. + /// + /// # Arguments + /// + /// * `path` - The path to check. + /// * `uri` - The original URI, used for error reporting. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the check. + async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status { + if self.include_fragments { + self.check_fragment(path, uri).await + } else { + Status::Ok(StatusCode::OK) + } + } + + /// Attempts to find a file by trying different extensions specified in `fallback_extensions`. + /// + /// # Arguments + /// + /// * `path` - The original path to check. + /// * `uri` - The original URI, used for error reporting. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the check. + async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status { + let mut path_buf = path.to_path_buf(); + + // If the path already has an extension, try it first + if path_buf.extension().is_some() && path_buf.exists() { + return self.check_existing_path(&path_buf, uri).await; + } + + // Try fallback extensions + for ext in &self.fallback_extensions { + path_buf.set_extension(ext); + if path_buf.exists() { + return self.check_existing_path(&path_buf, uri).await; + } + } + + ErrorKind::InvalidFilePath(uri.clone()).into() + } + + /// Checks for the existence of a fragment in an HTML file. + /// + /// # Arguments + /// + /// * `path` - The path to the HTML file. + /// * `uri` - The original URI, containing the fragment to check. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the fragment check. + async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { + match self.fragment_checker.check(path, &uri.url).await { + Ok(true) => Status::Ok(StatusCode::OK), + Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(), + Err(err) => { + warn!("Skipping fragment check due to the following error: {err}"); + Status::Ok(StatusCode::OK) + } + } + } +} diff --git a/lychee-lib/src/checker/mail.rs b/lychee-lib/src/checker/mail.rs new file mode 100644 index 0000000..e7dcef7 --- /dev/null +++ b/lychee-lib/src/checker/mail.rs @@ -0,0 +1,57 @@ +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use http::StatusCode; + +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use crate::ErrorKind; + +use crate::{Status, Uri}; + +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; + +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use crate::types::mail; + +/// A utility for checking the validity of email addresses. +/// +/// `EmailChecker` is responsible for validating email addresses, +/// optionally performing reachability checks when the appropriate +/// features are enabled. +#[derive(Debug, Clone)] +pub(crate) struct MailChecker {} + +impl MailChecker { + /// Creates a new `EmailChecker`. + pub(crate) const fn new() -> Self { + Self {} + } + + /// Check a mail address, or equivalently a `mailto` URI. + /// + /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`), + /// which are ignored by this check. They are not part of the mail address + /// and instead passed to a mail client. + #[cfg(all(feature = "email-check", feature = "native-tls"))] + pub(crate) async fn check_mail(&self, uri: &Uri) -> Status { + self.perform_email_check(uri).await + } + + /// Ignore the mail check if the `email-check` and `native-tls` features are not enabled. + #[cfg(not(all(feature = "email-check", feature = "native-tls")))] + pub(crate) async fn check_mail(&self, _uri: &Uri) -> Status { + Status::Excluded + } + + #[cfg(all(feature = "email-check", feature = "native-tls"))] + async fn perform_email_check(&self, uri: &Uri) -> Status { + let address = uri.url.path().to_string(); + let input = CheckEmailInput::new(address); + let result = &(check_email(&input).await); + + if let Reachable::Invalid = result.is_reachable { + ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into() + } else { + Status::Ok(StatusCode::OK) + } + } +} diff --git a/lychee-lib/src/checker/mod.rs b/lychee-lib/src/checker/mod.rs new file mode 100644 index 0000000..bfbef9d --- /dev/null +++ b/lychee-lib/src/checker/mod.rs @@ -0,0 +1,7 @@ +//! Checker Module +//! +//! This module contains all checkers, which are responsible for checking the status of a URL. + +pub(crate) mod file; +pub(crate) mod mail; +pub(crate) mod website; diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs new file mode 100644 index 0000000..62cadaf --- /dev/null +++ b/lychee-lib/src/checker/website.rs @@ -0,0 +1,226 @@ +use crate::{ + chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain}, + quirks::Quirks, + retry::RetryExt, + types::uri::github::GithubUri, + BasicAuthCredentials, ErrorKind, Status, Uri, +}; +use async_trait::async_trait; +use http::StatusCode; +use octocrab::Octocrab; +use reqwest::Request; +use std::{collections::HashSet, time::Duration}; + +#[derive(Debug, Clone)] +pub(crate) struct WebsiteChecker { + /// Request method used for making requests. + method: reqwest::Method, + + /// The HTTP client used for requests. + reqwest_client: reqwest::Client, + + /// GitHub client used for requests. + github_client: Option, + + /// The chain of plugins to be executed on each request. + plugin_request_chain: RequestChain, + + /// Maximum number of retries per request before returning an error. + max_retries: u64, + + /// Initial wait time between retries of failed requests. This doubles after + /// each failure. + retry_wait_time: Duration, + + /// Set of accepted return codes / status codes. + /// + /// Unmatched return codes/ status codes are deemed as errors. + accepted: Option>, + + /// Requires using HTTPS when it's available. + /// + /// This would treat unencrypted links as errors when HTTPS is available. + require_https: bool, +} + +impl WebsiteChecker { + #[allow(clippy::too_many_arguments)] + pub(crate) const fn new( + method: reqwest::Method, + retry_wait_time: Duration, + max_retries: u64, + reqwest_client: reqwest::Client, + accepted: Option>, + github_client: Option, + require_https: bool, + plugin_request_chain: RequestChain, + ) -> Self { + Self { + method, + reqwest_client, + github_client, + plugin_request_chain, + max_retries, + retry_wait_time, + accepted, + require_https, + } + } + + /// Retry requests up to `max_retries` times + /// with an exponential backoff. + pub(crate) async fn retry_request(&self, request: Request) -> Status { + let mut retries: u64 = 0; + let mut wait_time = self.retry_wait_time; + let mut status = self.check_default(clone_unwrap(&request)).await; + while retries < self.max_retries { + if status.is_success() || !status.should_retry() { + return status; + } + retries += 1; + tokio::time::sleep(wait_time).await; + wait_time = wait_time.saturating_mul(2); + status = self.check_default(clone_unwrap(&request)).await; + } + status + } + + /// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest). + async fn check_default(&self, request: Request) -> Status { + match self.reqwest_client.execute(request).await { + Ok(ref response) => Status::new(response, self.accepted.clone()), + Err(e) => e.into(), + } + } + + /// Checks the given URI of a website. + /// + /// # Errors + /// + /// This returns an `Err` if + /// - The URI is invalid. + /// - The request failed. + /// - The response status code is not accepted. + /// - The URI cannot be converted to HTTPS. + pub(crate) async fn check_website( + &self, + uri: &Uri, + credentials: Option, + ) -> Result { + let default_chain: RequestChain = Chain::new(vec![ + Box::::default(), + Box::new(credentials), + Box::new(self.clone()), + ]); + + match self.check_website_inner(uri, &default_chain).await { + Status::Ok(code) if self.require_https && uri.scheme() == "http" => { + if self + .check_website_inner(&uri.to_https()?, &default_chain) + .await + .is_success() + { + Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?))) + } else { + Ok(Status::Ok(code)) + } + } + s => Ok(s), + } + } + + /// Checks the given URI of a website. + /// + /// Unsupported schemes will be ignored + /// + /// Note: we use `inner` to improve compile times by avoiding monomorphization + /// + /// # Errors + /// + /// This returns an `Err` if + /// - The URI is invalid. + /// - The request failed. + /// - The response status code is not accepted. + async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status { + let request = self + .reqwest_client + .request(self.method.clone(), uri.as_str()) + .build(); + + let request = match request { + Ok(r) => r, + Err(e) => return e.into(), + }; + + let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain]) + .traverse(request) + .await; + + self.handle_github(status, uri).await + } + + // Pull out the heavy machinery in case of a failed normal request. + // This could be a GitHub URL and we ran into the rate limiter. + // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023) + async fn handle_github(&self, status: Status, uri: &Uri) -> Status { + if status.is_success() { + return status; + } + + if let Ok(github_uri) = GithubUri::try_from(uri) { + let status = self.check_github(github_uri).await; + if status.is_success() { + return status; + } + } + + status + } + + /// Check a `uri` hosted on `GitHub` via the GitHub API. + /// + /// # Caveats + /// + /// Files inside private repositories won't get checked and instead would + /// be reported as valid if the repository itself is reachable through the + /// API. + /// + /// A better approach would be to download the file through the API or + /// clone the repo, but we chose the pragmatic approach. + async fn check_github(&self, uri: GithubUri) -> Status { + let Some(client) = &self.github_client else { + return ErrorKind::MissingGitHubToken.into(); + }; + let repo = match client.repos(&uri.owner, &uri.repo).get().await { + Ok(repo) => repo, + Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(), + }; + if let Some(true) = repo.private { + return Status::Ok(StatusCode::OK); + } else if let Some(endpoint) = uri.endpoint { + return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo)) + .into(); + } + Status::Ok(StatusCode::OK) + } +} + +/// Clones a `reqwest::Request`. +/// +/// # Safety +/// +/// This panics if the request cannot be cloned. This should only happen if the +/// request body is a `reqwest` stream. We disable the `stream` feature, so the +/// body should never be a stream. +/// +/// See +fn clone_unwrap(request: &Request) -> Request { + request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled") +} + +#[async_trait] +impl Handler for WebsiteChecker { + async fn handle(&mut self, input: Request) -> ChainResult { + ChainResult::Done(self.retry_request(input).await) + } +} diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 4ba97b2..f9566f0 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -15,8 +15,6 @@ )] use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; -#[cfg(all(feature = "email-check", feature = "native-tls"))] -use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; use http::{ header::{HeaderMap, HeaderValue}, StatusCode, @@ -24,25 +22,21 @@ use http::{ use log::{debug, warn}; use octocrab::Octocrab; use regex::RegexSet; -use reqwest::{header, redirect, Url}; +use reqwest::{header, redirect}; use reqwest_cookie_store::CookieStoreMutex; use secrecy::{ExposeSecret, SecretString}; use typed_builder::TypedBuilder; use crate::{ - chain::{Chain, ClientRequestChains, RequestChain}, - checker::Checker, + chain::RequestChain, + checker::file::FileChecker, + checker::{mail::MailChecker, website::WebsiteChecker}, filter::{Excludes, Filter, Includes}, - quirks::Quirks, remap::Remaps, - types::uri::github::GithubUri, utils::fragment_checker::FragmentChecker, - ErrorKind, Request, Response, Result, Status, Uri, + Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri, }; -#[cfg(all(feature = "email-check", feature = "native-tls"))] -use crate::types::mail; - /// Default number of redirects before a request is deemed as failed, 5. pub const DEFAULT_MAX_REDIRECTS: usize = 5; /// Default number of retries before a request is deemed as failed, 3. @@ -248,6 +242,12 @@ pub struct ClientBuilder { /// Response timeout per request in seconds. timeout: Option, + /// Base for resolving paths. + /// + /// E.g. if the base is `/home/user/` and the path is `file.txt`, the + /// resolved path would be `/home/user/file.txt`. + base: Option, + /// Initial time between retries of failed requests. /// /// Defaults to [`DEFAULT_RETRY_WAIT_TIME_SECS`]. @@ -383,20 +383,28 @@ impl ClientBuilder { include_mail: self.include_mail, }; - Ok(Client { + let website_checker = WebsiteChecker::new( + self.method, + self.retry_wait_time, + self.max_retries, reqwest_client, + self.accepted, github_client, + self.require_https, + self.plugin_request_chain, + ); + + Ok(Client { remaps: self.remaps, - fallback_extensions: self.fallback_extensions, filter, - max_retries: self.max_retries, - retry_wait_time: self.retry_wait_time, - method: self.method, - accepted: self.accepted, - require_https: self.require_https, - include_fragments: self.include_fragments, + email_checker: MailChecker::new(), + website_checker, + file_checker: FileChecker::new( + self.base, + self.fallback_extensions, + self.include_fragments, + ), fragment_checker: FragmentChecker::new(), - plugin_request_chain: self.plugin_request_chain, }) } } @@ -407,50 +415,23 @@ impl ClientBuilder { /// options. #[derive(Debug, Clone)] pub struct Client { - /// Underlying `reqwest` client instance that handles the HTTP requests. - reqwest_client: reqwest::Client, - - /// Optional GitHub client that handles communications with GitHub. - github_client: Option, - /// Optional remapping rules for URIs matching pattern. remaps: Option, - /// Automatically append file extensions to `file://` URIs as needed - fallback_extensions: Vec, - /// Rules to decided whether each link should be checked or ignored. filter: Filter, - /// Maximum number of retries per request before returning an error. - max_retries: u64, + /// A checker for website URLs. + website_checker: WebsiteChecker, - /// Initial wait time between retries of failed requests. This doubles after - /// each failure. - retry_wait_time: Duration, + /// A checker for file URLs. + file_checker: FileChecker, - /// HTTP method used for requests, e.g. `GET` or `HEAD`. - /// - /// The same method will be used for all links. - method: reqwest::Method, - - /// Set of accepted return codes / status codes. - /// - /// Unmatched return codes/ status codes are deemed as errors. - accepted: Option>, - - /// Requires using HTTPS when it's available. - /// - /// This would treat unencrypted links as errors when HTTPS is available. - require_https: bool, - - /// Enable the checking of fragments in links. - include_fragments: bool, + /// A checker for email URLs. + email_checker: MailChecker, /// Caches Fragments fragment_checker: FragmentChecker, - - plugin_request_chain: RequestChain, } impl Client { @@ -463,7 +444,7 @@ impl Client { /// /// Returns an `Err` if: /// - `request` does not represent a valid URI. - /// - Encrypted connection for a HTTP URL is available but unused. (Only + /// - Encrypted connection for a HTTP URL is available but unused. (Only /// checked when `Client::require_https` is `true`.) #[allow(clippy::missing_panics_doc)] pub async fn check(&self, request: T) -> Result @@ -493,27 +474,22 @@ impl Client { return Ok(Response::new(uri.clone(), Status::Excluded, source)); } - let default_chain: RequestChain = Chain::new(vec![ - Box::::default(), - Box::new(credentials), - Box::new(Checker::new( - self.retry_wait_time, - self.max_retries, - self.reqwest_client.clone(), - self.accepted.clone(), - )), - ]); - let status = match uri.scheme() { + // We don't check tel: URIs + _ if uri.is_tel() => Status::Excluded, _ if uri.is_file() => self.check_file(uri).await, _ if uri.is_mail() => self.check_mail(uri).await, - _ if uri.is_tel() => Status::Excluded, - _ => self.check_website(uri, default_chain).await?, + _ => self.check_website(uri, credentials).await?, }; Ok(Response::new(uri.clone(), status, source)) } + /// Check a single file using the file checker. + pub async fn check_file(&self, uri: &Uri) -> Status { + self.file_checker.check(uri).await + } + /// Remap `uri` using the client-defined remapping rules. /// /// # Errors @@ -541,151 +517,17 @@ impl Client { /// - The request failed. /// - The response status code is not accepted. /// - The URI cannot be converted to HTTPS. - pub async fn check_website(&self, uri: &Uri, default_chain: RequestChain) -> Result { - match self.check_website_inner(uri, &default_chain).await { - Status::Ok(code) if self.require_https && uri.scheme() == "http" => { - if self - .check_website_inner(&uri.to_https()?, &default_chain) - .await - .is_success() - { - Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?))) - } else { - // HTTPS is not available for this URI, - // so the original HTTP URL is fine. - Ok(Status::Ok(code)) - } - } - s => Ok(s), - } + pub async fn check_website( + &self, + uri: &Uri, + credentials: Option, + ) -> Result { + self.website_checker.check_website(uri, credentials).await } - /// Checks the given URI of a website. - /// - /// Unsupported schemes will be ignored - /// - /// # Errors - /// - /// This returns an `Err` if - /// - The URI is invalid. - /// - The request failed. - /// - The response status code is not accepted. - pub async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status { - // Workaround for upstream reqwest panic - if validate_url(&uri.url) { - if matches!(uri.scheme(), "http" | "https") { - // This is a truly invalid URI with a known scheme. - // If we pass that to reqwest it would panic. - return Status::Error(ErrorKind::InvalidURI(uri.clone())); - } - // This is merely a URI with a scheme that is not supported by - // reqwest yet. It would be safe to pass that to reqwest and it - // wouldn't panic, but it's also unnecessary, because it would - // simply return an error. - return Status::Unsupported(ErrorKind::InvalidURI(uri.clone())); - } - - let request = self - .reqwest_client - .request(self.method.clone(), uri.as_str()) - .build(); - - let request = match request { - Ok(r) => r, - Err(e) => return e.into(), - }; - - let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain]) - .traverse(request) - .await; - - self.handle_github(status, uri).await - } - - // Pull out the heavy machinery in case of a failed normal request. - // This could be a GitHub URL and we ran into the rate limiter. - // TODO: We should first try to parse the URI as GitHub URI first (Lucius, Jan 2023) - async fn handle_github(&self, status: Status, uri: &Uri) -> Status { - if status.is_success() { - return status; - } - - if let Ok(github_uri) = GithubUri::try_from(uri) { - let status = self.check_github(github_uri).await; - // Only return GitHub status in case of success - // Otherwise return the original error, which has more information - if status.is_success() { - return status; - } - } - - status - } - - /// Check a `uri` hosted on `GitHub` via the GitHub API. - /// - /// # Caveats - /// - /// Files inside private repositories won't get checked and instead would - /// be reported as valid if the repository itself is reachable through the - /// API. - /// - /// A better approach would be to download the file through the API or - /// clone the repo, but we chose the pragmatic approach. - async fn check_github(&self, uri: GithubUri) -> Status { - let Some(client) = &self.github_client else { - return ErrorKind::MissingGitHubToken.into(); - }; - let repo = match client.repos(&uri.owner, &uri.repo).get().await { - Ok(repo) => repo, - Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(), - }; - if let Some(true) = repo.private { - // The private repo exists. Assume a given endpoint exists as well - // (e.g. `issues` in `github.com/org/private/issues`). This is not - // always the case but simplifies the check. - return Status::Ok(StatusCode::OK); - } else if let Some(endpoint) = uri.endpoint { - // The URI returned a non-200 status code from a normal request and - // now we find that this public repo is reachable through the API, - // so that must mean the full URI (which includes the additional - // endpoint) must be invalid. - return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo)) - .into(); - } - // Found public repo without endpoint - Status::Ok(StatusCode::OK) - } - - /// Check a `file` URI. - pub async fn check_file(&self, uri: &Uri) -> Status { - let Ok(path) = uri.url.to_file_path() else { - return ErrorKind::InvalidFilePath(uri.clone()).into(); - }; - - if path.exists() { - if self.include_fragments { - return self.check_fragment(&path, uri).await; - } - return Status::Ok(StatusCode::OK); - } - - if path.extension().is_some() { - return ErrorKind::InvalidFilePath(uri.clone()).into(); - } - - // if the path has no file extension, try to append some - let mut path_buf = path.clone(); - for ext in &self.fallback_extensions { - path_buf.set_extension(ext); - if path_buf.exists() { - if self.include_fragments { - return self.check_fragment(&path_buf, uri).await; - } - return Status::Ok(StatusCode::OK); - } - } - ErrorKind::InvalidFilePath(uri.clone()).into() + /// Checks a `mailto` URI. + pub async fn check_mail(&self, uri: &Uri) -> Status { + self.email_checker.check_mail(uri).await } /// Checks a `file` URI's fragment. @@ -699,43 +541,6 @@ impl Client { } } } - - /// Check a mail address, or equivalently a `mailto` URI. - /// - /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`), - /// which are ignored by this check. The are not part of the mail address - /// and instead passed to a mail client. - #[cfg(all(feature = "email-check", feature = "native-tls"))] - pub async fn check_mail(&self, uri: &Uri) -> Status { - let address = uri.url.path().to_string(); - let input = CheckEmailInput::new(address); - let result = &(check_email(&input).await); - - if let Reachable::Invalid = result.is_reachable { - ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into() - } else { - Status::Ok(StatusCode::OK) - } - } - - /// Check a mail address, or equivalently a `mailto` URI. - /// - /// This implementation simply excludes all email addresses. - #[cfg(not(all(feature = "email-check", feature = "native-tls")))] - #[allow(clippy::unused_async)] - pub async fn check_mail(&self, _uri: &Uri) -> Status { - Status::Excluded - } -} - -// Check if the given `Url` would cause `reqwest` to panic. -// This is a workaround for https://github.com/lycheeverse/lychee/issues/539 -// and can be removed once https://github.com/seanmonstar/reqwest/pull/1399 -// got merged. -// It is exactly the same check that reqwest runs internally, but unfortunately -// it `unwrap`s (and panics!) instead of returning an error, which we could handle. -fn validate_url(url: &Url) -> bool { - http::Uri::try_from(url.as_str()).is_err() } /// A shorthand function to check a single URI. @@ -777,7 +582,7 @@ mod tests { chain::{ChainResult, Handler, RequestChain}, mock_server, test_utils::get_mock_client_response, - Request, Status, Uri, + ErrorKind, Request, Status, Uri, }; #[tokio::test] @@ -1026,9 +831,14 @@ mod tests { #[tokio::test] async fn test_avoid_reqwest_panic() { let client = ClientBuilder::builder().build().client().unwrap(); - // This request will fail, but it won't panic + // This request will result in an Unsupported status, but it won't panic let res = client.check("http://\"").await.unwrap(); - assert!(res.status().is_error()); + + assert!(matches!( + res.status(), + Status::Unsupported(ErrorKind::BuildRequestClient(_)) + )); + assert!(res.status().is_unsupported()); } #[tokio::test] diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 341bb21..955bdd2 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,3 +1,4 @@ +use crate::InputSource; use crate::{ basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request, Base, Input, Request, Result, @@ -99,27 +100,31 @@ impl Collector { /// /// Will return `Err` if links cannot be extracted from an input pub fn collect_links(self, inputs: Vec) -> impl Stream> { - let base = self.base; + let skip_missing_inputs = self.skip_missing_inputs; + let skip_hidden = self.skip_hidden; + let skip_ignored = self.skip_ignored; + let global_base = self.base; stream::iter(inputs) - .par_then_unordered(None, move |input| async move { - input.get_contents( - self.skip_missing_inputs, - self.skip_hidden, - self.skip_ignored, - ) + .par_then_unordered(None, move |input| { + let default_base = global_base.clone(); + async move { + let base = match &input.source { + InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(), + _ => default_base, + }; + input + .get_contents(skip_missing_inputs, skip_hidden, skip_ignored) + .map(move |content| (content, base.clone())) + } }) .flatten() - .par_then_unordered(None, move |content| { - // send to parallel worker - let base = base.clone(); + .par_then_unordered(None, move |(content, base)| { let basic_auth_extractor = self.basic_auth_extractor.clone(); async move { let content = content?; - let extractor = Extractor::new(self.use_html5ever, self.include_verbatim); let uris: Vec = extractor.extract(&content); - - let requests = request::create(uris, &content, &base, &basic_auth_extractor)?; + let requests = request::create(uris, &content, &base, &basic_auth_extractor); Result::Ok(stream::iter(requests.into_iter().map(Ok))) } }) @@ -137,7 +142,7 @@ mod tests { use super::*; use crate::{ mock_server, - test_utils::{load_fixture, mail, website}, + test_utils::{load_fixture, mail, path, website}, types::{FileType, Input, InputSource}, Result, Uri, }; @@ -426,4 +431,85 @@ mod tests { assert_eq!(links, expected_links); } + + #[tokio::test] + async fn test_multiple_remote_urls() { + let mock_server_1 = mock_server!( + StatusCode::OK, + set_body_string(r#"Link"#) + ); + let mock_server_2 = mock_server!( + StatusCode::OK, + set_body_string(r#"Link"#) + ); + + let inputs = vec![ + Input { + source: InputSource::RemoteUrl(Box::new( + Url::parse(&format!( + "{}/foo/index.html", + mock_server_1.uri().trim_end_matches('/') + )) + .unwrap(), + )), + file_type_hint: Some(FileType::Html), + excluded_paths: None, + }, + Input { + source: InputSource::RemoteUrl(Box::new( + Url::parse(&format!( + "{}/bar/index.html", + mock_server_2.uri().trim_end_matches('/') + )) + .unwrap(), + )), + file_type_hint: Some(FileType::Html), + excluded_paths: None, + }, + ]; + + let links = collect(inputs, None).await; + + let expected_links = HashSet::from_iter([ + website(&format!( + "{}/foo/relative.html", + mock_server_1.uri().trim_end_matches('/') + )), + website(&format!( + "{}/bar/relative.html", + mock_server_2.uri().trim_end_matches('/') + )), + ]); + + assert_eq!(links, expected_links); + } + + #[tokio::test] + async fn test_file_path_with_base() { + let base = Base::try_from("/path/to/root").unwrap(); + assert_eq!(base, Base::Local("/path/to/root".into())); + + let input = Input { + source: InputSource::String( + r#" + Index + About + Another + "# + .into(), + ), + file_type_hint: Some(FileType::Html), + excluded_paths: None, + }; + + let links = collect(vec![input], Some(base)).await; + + let expected_links = HashSet::from_iter([ + path("/path/to/root/index.html"), + path("/path/to/root/about.html"), + path("/another.html"), + ]); + + assert_eq!(links, expected_links); + } } diff --git a/lychee-lib/src/test_utils.rs b/lychee-lib/src/test_utils.rs index 99fc6aa..ff7be8e 100644 --- a/lychee-lib/src/test_utils.rs +++ b/lychee-lib/src/test_utils.rs @@ -39,6 +39,16 @@ pub(crate) fn website(url: &str) -> Uri { Uri::from(Url::parse(url).expect("Expected valid Website URI")) } +/// Helper method to convert a `std::path::Path `into a URI with the `file` scheme +/// +/// # Panic +/// +/// This panics if the given path is not absolute, so it should only be used for +/// testing +pub(crate) fn path>(path: P) -> Uri { + Uri::from(Url::from_file_path(path.as_ref()).expect("Expected valid File URI")) +} + /// Creates a mail URI from a string pub(crate) fn mail(address: &str) -> Uri { if address.starts_with("mailto:") { diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 73a4097..b7b76c7 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -23,7 +23,10 @@ impl Base { pub(crate) fn join(&self, link: &str) -> Option { match self { Self::Remote(url) => url.join(link).ok(), - Self::Local(_) => None, + Self::Local(path) => { + let full_path = path.join(link); + Url::from_file_path(full_path).ok() + } } } @@ -36,18 +39,17 @@ impl Base { } } - pub(crate) fn from_source(source: &InputSource) -> Option { + pub(crate) fn from_source(source: &InputSource) -> Option { match &source { InputSource::RemoteUrl(url) => { - // TODO: This should be refactored. - // Cases like https://user:pass@example.com are not handled - // We can probably use the original URL and just replace the - // path component in the caller of this function - if let Some(port) = url.port() { - Url::parse(&format!("{}://{}:{port}", url.scheme(), url.host_str()?)).ok() - } else { - Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?)).ok() - } + // Create a new URL with just the scheme, host, and port + let mut base_url = url.clone(); + base_url.set_path(""); + base_url.set_query(None); + base_url.set_fragment(None); + + // We keep the username and password intact + Some(Base::Remote(*base_url)) } // other inputs do not have a URL to extract a base _ => None, @@ -101,6 +103,16 @@ mod test_base { assert!(Base::try_from("data:text/plain,Hello?World#").is_err()); } + #[test] + fn test_valid_local_path_string_as_base() -> Result<()> { + let cases = vec!["/tmp/lychee", "/tmp/lychee/", "tmp/lychee/"]; + + for case in cases { + assert_eq!(Base::try_from(case)?, Base::Local(PathBuf::from(case))); + } + Ok(()) + } + #[test] fn test_valid_local() -> Result<()> { let dir = tempfile::tempdir().unwrap(); @@ -123,7 +135,7 @@ mod test_base { let url = Url::parse(url).unwrap(); let source = InputSource::RemoteUrl(Box::new(url.clone())); let base = Base::from_source(&source); - let expected = Url::parse(expected).unwrap(); + let expected = Base::Remote(Url::parse(expected).unwrap()); assert_eq!(base, Some(expected)); } } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 86faacb..bb1415f 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -86,12 +86,24 @@ pub enum ErrorKind { #[error("Error with base dir `{0}` : {1}")] InvalidBase(String, String), + /// Cannot join the given text with the base URL + #[error("Cannot join '{0}' with the base URL")] + InvalidBaseJoin(String), + + /// Cannot convert the given path to a URI + #[error("Cannot convert path '{0}' to a URI")] + InvalidPathToUri(String), + + /// The given URI type is not supported + #[error("Unsupported URI type: '{0}'")] + UnsupportedUriType(String), + /// The given input can not be parsed into a valid URI remapping #[error("Error remapping URL: `{0}`")] InvalidUrlRemap(String), /// The given path does not resolve to a valid file - #[error("Cannot find local file {0}")] + #[error("Invalid file path: {0}")] InvalidFile(PathBuf), /// Error while traversing an input directory @@ -246,6 +258,13 @@ impl PartialEq for ErrorKind { } (Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2, (Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2, + (Self::InvalidFilePath(u1), Self::InvalidFilePath(u2)) => u1 == u2, + (Self::InvalidFragment(u1), Self::InvalidFragment(u2)) => u1 == u2, + (Self::InvalidUrlFromPath(p1), Self::InvalidUrlFromPath(p2)) => p1 == p2, + (Self::InvalidBase(b1, e1), Self::InvalidBase(b2, e2)) => b1 == b2 && e1 == e2, + (Self::InvalidUrlRemap(r1), Self::InvalidUrlRemap(r2)) => r1 == r2, + (Self::EmptyUrl, Self::EmptyUrl) => true, + _ => false, } } @@ -281,6 +300,9 @@ impl Hash for ErrorKind { Self::UnreachableEmailAddress(u, ..) => u.hash(state), Self::InsecureURL(u, ..) => u.hash(state), Self::InvalidBase(base, e) => (base, e).hash(state), + Self::InvalidBaseJoin(s) => s.hash(state), + Self::InvalidPathToUri(s) => s.hash(state), + Self::UnsupportedUriType(s) => s.hash(state), Self::InvalidUrlRemap(remap) => (remap).hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), diff --git a/lychee-lib/src/types/status_code/selector.rs b/lychee-lib/src/types/status_code/selector.rs index 060e097..c93b662 100644 --- a/lychee-lib/src/types/status_code/selector.rs +++ b/lychee-lib/src/types/status_code/selector.rs @@ -5,7 +5,7 @@ use thiserror::Error; use crate::{types::accept::AcceptRange, AcceptRangeError}; -#[derive(Debug, Error)] +#[derive(Debug, Error, PartialEq)] pub enum StatusCodeSelectorError { #[error("invalid/empty input")] InvalidInput, diff --git a/lychee-lib/src/types/uri/raw.rs b/lychee-lib/src/types/uri/raw.rs index f78b23c..3ad51f2 100644 --- a/lychee-lib/src/types/uri/raw.rs +++ b/lychee-lib/src/types/uri/raw.rs @@ -19,12 +19,6 @@ pub struct RawUri { pub attribute: Option, } -impl RawUri { - // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs - pub(crate) fn is_anchor(&self) -> bool { - self.text.starts_with('#') - } -} impl Display for RawUri { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{} (Attribute: {:?})", self.text, self.attribute) @@ -40,17 +34,3 @@ impl From<&str> for RawUri { } } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_is_anchor() { - let raw_uri = RawUri::from("#anchor"); - assert!(raw_uri.is_anchor()); - - let raw_uri = RawUri::from("notan#anchor"); - assert!(!raw_uri.is_anchor()); - } -} diff --git a/lychee-lib/src/types/uri/valid.rs b/lychee-lib/src/types/uri/valid.rs index 573f685..522acc4 100644 --- a/lychee-lib/src/types/uri/valid.rs +++ b/lychee-lib/src/types/uri/valid.rs @@ -378,4 +378,9 @@ mod tests { website("https://example.com") ); } + + #[test] + fn test_file_uri() { + assert!(Uri::try_from("file:///path/to/file").unwrap().is_file()); + } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 70f82ce..7867e50 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,4 +1,4 @@ -use log::info; +use log::warn; use percent_encoding::percent_decode_str; use reqwest::Url; use std::{ @@ -13,156 +13,374 @@ use crate::{ Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri, }; -const MAX_TRUNCATED_STR_LEN: usize = 100; - /// Extract basic auth credentials for a given URL. -fn credentials(extractor: &Option, uri: &Uri) -> Option { +fn extract_credentials( + extractor: &Option, + uri: &Uri, +) -> Option { extractor.as_ref().and_then(|ext| ext.matches(uri)) } +/// Create a request from a raw URI. +fn create_request( + raw_uri: &RawUri, + source: &InputSource, + base: &Option, + extractor: &Option, +) -> Result { + let uri = try_parse_into_uri(raw_uri, source, base)?; + let source = truncate_source(source); + let element = raw_uri.element.clone(); + let attribute = raw_uri.attribute.clone(); + let credentials = extract_credentials(extractor, &uri); + + Ok(Request::new(uri, source, element, attribute, credentials)) +} + +/// Try to parse the raw URI into a `Uri`. +/// +/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text. +/// If the base URL is not available, create a URI from the file path. +/// +/// # Errors +/// +/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base +/// to create a valid URI. +/// - If a URI cannot be created from the file path. +/// - If the source is not a file path (i.e. the URI type is not supported). +fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option) -> Result { + let text = raw_uri.text.clone(); + let uri = match Uri::try_from(raw_uri.clone()) { + Ok(uri) => uri, + Err(_) => match base { + Some(base_url) => match base_url.join(&text) { + Some(url) => Uri { url }, + None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), + }, + None => match source { + InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?, + _ => return Err(ErrorKind::UnsupportedUriType(text)), + }, + }, + }; + Ok(uri) +} + +// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs +pub(crate) fn is_anchor(text: &str) -> bool { + text.starts_with('#') +} + +/// Create a URI from a file path +/// +/// # Errors +/// +/// - If the link text is an anchor and the file name cannot be extracted from the file path. +/// - If the path cannot be resolved. +/// - If the resolved path cannot be converted to a URL. +fn create_uri_from_file_path( + file_path: &Path, + link_text: &str, + base: &Option, +) -> Result { + let target_path = if is_anchor(link_text) { + // For anchors, we need to append the anchor to the file name. + let file_name = file_path + .file_name() + .and_then(|name| name.to_str()) + .ok_or_else(|| ErrorKind::InvalidFile(file_path.to_path_buf()))?; + + format!("{file_name}{link_text}") + } else { + link_text.to_string() + }; + let Ok(constructed_url) = resolve_and_create_url(file_path, &target_path, base) else { + return Err(ErrorKind::InvalidPathToUri(target_path)); + }; + Ok(Uri { + url: constructed_url, + }) +} + +/// Truncate the source in case it gets too long +/// +/// This is only needed for string inputs. +/// For other inputs, the source is simply a "label" (an enum variant). +// TODO: This would not be necessary if we used `Cow` for the source. +fn truncate_source(source: &InputSource) -> InputSource { + const MAX_TRUNCATED_STR_LEN: usize = 100; + + match source { + InputSource::String(s) => { + InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect()) + } + other => other.clone(), + } +} + /// Create requests out of the collected URLs. /// Only keeps "valid" URLs. This filters out anchors for example. +/// +/// If a URLs is ignored (because of the current settings), +/// it will not be added to the `HashSet`. pub(crate) fn create( uris: Vec, input_content: &InputContent, base: &Option, extractor: &Option, -) -> Result> { - let base_url = Base::from_source(&input_content.source); +) -> HashSet { + let base = base + .clone() + .or_else(|| Base::from_source(&input_content.source)); - let requests: Result>> = uris - .into_iter() - .map(|raw_uri| { - let is_anchor = raw_uri.is_anchor(); - let text = raw_uri.text.clone(); - let element = raw_uri.element.clone(); - let attribute = raw_uri.attribute.clone(); - - // Truncate the source in case it gets too long Ideally we should - // avoid the initial String allocation for `source` altogether - let source = match &input_content.source { - InputSource::String(s) => { - InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect()) + uris.into_iter() + .filter_map(|raw_uri| { + match create_request(&raw_uri, &input_content.source, &base, extractor) { + Ok(request) => Some(request), + Err(e) => { + warn!("Error creating request: {:?}", e); + None } - // Cloning is cheap here - c => c.clone(), - }; - - if let Ok(uri) = Uri::try_from(raw_uri) { - let credentials = credentials(extractor, &uri); - - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) - } else if let Some(url) = base.as_ref().and_then(|u| u.join(&text)) { - let uri = Uri { url }; - let credentials = credentials(extractor, &uri); - - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) - } else if let InputSource::FsPath(root) = &input_content.source { - let path = if is_anchor { - match root.file_name() { - Some(file_name) => match file_name.to_str() { - Some(valid_str) => valid_str.to_string() + &text, - None => return Err(ErrorKind::InvalidFile(root.clone())), - }, - None => return Err(ErrorKind::InvalidFile(root.clone())), - } - } else { - text - }; - - if let Some(url) = create_uri_from_path(root, &path, base)? { - let uri = Uri { url }; - let credentials = credentials(extractor, &uri); - - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) - } else { - // In case we cannot create a URI from a path but we didn't receive an error, - // it means that some preconditions were not met, e.g. the `base_url` wasn't set. - Ok(None) - } - } else if let Some(url) = construct_url(&base_url, &text) { - if base.is_some() { - Ok(None) - } else { - let uri = Uri { url: url? }; - let credentials = credentials(extractor, &uri); - - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) - } - } else { - info!("Handling of `{}` not implemented yet", text); - Ok(None) } }) - .collect(); - let requests: Vec = requests?.into_iter().flatten().collect(); - Ok(HashSet::from_iter(requests)) + .collect() } -fn construct_url(base: &Option, text: &str) -> Option> { - base.as_ref().map(|base| { - base.join(text) - .map_err(|e| ErrorKind::ParseUrl(e, format!("{base}{text}"))) - }) -} +/// Create a URI from a path +/// +/// `src_path` is the path of the source file. +/// `dest_path` is the path being linked to. +/// The optional `base_uri` specifies the base URI to resolve the destination path against. +/// +/// # Errors +/// +/// - If the percent-decoded destination path cannot be decoded as UTF-8. +/// - The path cannot be resolved +/// - The resolved path cannot be converted to a URL. +fn resolve_and_create_url( + src_path: &Path, + dest_path: &str, + base_uri: &Option, +) -> Result { + let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path); -fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result> { - let (dst, frag) = url::remove_get_params_and_separate_fragment(dst); - // Avoid double-encoding already encoded destination paths by removing any - // potential encoding (e.g. `web%20site` becomes `web site`). - // That's because Url::from_file_path will encode the full URL in the end. - // This behavior cannot be configured. - // See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411 - // TODO: This is not a perfect solution. - // Ideally, only `src` and `base` should be URL encoded (as is done by - // `from_file_path` at the moment) while `dst` gets left untouched and simply - // appended to the end. - let decoded = percent_decode_str(dst).decode_utf8()?; - let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?; - match resolved { - Some(path) => Url::from_file_path(&path) - .map(|mut url| { - url.set_fragment(frag); - url - }) - .map(Some) - .map_err(|_e| ErrorKind::InvalidUrlFromPath(path)), - None => Ok(None), - } + // Decode the destination path to avoid double-encoding + // This addresses the issue mentioned in the original comment about double-encoding + let decoded_dest = percent_decode_str(dest_path).decode_utf8()?; + + let Ok(Some(resolved_path)) = path::resolve(src_path, &PathBuf::from(&*decoded_dest), base_uri) + else { + return Err(ErrorKind::InvalidPathToUri(decoded_dest.to_string())); + }; + + let Ok(mut url) = Url::from_file_path(&resolved_path) else { + return Err(ErrorKind::InvalidUrlFromPath(resolved_path.clone())); + }; + + url.set_fragment(fragment); + Ok(url) } #[cfg(test)] mod tests { use super::*; + use crate::types::FileType; + + #[test] + fn test_is_anchor() { + assert!(is_anchor("#anchor")); + assert!(!is_anchor("notan#anchor")); + } #[test] fn test_create_uri_from_path() { let result = - create_uri_from_path(&PathBuf::from("/README.md"), "test+encoding", &None).unwrap(); - assert_eq!(result.unwrap().as_str(), "file:///test+encoding"); + resolve_and_create_url(&PathBuf::from("/README.md"), "test+encoding", &None).unwrap(); + assert_eq!(result.as_str(), "file:///test+encoding"); + } + + fn create_input(content: &str, file_type: FileType) -> InputContent { + InputContent { + content: content.to_string(), + file_type, + source: InputSource::String(content.to_string()), + } + } + + #[test] + fn test_relative_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input( + r#"Relative Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("relative.html")]; + let requests = create(uris, &input, &base, &None); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/path/relative.html")); + } + + #[test] + fn test_absolute_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input( + r#"Absolute Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("https://another.com/page")]; + let requests = create(uris, &input, &base, &None); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://another.com/page")); + } + + #[test] + fn test_root_relative_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input( + r#"Root Relative Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("/root-relative")]; + let requests = create(uris, &input, &base, &None); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/root-relative")); + } + + #[test] + fn test_parent_directory_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input( + r#"Parent Directory Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("../parent")]; + let requests = create(uris, &input, &base, &None); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/parent")); + } + + #[test] + fn test_fragment_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input(r##"Fragment Link"##, FileType::Html); + + let uris = vec![RawUri::from("#fragment")]; + let requests = create(uris, &input, &base, &None); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/path/page.html#fragment")); + } + + #[test] + fn test_no_base_url_resolution() { + let base = None; + let input = create_input( + r#"Absolute Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("https://example.com/page")]; + let requests = create(uris, &input, &base, &None); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/page")); + } + + #[test] + fn test_create_request_from_relative_file_path() { + let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); + let input_source = InputSource::FsPath(PathBuf::from("page.html")); + + let actual = + create_request(&RawUri::from("file.html"), &input_source, &base, &None).unwrap(); + + assert_eq!( + actual, + Request::new( + Uri { + url: Url::from_file_path("/tmp/lychee/file.html").unwrap() + }, + input_source, + None, + None, + None, + ) + ); + } + + #[test] + fn test_create_request_from_absolute_file_path() { + let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); + let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); + + // Use an absolute path that's outside the base directory + let actual = create_request( + &RawUri::from("/usr/local/share/doc/example.html"), + &input_source, + &base, + &None, + ) + .unwrap(); + + assert_eq!( + actual, + Request::new( + Uri { + url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap() + }, + input_source, + None, + None, + None, + ) + ); + } + + #[test] + fn test_parse_relative_path_into_uri() { + let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); + let input = create_input( + r#"Relative Link"#, + FileType::Html, + ); + + let raw_uri = RawUri::from("relative.html"); + let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap(); + + assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html"); + } + + #[test] + fn test_parse_absolute_path_into_uri() { + let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); + let input = create_input( + r#"Absolute Link"#, + FileType::Html, + ); + + let raw_uri = RawUri::from("absolute.html"); + let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap(); + + assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html"); } }