Add support for relative links (#1489)

This commit introduces several improvements to the file checking process and URI handling:

- Extract file checking logic into separate `Checker` structs (`FileChecker`, `WebsiteChecker`, `MailChecker`)
- Improve handling of relative and absolute file paths
- Enhance URI parsing and creation from file paths
- Refactor `create_request` function for better clarity and error handling

These changes provide better support for resolving relative links, handling different base URLs, and working with file paths.

Fixes https://github.com/lycheeverse/lychee/issues/1296 and https://github.com/lycheeverse/lychee/issues/1480
This commit is contained in:
Matthias Endler 2024-10-26 04:07:37 +02:00 committed by GitHub
parent 87d5b56e4f
commit 3094bbca33
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 1133 additions and 522 deletions

View file

@ -0,0 +1,8 @@
<html>
<head>
<title>About</title>
</head>
<body>
<h1 id="fragment">About</h1>
</body>
</html>

View file

21
fixtures/resolve_paths/index.html vendored Normal file
View file

@ -0,0 +1,21 @@
<html>
<head>
<title>Index</title>
</head>
<body>
<h1>Index Title</h1>
<p>
<ul>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/about">About</a>
</li>
<li>
<a href="/another page">About</a>
</li>
</ul>
</p>
</body>
</html>

View file

@ -55,6 +55,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
ClientBuilder::builder()
.remaps(remaps)
.base(cfg.base.clone())
.includes(includes)
.excludes(excludes)
.exclude_all_private(cfg.exclude_all_private)

View file

@ -429,14 +429,12 @@ mod tests {
#[tokio::test]
async fn test_invalid_url() {
// Run a normal request with an invalid Url
let client = ClientBuilder::builder().build().client().unwrap();
let request = Request::try_from("http://\"").unwrap();
let response = check_url(&client, request).await;
assert!(response.status().is_error());
let uri = Uri::try_from("http://\"").unwrap();
let response = client.check_website(&uri, None).await.unwrap();
assert!(matches!(
response.status(),
Status::Error(ErrorKind::InvalidURI(_))
response,
Status::Unsupported(ErrorKind::BuildRequestClient(_))
));
}

View file

@ -65,12 +65,18 @@ mod tests {
}
}
#[cfg(test)]
/// Helper function to strip ANSI color codes for tests
fn strip_ansi_codes(s: &str) -> String {
console::strip_ansi_codes(s).to_string()
}
#[test]
fn test_format_response_with_ok_status() {
let formatter = ColorFormatter;
let body = mock_response_body(Status::Ok(StatusCode::OK), "https://example.com");
assert_eq!(
formatter.format_response(&body),
strip_ansi_codes(&formatter.format_response(&body)),
" [200] https://example.com/"
);
}
@ -83,7 +89,7 @@ mod tests {
"https://example.com/404",
);
assert_eq!(
formatter.format_response(&body),
strip_ansi_codes(&formatter.format_response(&body)),
" [ERROR] https://example.com/404"
);
}

View file

@ -267,17 +267,17 @@ mod cli {
#[test]
fn test_resolve_paths() {
let mut cmd = main_command();
let offline_dir = fixtures_path().join("offline");
let dir = fixtures_path().join("resolve_paths");
cmd.arg("--offline")
.arg("--base")
.arg(&offline_dir)
.arg(offline_dir.join("index.html"))
.arg(&dir)
.arg(dir.join("index.html"))
.env_clear()
.assert()
.success()
.stdout(contains("4 Total"))
.stdout(contains("4 OK"));
.stdout(contains("3 Total"))
.stdout(contains("3 OK"));
}
#[test]
@ -944,13 +944,17 @@ mod cli {
// check content of cache file
let data = fs::read_to_string(&cache_file)?;
if data.is_empty() {
println!("Cache file is empty!");
}
assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri())));
assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri())));
// clear the cache file
fs::remove_file(&cache_file)?;
Ok(())
}
@ -1216,8 +1220,9 @@ mod cli {
Ok(())
}
/// If base-dir is not set, don't throw an error in case we encounter
/// an absolute local link within a file (e.g. `/about`).
/// If `base-dir` is not set, don't throw an error in case we encounter
/// an absolute local link (e.g. `/about`) within a file.
/// Instead, simply ignore the link.
#[test]
fn test_ignore_absolute_local_links_without_base() -> Result<()> {
let mut cmd = main_command();
@ -1409,9 +1414,7 @@ mod cli {
.arg("./NOT-A-REAL-TEST-FIXTURE.md")
.assert()
.failure()
.stderr(contains(
"Cannot find local file ./NOT-A-REAL-TEST-FIXTURE.md",
));
.stderr(contains("Invalid file path: ./NOT-A-REAL-TEST-FIXTURE.md"));
Ok(())
}
@ -1667,4 +1670,46 @@ mod cli {
.success()
.stdout(contains("0 Errors"));
}
/// Test relative paths
///
/// Imagine a web server hosting a site with the following structure:
/// root
/// └── test
///     ├── index.html
///     └── next.html
///
/// where `root/test/index.html` contains `<a href="next.html">next</a>`
/// When checking the link in `root/test/index.html` we should be able to
/// resolve the relative path to `root/test/next.html`
///
/// Note that the relative path is not resolved to the root of the server
/// but relative to the file that contains the link.
#[tokio::test]
async fn test_resolve_relative_paths_in_subfolder() -> Result<()> {
    let mock_server = wiremock::MockServer::start().await;

    // The page under test: contains a single relative link.
    let body = r#"<a href="next.html">next</a>"#;
    wiremock::Mock::given(wiremock::matchers::method("GET"))
        .and(wiremock::matchers::path("/test/index.html"))
        .respond_with(wiremock::ResponseTemplate::new(200).set_body_string(body))
        .mount(&mock_server)
        .await;

    // The target of the relative link; must resolve to `/test/next.html`,
    // i.e. relative to the containing file, not to the server root.
    wiremock::Mock::given(wiremock::matchers::method("GET"))
        .and(wiremock::matchers::path("/test/next.html"))
        .respond_with(wiremock::ResponseTemplate::new(200))
        .mount(&mock_server)
        .await;

    let mut cmd = main_command();
    cmd.arg("--verbose")
        .arg(format!("{}/test/index.html", mock_server.uri()))
        .assert()
        .success()
        .stdout(contains("1 Total"))
        .stdout(contains("0 Errors"));

    Ok(())
}
}

View file

@ -1,80 +0,0 @@
use crate::{
chain::{ChainResult, Handler},
retry::RetryExt,
Status,
};
use async_trait::async_trait;
use http::StatusCode;
use reqwest::Request;
use std::{collections::HashSet, time::Duration};
#[derive(Debug, Clone)]
pub(crate) struct Checker {
    /// Initial wait time between retries; doubles after each failed attempt.
    retry_wait_time: Duration,
    /// Maximum number of retries per request before returning an error.
    max_retries: u64,
    /// Underlying HTTP client used to execute the requests.
    reqwest_client: reqwest::Client,
    /// Optional set of status codes that are additionally accepted as success.
    accepted: Option<HashSet<StatusCode>>,
}
impl Checker {
    /// Construct a `Checker` from its configuration values.
    pub(crate) const fn new(
        retry_wait_time: Duration,
        max_retries: u64,
        reqwest_client: reqwest::Client,
        accepted: Option<HashSet<StatusCode>>,
    ) -> Self {
        Self {
            retry_wait_time,
            max_retries,
            reqwest_client,
            accepted,
        }
    }

    /// Retry requests up to `max_retries` times
    /// with an exponential backoff.
    pub(crate) async fn retry_request(&self, request: Request) -> Status {
        // First attempt happens unconditionally; only failures that are
        // worth retrying enter the backoff loop below.
        let mut status = self.check_default(clone_unwrap(&request)).await;
        let mut backoff = self.retry_wait_time;

        for _ in 0..self.max_retries {
            if status.is_success() || !status.should_retry() {
                break;
            }
            tokio::time::sleep(backoff).await;
            backoff = backoff.saturating_mul(2);
            status = self.check_default(clone_unwrap(&request)).await;
        }

        status
    }

    /// Perform a single request via [reqwest](https://github.com/seanmonstar/reqwest)
    /// and translate the outcome into a [`Status`].
    async fn check_default(&self, request: Request) -> Status {
        match self.reqwest_client.execute(request).await {
            Err(err) => err.into(),
            Ok(ref response) => Status::new(response, self.accepted.clone()),
        }
    }
}
/// Clones a `reqwest::Request`.
///
/// # Safety
///
/// This panics if the request cannot be cloned. This should only happen if the
/// request body is a `reqwest` stream. We disable the `stream` feature, so the
/// body should never be a stream.
///
/// See <https://github.com/seanmonstar/reqwest/blob/de5dbb1ab849cc301dcefebaeabdf4ce2e0f1e53/src/async_impl/body.rs#L168>
fn clone_unwrap(request: &Request) -> Request {
    // `try_clone` returns `None` only for streaming bodies, which are
    // impossible here (see safety note above), hence the `expect`.
    request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
}
#[async_trait]
impl Handler<Request, Status> for Checker {
    /// Terminal chain handler: execute the (retried) request and
    /// finish the chain with the resulting status.
    async fn handle(&mut self, input: Request) -> ChainResult<Request, Status> {
        ChainResult::Done(self.retry_request(input).await)
    }
}

View file

@ -0,0 +1,179 @@
use http::StatusCode;
use log::warn;
use std::path::{Path, PathBuf};
use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri};
/// A utility for checking the existence and validity of file-based URIs.
///
/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths.
/// It supports base path resolution, fallback extensions for files without extensions,
/// and optional fragment checking for HTML files.
#[derive(Debug, Clone)]
pub(crate) struct FileChecker {
    /// Base path or URL used for resolving relative paths.
    base: Option<Base>,
    /// List of file extensions to try (in order) if the original path doesn't exist.
    fallback_extensions: Vec<String>,
    /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files.
    include_fragments: bool,
    /// Utility for performing (and caching) fragment checks in HTML files.
    fragment_checker: FragmentChecker,
}
impl FileChecker {
    /// Creates a new `FileChecker` with the given configuration.
    ///
    /// # Arguments
    ///
    /// * `base` - Optional base path or URL for resolving relative paths.
    /// * `fallback_extensions` - List of extensions to try (in order) if the original file is not found.
    /// * `include_fragments` - Whether to check for fragment existence in HTML files.
    pub(crate) fn new(
        base: Option<Base>,
        fallback_extensions: Vec<String>,
        include_fragments: bool,
    ) -> Self {
        Self {
            base,
            fallback_extensions,
            include_fragments,
            fragment_checker: FragmentChecker::new(),
        }
    }

    /// Checks the given file URI for existence and validity.
    ///
    /// This method resolves the URI to a file path, checks if the file exists,
    /// and optionally checks for the existence of fragments in HTML files.
    ///
    /// # Arguments
    ///
    /// * `uri` - The URI to check.
    ///
    /// # Returns
    ///
    /// Returns a `Status` indicating the result of the check.
    pub(crate) async fn check(&self, uri: &Uri) -> Status {
        let Ok(path) = uri.url.to_file_path() else {
            return ErrorKind::InvalidFilePath(uri.clone()).into();
        };

        let resolved_path = self.resolve_path(&path);
        self.check_path(&resolved_path, uri).await
    }

    /// Resolves the given path using the base path, if one is set.
    ///
    /// Absolute paths (e.g. `/about`) are re-rooted below the base;
    /// relative paths are joined onto the base.
    ///
    /// # Arguments
    ///
    /// * `path` - The path to resolve.
    ///
    /// # Returns
    ///
    /// Returns the resolved path as a `PathBuf`.
    fn resolve_path(&self, path: &Path) -> PathBuf {
        if let Some(Base::Local(base_path)) = &self.base {
            if path.is_absolute() {
                let absolute_base_path = if base_path.is_relative() {
                    // NOTE(review): if `current_dir()` fails, this silently
                    // falls back to an empty path, so resolution degrades to
                    // the bare base path — TODO confirm this is intended.
                    std::env::current_dir().unwrap_or_default().join(base_path)
                } else {
                    base_path.clone()
                };

                // Strip the leading `/` so the absolute link is interpreted
                // relative to the base root (e.g. `/about` -> `<base>/about`).
                let stripped = path.strip_prefix("/").unwrap_or(path);
                absolute_base_path.join(stripped)
            } else {
                base_path.join(path)
            }
        } else {
            path.to_path_buf()
        }
    }

    /// Checks if the given path exists and performs additional checks if necessary.
    ///
    /// # Arguments
    ///
    /// * `path` - The path to check.
    /// * `uri` - The original URI, used for error reporting.
    ///
    /// # Returns
    ///
    /// Returns a `Status` indicating the result of the check.
    async fn check_path(&self, path: &Path, uri: &Uri) -> Status {
        if path.exists() {
            return self.check_existing_path(path, uri).await;
        }

        self.check_with_fallback_extensions(path, uri).await
    }

    /// Checks an existing path, optionally verifying fragments for HTML files.
    ///
    /// # Arguments
    ///
    /// * `path` - The path to check.
    /// * `uri` - The original URI, used for error reporting.
    ///
    /// # Returns
    ///
    /// Returns a `Status` indicating the result of the check.
    async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status {
        if self.include_fragments {
            self.check_fragment(path, uri).await
        } else {
            Status::Ok(StatusCode::OK)
        }
    }

    /// Attempts to find a file by trying different extensions specified in `fallback_extensions`.
    ///
    /// Only called after the original path was found not to exist, so it
    /// goes straight to trying the fallback extensions.
    ///
    /// # Arguments
    ///
    /// * `path` - The original path to check.
    /// * `uri` - The original URI, used for error reporting.
    ///
    /// # Returns
    ///
    /// Returns a `Status` indicating the result of the check.
    async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status {
        let mut path_buf = path.to_path_buf();

        // Try each fallback extension in turn. `set_extension` replaces any
        // existing extension, so every candidate is tried independently of
        // the previous one.
        for ext in &self.fallback_extensions {
            path_buf.set_extension(ext);
            if path_buf.exists() {
                return self.check_existing_path(&path_buf, uri).await;
            }
        }

        ErrorKind::InvalidFilePath(uri.clone()).into()
    }

    /// Checks for the existence of a fragment in an HTML file.
    ///
    /// # Arguments
    ///
    /// * `path` - The path to the HTML file.
    /// * `uri` - The original URI, containing the fragment to check.
    ///
    /// # Returns
    ///
    /// Returns a `Status` indicating the result of the fragment check.
    async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
        match self.fragment_checker.check(path, &uri.url).await {
            Ok(true) => Status::Ok(StatusCode::OK),
            Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
            Err(err) => {
                // Fragment checking is best-effort: on I/O or parse errors
                // we only warn and treat the link as valid.
                warn!("Skipping fragment check due to the following error: {err}");
                Status::Ok(StatusCode::OK)
            }
        }
    }
}

View file

@ -0,0 +1,57 @@
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use http::StatusCode;
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use crate::ErrorKind;
use crate::{Status, Uri};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use crate::types::mail;
/// A utility for checking the validity of email addresses.
///
/// `MailChecker` is responsible for validating email addresses,
/// optionally performing reachability checks when the appropriate
/// features are enabled.
#[derive(Debug, Clone)]
pub(crate) struct MailChecker {}
impl MailChecker {
    /// Creates a new `MailChecker`.
    pub(crate) const fn new() -> Self {
        Self {}
    }

    /// Check a mail address, or equivalently a `mailto` URI.
    ///
    /// Query parameters (e.g. `contact@example.com?subject="Hello"`) are not
    /// part of the mail address itself; they are passed on to a mail client
    /// and therefore ignored by this check.
    #[cfg(all(feature = "email-check", feature = "native-tls"))]
    pub(crate) async fn check_mail(&self, uri: &Uri) -> Status {
        self.perform_email_check(uri).await
    }

    /// Without the `email-check` and `native-tls` features, mail addresses
    /// are not checked at all and are simply reported as excluded.
    #[cfg(not(all(feature = "email-check", feature = "native-tls")))]
    pub(crate) async fn check_mail(&self, _uri: &Uri) -> Status {
        Status::Excluded
    }

    /// Perform the actual reachability check for a mail address.
    #[cfg(all(feature = "email-check", feature = "native-tls"))]
    async fn perform_email_check(&self, uri: &Uri) -> Status {
        let input = CheckEmailInput::new(uri.url.path().to_string());
        let output = check_email(&input).await;
        match output.is_reachable {
            Reachable::Invalid => {
                ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(&output))
                    .into()
            }
            _ => Status::Ok(StatusCode::OK),
        }
    }
}

View file

@ -0,0 +1,7 @@
//! Checker Module
//!
//! This module contains all checkers, which are responsible for checking the status of a URL.
pub(crate) mod file;
pub(crate) mod mail;
pub(crate) mod website;

View file

@ -0,0 +1,226 @@
use crate::{
chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
quirks::Quirks,
retry::RetryExt,
types::uri::github::GithubUri,
BasicAuthCredentials, ErrorKind, Status, Uri,
};
use async_trait::async_trait;
use http::StatusCode;
use octocrab::Octocrab;
use reqwest::Request;
use std::{collections::HashSet, time::Duration};
#[derive(Debug, Clone)]
pub(crate) struct WebsiteChecker {
    /// Request method used for making requests.
    method: reqwest::Method,

    /// The HTTP client used for requests.
    reqwest_client: reqwest::Client,

    /// GitHub client used for requests.
    github_client: Option<Octocrab>,

    /// The chain of plugins to be executed on each request.
    plugin_request_chain: RequestChain,

    /// Maximum number of retries per request before returning an error.
    max_retries: u64,

    /// Initial wait time between retries of failed requests. This doubles after
    /// each failure.
    retry_wait_time: Duration,

    /// Set of accepted return codes / status codes.
    ///
    /// Unmatched return codes / status codes are deemed as errors.
    accepted: Option<HashSet<StatusCode>>,

    /// Requires using HTTPS when it's available.
    ///
    /// This would treat unencrypted links as errors when HTTPS is available.
    require_https: bool,
}
impl WebsiteChecker {
#[allow(clippy::too_many_arguments)]
pub(crate) const fn new(
method: reqwest::Method,
retry_wait_time: Duration,
max_retries: u64,
reqwest_client: reqwest::Client,
accepted: Option<HashSet<StatusCode>>,
github_client: Option<Octocrab>,
require_https: bool,
plugin_request_chain: RequestChain,
) -> Self {
Self {
method,
reqwest_client,
github_client,
plugin_request_chain,
max_retries,
retry_wait_time,
accepted,
require_https,
}
}
/// Retry requests up to `max_retries` times
/// with an exponential backoff.
pub(crate) async fn retry_request(&self, request: Request) -> Status {
let mut retries: u64 = 0;
let mut wait_time = self.retry_wait_time;
let mut status = self.check_default(clone_unwrap(&request)).await;
while retries < self.max_retries {
if status.is_success() || !status.should_retry() {
return status;
}
retries += 1;
tokio::time::sleep(wait_time).await;
wait_time = wait_time.saturating_mul(2);
status = self.check_default(clone_unwrap(&request)).await;
}
status
}
/// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
async fn check_default(&self, request: Request) -> Status {
match self.reqwest_client.execute(request).await {
Ok(ref response) => Status::new(response, self.accepted.clone()),
Err(e) => e.into(),
}
}
/// Checks the given URI of a website.
///
/// # Errors
///
/// This returns an `Err` if
/// - The URI is invalid.
/// - The request failed.
/// - The response status code is not accepted.
/// - The URI cannot be converted to HTTPS.
pub(crate) async fn check_website(
&self,
uri: &Uri,
credentials: Option<BasicAuthCredentials>,
) -> Result<Status, ErrorKind> {
let default_chain: RequestChain = Chain::new(vec![
Box::<Quirks>::default(),
Box::new(credentials),
Box::new(self.clone()),
]);
match self.check_website_inner(uri, &default_chain).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
if self
.check_website_inner(&uri.to_https()?, &default_chain)
.await
.is_success()
{
Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
} else {
Ok(Status::Ok(code))
}
}
s => Ok(s),
}
}
/// Checks the given URI of a website.
///
/// Unsupported schemes will be ignored
///
/// Note: we use `inner` to improve compile times by avoiding monomorphization
///
/// # Errors
///
/// This returns an `Err` if
/// - The URI is invalid.
/// - The request failed.
/// - The response status code is not accepted.
async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
let request = self
.reqwest_client
.request(self.method.clone(), uri.as_str())
.build();
let request = match request {
Ok(r) => r,
Err(e) => return e.into(),
};
let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
.traverse(request)
.await;
self.handle_github(status, uri).await
}
// Pull out the heavy machinery in case of a failed normal request.
// This could be a GitHub URL and we ran into the rate limiter.
// TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
if status.is_success() {
return status;
}
if let Ok(github_uri) = GithubUri::try_from(uri) {
let status = self.check_github(github_uri).await;
if status.is_success() {
return status;
}
}
status
}
/// Check a `uri` hosted on `GitHub` via the GitHub API.
///
/// # Caveats
///
/// Files inside private repositories won't get checked and instead would
/// be reported as valid if the repository itself is reachable through the
/// API.
///
/// A better approach would be to download the file through the API or
/// clone the repo, but we chose the pragmatic approach.
async fn check_github(&self, uri: GithubUri) -> Status {
let Some(client) = &self.github_client else {
return ErrorKind::MissingGitHubToken.into();
};
let repo = match client.repos(&uri.owner, &uri.repo).get().await {
Ok(repo) => repo,
Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
};
if let Some(true) = repo.private {
return Status::Ok(StatusCode::OK);
} else if let Some(endpoint) = uri.endpoint {
return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
.into();
}
Status::Ok(StatusCode::OK)
}
}
/// Clones a `reqwest::Request`.
///
/// # Safety
///
/// This panics if the request cannot be cloned. This should only happen if the
/// request body is a `reqwest` stream. We disable the `stream` feature, so the
/// body should never be a stream.
///
/// See <https://github.com/seanmonstar/reqwest/blob/de5dbb1ab849cc301dcefebaeabdf4ce2e0f1e53/src/async_impl/body.rs#L168>
fn clone_unwrap(request: &Request) -> Request {
    // `try_clone` returns `None` only for streaming bodies, which are
    // impossible here (see safety note above), hence the `expect`.
    request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
}
#[async_trait]
impl Handler<Request, Status> for WebsiteChecker {
    /// Terminal chain handler: execute the (retried) request and
    /// finish the chain with the resulting status.
    async fn handle(&mut self, input: Request) -> ChainResult<Request, Status> {
        ChainResult::Done(self.retry_request(input).await)
    }
}

View file

@ -15,8 +15,6 @@
)]
use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
use http::{
header::{HeaderMap, HeaderValue},
StatusCode,
@ -24,25 +22,21 @@ use http::{
use log::{debug, warn};
use octocrab::Octocrab;
use regex::RegexSet;
use reqwest::{header, redirect, Url};
use reqwest::{header, redirect};
use reqwest_cookie_store::CookieStoreMutex;
use secrecy::{ExposeSecret, SecretString};
use typed_builder::TypedBuilder;
use crate::{
chain::{Chain, ClientRequestChains, RequestChain},
checker::Checker,
chain::RequestChain,
checker::file::FileChecker,
checker::{mail::MailChecker, website::WebsiteChecker},
filter::{Excludes, Filter, Includes},
quirks::Quirks,
remap::Remaps,
types::uri::github::GithubUri,
utils::fragment_checker::FragmentChecker,
ErrorKind, Request, Response, Result, Status, Uri,
Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use crate::types::mail;
/// Default number of redirects before a request is deemed as failed, 5.
pub const DEFAULT_MAX_REDIRECTS: usize = 5;
/// Default number of retries before a request is deemed as failed, 3.
@ -248,6 +242,12 @@ pub struct ClientBuilder {
/// Response timeout per request in seconds.
timeout: Option<Duration>,
/// Base for resolving paths.
///
/// E.g. if the base is `/home/user/` and the path is `file.txt`, the
/// resolved path would be `/home/user/file.txt`.
base: Option<Base>,
/// Initial time between retries of failed requests.
///
/// Defaults to [`DEFAULT_RETRY_WAIT_TIME_SECS`].
@ -383,20 +383,28 @@ impl ClientBuilder {
include_mail: self.include_mail,
};
Ok(Client {
let website_checker = WebsiteChecker::new(
self.method,
self.retry_wait_time,
self.max_retries,
reqwest_client,
self.accepted,
github_client,
self.require_https,
self.plugin_request_chain,
);
Ok(Client {
remaps: self.remaps,
fallback_extensions: self.fallback_extensions,
filter,
max_retries: self.max_retries,
retry_wait_time: self.retry_wait_time,
method: self.method,
accepted: self.accepted,
require_https: self.require_https,
include_fragments: self.include_fragments,
email_checker: MailChecker::new(),
website_checker,
file_checker: FileChecker::new(
self.base,
self.fallback_extensions,
self.include_fragments,
),
fragment_checker: FragmentChecker::new(),
plugin_request_chain: self.plugin_request_chain,
})
}
}
@ -407,50 +415,23 @@ impl ClientBuilder {
/// options.
#[derive(Debug, Clone)]
pub struct Client {
/// Underlying `reqwest` client instance that handles the HTTP requests.
reqwest_client: reqwest::Client,
/// Optional GitHub client that handles communications with GitHub.
github_client: Option<Octocrab>,
/// Optional remapping rules for URIs matching pattern.
remaps: Option<Remaps>,
/// Automatically append file extensions to `file://` URIs as needed
fallback_extensions: Vec<String>,
/// Rules to decided whether each link should be checked or ignored.
filter: Filter,
/// Maximum number of retries per request before returning an error.
max_retries: u64,
/// A checker for website URLs.
website_checker: WebsiteChecker,
/// Initial wait time between retries of failed requests. This doubles after
/// each failure.
retry_wait_time: Duration,
/// A checker for file URLs.
file_checker: FileChecker,
/// HTTP method used for requests, e.g. `GET` or `HEAD`.
///
/// The same method will be used for all links.
method: reqwest::Method,
/// Set of accepted return codes / status codes.
///
/// Unmatched return codes/ status codes are deemed as errors.
accepted: Option<HashSet<StatusCode>>,
/// Requires using HTTPS when it's available.
///
/// This would treat unencrypted links as errors when HTTPS is available.
require_https: bool,
/// Enable the checking of fragments in links.
include_fragments: bool,
/// A checker for email URLs.
email_checker: MailChecker,
/// Caches Fragments
fragment_checker: FragmentChecker,
plugin_request_chain: RequestChain,
}
impl Client {
@ -463,7 +444,7 @@ impl Client {
///
/// Returns an `Err` if:
/// - `request` does not represent a valid URI.
/// - Encrypted connection for a HTTP URL is available but unused. (Only
/// - Encrypted connection for a HTTP URL is available but unused. (Only
/// checked when `Client::require_https` is `true`.)
#[allow(clippy::missing_panics_doc)]
pub async fn check<T, E>(&self, request: T) -> Result<Response>
@ -493,27 +474,22 @@ impl Client {
return Ok(Response::new(uri.clone(), Status::Excluded, source));
}
let default_chain: RequestChain = Chain::new(vec![
Box::<Quirks>::default(),
Box::new(credentials),
Box::new(Checker::new(
self.retry_wait_time,
self.max_retries,
self.reqwest_client.clone(),
self.accepted.clone(),
)),
]);
let status = match uri.scheme() {
// We don't check tel: URIs
_ if uri.is_tel() => Status::Excluded,
_ if uri.is_file() => self.check_file(uri).await,
_ if uri.is_mail() => self.check_mail(uri).await,
_ if uri.is_tel() => Status::Excluded,
_ => self.check_website(uri, default_chain).await?,
_ => self.check_website(uri, credentials).await?,
};
Ok(Response::new(uri.clone(), status, source))
}
/// Check a single file using the file checker.
pub async fn check_file(&self, uri: &Uri) -> Status {
self.file_checker.check(uri).await
}
/// Remap `uri` using the client-defined remapping rules.
///
/// # Errors
@ -541,151 +517,17 @@ impl Client {
/// - The request failed.
/// - The response status code is not accepted.
/// - The URI cannot be converted to HTTPS.
pub async fn check_website(&self, uri: &Uri, default_chain: RequestChain) -> Result<Status> {
match self.check_website_inner(uri, &default_chain).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
if self
.check_website_inner(&uri.to_https()?, &default_chain)
.await
.is_success()
{
Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
} else {
// HTTPS is not available for this URI,
// so the original HTTP URL is fine.
Ok(Status::Ok(code))
}
}
s => Ok(s),
}
pub async fn check_website(
&self,
uri: &Uri,
credentials: Option<BasicAuthCredentials>,
) -> Result<Status> {
self.website_checker.check_website(uri, credentials).await
}
/// Checks the given URI of a website.
///
/// Unsupported schemes will be ignored
///
/// # Errors
///
/// This returns an `Err` if
/// - The URI is invalid.
/// - The request failed.
/// - The response status code is not accepted.
pub async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
// Workaround for upstream reqwest panic
if validate_url(&uri.url) {
if matches!(uri.scheme(), "http" | "https") {
// This is a truly invalid URI with a known scheme.
// If we pass that to reqwest it would panic.
return Status::Error(ErrorKind::InvalidURI(uri.clone()));
}
// This is merely a URI with a scheme that is not supported by
// reqwest yet. It would be safe to pass that to reqwest and it
// wouldn't panic, but it's also unnecessary, because it would
// simply return an error.
return Status::Unsupported(ErrorKind::InvalidURI(uri.clone()));
}
let request = self
.reqwest_client
.request(self.method.clone(), uri.as_str())
.build();
let request = match request {
Ok(r) => r,
Err(e) => return e.into(),
};
let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
.traverse(request)
.await;
self.handle_github(status, uri).await
}
// Pull out the heavy machinery in case of a failed normal request.
// This could be a GitHub URL and we ran into the rate limiter.
// TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
if status.is_success() {
return status;
}
if let Ok(github_uri) = GithubUri::try_from(uri) {
let status = self.check_github(github_uri).await;
// Only return GitHub status in case of success
// Otherwise return the original error, which has more information
if status.is_success() {
return status;
}
}
status
}
/// Check a `uri` hosted on `GitHub` via the GitHub API.
///
/// # Caveats
///
/// Files inside private repositories won't get checked and instead would
/// be reported as valid if the repository itself is reachable through the
/// API.
///
/// A better approach would be to download the file through the API or
/// clone the repo, but we chose the pragmatic approach.
async fn check_github(&self, uri: GithubUri) -> Status {
let Some(client) = &self.github_client else {
return ErrorKind::MissingGitHubToken.into();
};
let repo = match client.repos(&uri.owner, &uri.repo).get().await {
Ok(repo) => repo,
Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
};
if let Some(true) = repo.private {
// The private repo exists. Assume a given endpoint exists as well
// (e.g. `issues` in `github.com/org/private/issues`). This is not
// always the case but simplifies the check.
return Status::Ok(StatusCode::OK);
} else if let Some(endpoint) = uri.endpoint {
// The URI returned a non-200 status code from a normal request and
// now we find that this public repo is reachable through the API,
// so that must mean the full URI (which includes the additional
// endpoint) must be invalid.
return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
.into();
}
// Found public repo without endpoint
Status::Ok(StatusCode::OK)
}
/// Check a `file` URI.
pub async fn check_file(&self, uri: &Uri) -> Status {
let Ok(path) = uri.url.to_file_path() else {
return ErrorKind::InvalidFilePath(uri.clone()).into();
};
if path.exists() {
if self.include_fragments {
return self.check_fragment(&path, uri).await;
}
return Status::Ok(StatusCode::OK);
}
if path.extension().is_some() {
return ErrorKind::InvalidFilePath(uri.clone()).into();
}
// if the path has no file extension, try to append some
let mut path_buf = path.clone();
for ext in &self.fallback_extensions {
path_buf.set_extension(ext);
if path_buf.exists() {
if self.include_fragments {
return self.check_fragment(&path_buf, uri).await;
}
return Status::Ok(StatusCode::OK);
}
}
ErrorKind::InvalidFilePath(uri.clone()).into()
/// Checks a `mailto` URI.
pub async fn check_mail(&self, uri: &Uri) -> Status {
self.email_checker.check_mail(uri).await
}
/// Checks a `file` URI's fragment.
@ -699,43 +541,6 @@ impl Client {
}
}
}
/// Check a mail address, or equivalently a `mailto` URI.
///
/// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
/// which are ignored by this check. They are not part of the mail address
/// and instead passed to a mail client.
#[cfg(all(feature = "email-check", feature = "native-tls"))]
pub async fn check_mail(&self, uri: &Uri) -> Status {
// `path()` of a `mailto:` URL is the bare address (query params excluded).
let address = uri.url.path().to_string();
let input = CheckEmailInput::new(address);
let result = &(check_email(&input).await);
// Only a definitively invalid verdict is treated as an error; "risky" or
// unknown reachability is accepted.
if let Reachable::Invalid = result.is_reachable {
ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into()
} else {
Status::Ok(StatusCode::OK)
}
}
/// Check a mail address, or equivalently a `mailto` URI.
///
/// This implementation simply excludes all email addresses.
/// It is compiled in when the `email-check`/`native-tls` features are
/// disabled, so mail links are never reported as errors in that build.
#[cfg(not(all(feature = "email-check", feature = "native-tls")))]
#[allow(clippy::unused_async)]
pub async fn check_mail(&self, _uri: &Uri) -> Status {
Status::Excluded
}
}
// Check if the given `Url` would cause `reqwest` to panic.
// This is a workaround for https://github.com/lycheeverse/lychee/issues/539
// and can be removed once https://github.com/seanmonstar/reqwest/pull/1399
// got merged.
// It is exactly the same check that reqwest runs internally, but unfortunately
// it `unwrap`s (and panics!) instead of returning an error, which we could handle.
//
// NOTE: despite the name, this returns `true` when the URL is *invalid*
// (i.e. when reqwest would panic on it).
fn validate_url(url: &Url) -> bool {
http::Uri::try_from(url.as_str()).is_err()
}
/// A shorthand function to check a single URI.
@ -777,7 +582,7 @@ mod tests {
chain::{ChainResult, Handler, RequestChain},
mock_server,
test_utils::get_mock_client_response,
Request, Status, Uri,
ErrorKind, Request, Status, Uri,
};
#[tokio::test]
@ -1026,9 +831,14 @@ mod tests {
#[tokio::test]
async fn test_avoid_reqwest_panic() {
let client = ClientBuilder::builder().build().client().unwrap();
// This request will fail, but it won't panic
// This request will result in an Unsupported status, but it won't panic
let res = client.check("http://\"").await.unwrap();
assert!(res.status().is_error());
assert!(matches!(
res.status(),
Status::Unsupported(ErrorKind::BuildRequestClient(_))
));
assert!(res.status().is_unsupported());
}
#[tokio::test]

View file

@ -1,3 +1,4 @@
use crate::InputSource;
use crate::{
basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request,
Base, Input, Request, Result,
@ -99,27 +100,31 @@ impl Collector {
///
/// Will return `Err` if links cannot be extracted from an input
pub fn collect_links(self, inputs: Vec<Input>) -> impl Stream<Item = Result<Request>> {
let base = self.base;
let skip_missing_inputs = self.skip_missing_inputs;
let skip_hidden = self.skip_hidden;
let skip_ignored = self.skip_ignored;
let global_base = self.base;
stream::iter(inputs)
.par_then_unordered(None, move |input| async move {
input.get_contents(
self.skip_missing_inputs,
self.skip_hidden,
self.skip_ignored,
)
.par_then_unordered(None, move |input| {
let default_base = global_base.clone();
async move {
let base = match &input.source {
InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(),
_ => default_base,
};
input
.get_contents(skip_missing_inputs, skip_hidden, skip_ignored)
.map(move |content| (content, base.clone()))
}
})
.flatten()
.par_then_unordered(None, move |content| {
// send to parallel worker
let base = base.clone();
.par_then_unordered(None, move |(content, base)| {
let basic_auth_extractor = self.basic_auth_extractor.clone();
async move {
let content = content?;
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec<RawUri> = extractor.extract(&content);
let requests = request::create(uris, &content, &base, &basic_auth_extractor)?;
let requests = request::create(uris, &content, &base, &basic_auth_extractor);
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
})
@ -137,7 +142,7 @@ mod tests {
use super::*;
use crate::{
mock_server,
test_utils::{load_fixture, mail, website},
test_utils::{load_fixture, mail, path, website},
types::{FileType, Input, InputSource},
Result, Uri,
};
@ -426,4 +431,85 @@ mod tests {
assert_eq!(links, expected_links);
}
// Each remote input must resolve its relative links against its *own* URL,
// not a single global base: `relative.html` under `/foo/` and `/bar/`
// must yield two distinct absolute links.
#[tokio::test]
async fn test_multiple_remote_urls() {
let mock_server_1 = mock_server!(
StatusCode::OK,
set_body_string(r#"<a href="relative.html">Link</a>"#)
);
let mock_server_2 = mock_server!(
StatusCode::OK,
set_body_string(r#"<a href="relative.html">Link</a>"#)
);
let inputs = vec![
Input {
source: InputSource::RemoteUrl(Box::new(
Url::parse(&format!(
"{}/foo/index.html",
mock_server_1.uri().trim_end_matches('/')
))
.unwrap(),
)),
file_type_hint: Some(FileType::Html),
excluded_paths: None,
},
Input {
source: InputSource::RemoteUrl(Box::new(
Url::parse(&format!(
"{}/bar/index.html",
mock_server_2.uri().trim_end_matches('/')
))
.unwrap(),
)),
file_type_hint: Some(FileType::Html),
excluded_paths: None,
},
];
let links = collect(inputs, None).await;
let expected_links = HashSet::from_iter([
website(&format!(
"{}/foo/relative.html",
mock_server_1.uri().trim_end_matches('/')
)),
website(&format!(
"{}/bar/relative.html",
mock_server_2.uri().trim_end_matches('/')
)),
]);
assert_eq!(links, expected_links);
}
// With a local base directory, relative links resolve under the base while
// root-absolute links (`/another.html`) keep their absolute path unchanged.
#[tokio::test]
async fn test_file_path_with_base() {
let base = Base::try_from("/path/to/root").unwrap();
assert_eq!(base, Base::Local("/path/to/root".into()));
let input = Input {
source: InputSource::String(
r#"
<a href="index.html">Index</a>
<a href="about.html">About</a>
<a href="/another.html">Another</a>
"#
.into(),
),
file_type_hint: Some(FileType::Html),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;
let expected_links = HashSet::from_iter([
path("/path/to/root/index.html"),
path("/path/to/root/about.html"),
path("/another.html"),
]);
assert_eq!(links, expected_links);
}
}

View file

@ -39,6 +39,16 @@ pub(crate) fn website(url: &str) -> Uri {
Uri::from(Url::parse(url).expect("Expected valid Website URI"))
}
/// Convert a filesystem path into a [`Uri`] with the `file` scheme.
///
/// # Panic
///
/// Panics when the given path is not absolute, so this helper is meant for
/// test code only.
pub(crate) fn path<P: AsRef<Path>>(path: P) -> Uri {
    let url = Url::from_file_path(path.as_ref()).expect("Expected valid File URI");
    Uri::from(url)
}
/// Creates a mail URI from a string
pub(crate) fn mail(address: &str) -> Uri {
if address.starts_with("mailto:") {

View file

@ -23,7 +23,10 @@ impl Base {
pub(crate) fn join(&self, link: &str) -> Option<Url> {
match self {
Self::Remote(url) => url.join(link).ok(),
Self::Local(_) => None,
Self::Local(path) => {
let full_path = path.join(link);
Url::from_file_path(full_path).ok()
}
}
}
@ -36,18 +39,17 @@ impl Base {
}
}
pub(crate) fn from_source(source: &InputSource) -> Option<Url> {
pub(crate) fn from_source(source: &InputSource) -> Option<Base> {
match &source {
InputSource::RemoteUrl(url) => {
// TODO: This should be refactored.
// Cases like https://user:pass@example.com are not handled
// We can probably use the original URL and just replace the
// path component in the caller of this function
if let Some(port) = url.port() {
Url::parse(&format!("{}://{}:{port}", url.scheme(), url.host_str()?)).ok()
} else {
Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?)).ok()
}
// Create a new URL with just the scheme, host, and port
let mut base_url = url.clone();
base_url.set_path("");
base_url.set_query(None);
base_url.set_fragment(None);
// We keep the username and password intact
Some(Base::Remote(*base_url))
}
// other inputs do not have a URL to extract a base
_ => None,
@ -101,6 +103,16 @@ mod test_base {
assert!(Base::try_from("data:text/plain,Hello?World#").is_err());
}
// Local base paths are accepted verbatim — absolute, trailing-slash,
// and relative forms all become `Base::Local` without normalization.
#[test]
fn test_valid_local_path_string_as_base() -> Result<()> {
let cases = vec!["/tmp/lychee", "/tmp/lychee/", "tmp/lychee/"];
for case in cases {
assert_eq!(Base::try_from(case)?, Base::Local(PathBuf::from(case)));
}
Ok(())
}
#[test]
fn test_valid_local() -> Result<()> {
let dir = tempfile::tempdir().unwrap();
@ -123,7 +135,7 @@ mod test_base {
let url = Url::parse(url).unwrap();
let source = InputSource::RemoteUrl(Box::new(url.clone()));
let base = Base::from_source(&source);
let expected = Url::parse(expected).unwrap();
let expected = Base::Remote(Url::parse(expected).unwrap());
assert_eq!(base, Some(expected));
}
}

View file

@ -86,12 +86,24 @@ pub enum ErrorKind {
#[error("Error with base dir `{0}` : {1}")]
InvalidBase(String, String),
/// Cannot join the given text with the base URL
#[error("Cannot join '{0}' with the base URL")]
InvalidBaseJoin(String),
/// Cannot convert the given path to a URI
#[error("Cannot convert path '{0}' to a URI")]
InvalidPathToUri(String),
/// The given URI type is not supported
#[error("Unsupported URI type: '{0}'")]
UnsupportedUriType(String),
/// The given input can not be parsed into a valid URI remapping
#[error("Error remapping URL: `{0}`")]
InvalidUrlRemap(String),
/// The given path does not resolve to a valid file
#[error("Cannot find local file {0}")]
#[error("Invalid file path: {0}")]
InvalidFile(PathBuf),
/// Error while traversing an input directory
@ -246,6 +258,13 @@ impl PartialEq for ErrorKind {
}
(Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2,
(Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2,
(Self::InvalidFilePath(u1), Self::InvalidFilePath(u2)) => u1 == u2,
(Self::InvalidFragment(u1), Self::InvalidFragment(u2)) => u1 == u2,
(Self::InvalidUrlFromPath(p1), Self::InvalidUrlFromPath(p2)) => p1 == p2,
(Self::InvalidBase(b1, e1), Self::InvalidBase(b2, e2)) => b1 == b2 && e1 == e2,
(Self::InvalidUrlRemap(r1), Self::InvalidUrlRemap(r2)) => r1 == r2,
(Self::EmptyUrl, Self::EmptyUrl) => true,
_ => false,
}
}
@ -281,6 +300,9 @@ impl Hash for ErrorKind {
Self::UnreachableEmailAddress(u, ..) => u.hash(state),
Self::InsecureURL(u, ..) => u.hash(state),
Self::InvalidBase(base, e) => (base, e).hash(state),
Self::InvalidBaseJoin(s) => s.hash(state),
Self::InvalidPathToUri(s) => s.hash(state),
Self::UnsupportedUriType(s) => s.hash(state),
Self::InvalidUrlRemap(remap) => (remap).hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),

View file

@ -5,7 +5,7 @@ use thiserror::Error;
use crate::{types::accept::AcceptRange, AcceptRangeError};
#[derive(Debug, Error)]
#[derive(Debug, Error, PartialEq)]
pub enum StatusCodeSelectorError {
#[error("invalid/empty input")]
InvalidInput,

View file

@ -19,12 +19,6 @@ pub struct RawUri {
pub attribute: Option<String>,
}
impl RawUri {
    // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
    /// Whether the raw link text is a bare in-page anchor (begins with `#`).
    pub(crate) fn is_anchor(&self) -> bool {
        matches!(self.text.chars().next(), Some('#'))
    }
}
impl Display for RawUri {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} (Attribute: {:?})", self.text, self.attribute)
@ -40,17 +34,3 @@ impl From<&str> for RawUri {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
// Covers both the positive case (leading `#`) and a `#` appearing
// mid-string, which must not count as an anchor.
#[test]
fn test_is_anchor() {
let raw_uri = RawUri::from("#anchor");
assert!(raw_uri.is_anchor());
let raw_uri = RawUri::from("notan#anchor");
assert!(!raw_uri.is_anchor());
}
}

View file

@ -378,4 +378,9 @@ mod tests {
website("https://example.com")
);
}
// A `file://` URL must be recognized as a file URI.
#[test]
fn test_file_uri() {
assert!(Uri::try_from("file:///path/to/file").unwrap().is_file());
}
}

View file

@ -1,4 +1,4 @@
use log::info;
use log::warn;
use percent_encoding::percent_decode_str;
use reqwest::Url;
use std::{
@ -13,156 +13,374 @@ use crate::{
Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri,
};
const MAX_TRUNCATED_STR_LEN: usize = 100;
/// Extract basic auth credentials for a given URL.
fn credentials(extractor: &Option<BasicAuthExtractor>, uri: &Uri) -> Option<BasicAuthCredentials> {
/// Extract basic auth credentials for a given URI.
///
/// Returns `None` when no extractor is configured or no rule matches.
fn extract_credentials(
    extractor: &Option<BasicAuthExtractor>,
    uri: &Uri,
) -> Option<BasicAuthCredentials> {
    // Propagate `None` early when no extractor is configured.
    extractor.as_ref()?.matches(uri)
}
/// Create a request from a raw URI.
///
/// Parses (or resolves) the raw URI against `source`/`base`, truncates the
/// source label, and attaches basic-auth credentials when available.
fn create_request(
    raw_uri: &RawUri,
    source: &InputSource,
    base: &Option<Base>,
    extractor: &Option<BasicAuthExtractor>,
) -> Result<Request> {
    let uri = try_parse_into_uri(raw_uri, source, base)?;
    let credentials = extract_credentials(extractor, &uri);
    Ok(Request::new(
        uri,
        truncate_source(source),
        raw_uri.element.clone(),
        raw_uri.attribute.clone(),
        credentials,
    ))
}
/// Try to parse the raw URI into a `Uri`.
///
/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text.
/// If the base URL is not available, create a URI from the file path.
///
/// # Errors
///
/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base
///   to create a valid URI.
/// - If a URI cannot be created from the file path.
/// - If the source is not a file path (i.e. the URI type is not supported).
fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option<Base>) -> Result<Uri> {
    // Fast path: the text already parses as a complete URI on its own.
    if let Ok(uri) = Uri::try_from(raw_uri.clone()) {
        return Ok(uri);
    }
    let text = raw_uri.text.clone();
    // Otherwise resolve against the base, if one is available.
    if let Some(base_url) = base {
        return match base_url.join(&text) {
            Some(url) => Ok(Uri { url }),
            None => Err(ErrorKind::InvalidBaseJoin(text.clone())),
        };
    }
    // Without a base, only file-system sources can anchor a relative link.
    match source {
        InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base),
        _ => Err(ErrorKind::UnsupportedUriType(text)),
    }
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
/// Returns `true` when the link text is a bare in-page anchor (starts with `#`).
pub(crate) fn is_anchor(text: &str) -> bool {
    matches!(text.as_bytes().first(), Some(b'#'))
}
/// Create a URI from a file path.
///
/// For bare anchors (`#foo`) the fragment is re-attached to the source file's
/// name before resolution; other link text is resolved as-is.
///
/// # Errors
///
/// - If the link text is an anchor and the file name cannot be extracted from the file path.
/// - If the path cannot be resolved.
/// - If the resolved path cannot be converted to a URL.
fn create_uri_from_file_path(
    file_path: &Path,
    link_text: &str,
    base: &Option<Base>,
) -> Result<Uri> {
    let target_path = if is_anchor(link_text) {
        // Anchors point back into the current file, so prepend its file name.
        let file_name = file_path
            .file_name()
            .and_then(|name| name.to_str())
            .ok_or_else(|| ErrorKind::InvalidFile(file_path.to_path_buf()))?;
        format!("{file_name}{link_text}")
    } else {
        link_text.to_string()
    };
    match resolve_and_create_url(file_path, &target_path, base) {
        Ok(url) => Ok(Uri { url }),
        Err(_) => Err(ErrorKind::InvalidPathToUri(target_path)),
    }
}
/// Truncate the source in case it gets too long.
///
/// Only string inputs need truncation; every other input kind is just a
/// cheap "label" (an enum variant) and is cloned as-is.
// TODO: This would not be necessary if we used `Cow` for the source.
fn truncate_source(source: &InputSource) -> InputSource {
    const MAX_TRUNCATED_STR_LEN: usize = 100;
    if let InputSource::String(s) = source {
        InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
    } else {
        source.clone()
    }
}
/// Create requests out of the collected URLs.
/// Only keeps "valid" URLs. This filters out anchors for example.
///
/// If a URLs is ignored (because of the current settings),
/// it will not be added to the `HashSet`.
pub(crate) fn create(
uris: Vec<RawUri>,
input_content: &InputContent,
base: &Option<Base>,
extractor: &Option<BasicAuthExtractor>,
) -> Result<HashSet<Request>> {
let base_url = Base::from_source(&input_content.source);
) -> HashSet<Request> {
let base = base
.clone()
.or_else(|| Base::from_source(&input_content.source));
let requests: Result<Vec<Option<Request>>> = uris
.into_iter()
.map(|raw_uri| {
let is_anchor = raw_uri.is_anchor();
let text = raw_uri.text.clone();
let element = raw_uri.element.clone();
let attribute = raw_uri.attribute.clone();
// Truncate the source in case it gets too long Ideally we should
// avoid the initial String allocation for `source` altogether
let source = match &input_content.source {
InputSource::String(s) => {
InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
uris.into_iter()
.filter_map(|raw_uri| {
match create_request(&raw_uri, &input_content.source, &base, extractor) {
Ok(request) => Some(request),
Err(e) => {
warn!("Error creating request: {:?}", e);
None
}
// Cloning is cheap here
c => c.clone(),
};
if let Ok(uri) = Uri::try_from(raw_uri) {
let credentials = credentials(extractor, &uri);
Ok(Some(Request::new(
uri,
source,
element,
attribute,
credentials,
)))
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&text)) {
let uri = Uri { url };
let credentials = credentials(extractor, &uri);
Ok(Some(Request::new(
uri,
source,
element,
attribute,
credentials,
)))
} else if let InputSource::FsPath(root) = &input_content.source {
let path = if is_anchor {
match root.file_name() {
Some(file_name) => match file_name.to_str() {
Some(valid_str) => valid_str.to_string() + &text,
None => return Err(ErrorKind::InvalidFile(root.clone())),
},
None => return Err(ErrorKind::InvalidFile(root.clone())),
}
} else {
text
};
if let Some(url) = create_uri_from_path(root, &path, base)? {
let uri = Uri { url };
let credentials = credentials(extractor, &uri);
Ok(Some(Request::new(
uri,
source,
element,
attribute,
credentials,
)))
} else {
// In case we cannot create a URI from a path but we didn't receive an error,
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
Ok(None)
}
} else if let Some(url) = construct_url(&base_url, &text) {
if base.is_some() {
Ok(None)
} else {
let uri = Uri { url: url? };
let credentials = credentials(extractor, &uri);
Ok(Some(Request::new(
uri,
source,
element,
attribute,
credentials,
)))
}
} else {
info!("Handling of `{}` not implemented yet", text);
Ok(None)
}
})
.collect();
let requests: Vec<Request> = requests?.into_iter().flatten().collect();
Ok(HashSet::from_iter(requests))
.collect()
}
/// Join `text` onto the optional base URL.
///
/// Returns `None` when no base is set; otherwise `Some` with the join result,
/// mapping a parse failure to `ErrorKind::ParseUrl`.
fn construct_url(base: &Option<Url>, text: &str) -> Option<Result<Url>> {
base.as_ref().map(|base| {
base.join(text)
.map_err(|e| ErrorKind::ParseUrl(e, format!("{base}{text}")))
})
}
/// Create a URI from a path
///
/// `src_path` is the path of the source file.
/// `dest_path` is the path being linked to.
/// The optional `base_uri` specifies the base URI to resolve the destination path against.
///
/// # Errors
///
/// - If the percent-decoded destination path cannot be decoded as UTF-8.
/// - The path cannot be resolved
/// - The resolved path cannot be converted to a URL.
fn resolve_and_create_url(
src_path: &Path,
dest_path: &str,
base_uri: &Option<Base>,
) -> Result<Url> {
let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path);
fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Option<Url>> {
let (dst, frag) = url::remove_get_params_and_separate_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
// This behavior cannot be configured.
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
// TODO: This is not a perfect solution.
// Ideally, only `src` and `base` should be URL encoded (as is done by
// `from_file_path` at the moment) while `dst` gets left untouched and simply
// appended to the end.
let decoded = percent_decode_str(dst).decode_utf8()?;
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
match resolved {
Some(path) => Url::from_file_path(&path)
.map(|mut url| {
url.set_fragment(frag);
url
})
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
None => Ok(None),
}
// Decode the destination path to avoid double-encoding
// This addresses the issue mentioned in the original comment about double-encoding
let decoded_dest = percent_decode_str(dest_path).decode_utf8()?;
let Ok(Some(resolved_path)) = path::resolve(src_path, &PathBuf::from(&*decoded_dest), base_uri)
else {
return Err(ErrorKind::InvalidPathToUri(decoded_dest.to_string()));
};
let Ok(mut url) = Url::from_file_path(&resolved_path) else {
return Err(ErrorKind::InvalidUrlFromPath(resolved_path.clone()));
};
url.set_fragment(fragment);
Ok(url)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::types::FileType;
// A `#` is only an anchor marker when it is the first character.
#[test]
fn test_is_anchor() {
assert!(is_anchor("#anchor"));
assert!(!is_anchor("notan#anchor"));
}
#[test]
fn test_create_uri_from_path() {
let result =
create_uri_from_path(&PathBuf::from("/README.md"), "test+encoding", &None).unwrap();
assert_eq!(result.unwrap().as_str(), "file:///test+encoding");
resolve_and_create_url(&PathBuf::from("/README.md"), "test+encoding", &None).unwrap();
assert_eq!(result.as_str(), "file:///test+encoding");
}
/// Build an `InputContent` from a string, mirroring the content into the
/// `String` input source for the tests below.
fn create_input(content: &str, file_type: FileType) -> InputContent {
    let source = InputSource::String(content.to_string());
    InputContent {
        content: content.to_string(),
        file_type,
        source,
    }
}
// A sibling-relative link resolves into the base URL's directory.
#[test]
fn test_relative_url_resolution() {
let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
let input = create_input(
r#"<a href="relative.html">Relative Link</a>"#,
FileType::Html,
);
let uris = vec![RawUri::from("relative.html")];
let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
.iter()
.any(|r| r.uri.url.as_str() == "https://example.com/path/relative.html"));
}
// A fully-qualified URL ignores the base entirely.
#[test]
fn test_absolute_url_resolution() {
let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
let input = create_input(
r#"<a href="https://another.com/page">Absolute Link</a>"#,
FileType::Html,
);
let uris = vec![RawUri::from("https://another.com/page")];
let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
.iter()
.any(|r| r.uri.url.as_str() == "https://another.com/page"));
}
// A root-relative link (leading `/`) resolves against the host root,
// discarding the base URL's path component.
#[test]
fn test_root_relative_url_resolution() {
let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
let input = create_input(
r#"<a href="/root-relative">Root Relative Link</a>"#,
FileType::Html,
);
let uris = vec![RawUri::from("/root-relative")];
let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
.iter()
.any(|r| r.uri.url.as_str() == "https://example.com/root-relative"));
}
// `..` navigates one directory up from the base URL's path.
#[test]
fn test_parent_directory_url_resolution() {
let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
let input = create_input(
r#"<a href="../parent">Parent Directory Link</a>"#,
FileType::Html,
);
let uris = vec![RawUri::from("../parent")];
let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
.iter()
.any(|r| r.uri.url.as_str() == "https://example.com/parent"));
}
// A bare fragment keeps the full base URL and appends the fragment.
#[test]
fn test_fragment_url_resolution() {
let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
let input = create_input(r##"<a href="#fragment">Fragment Link</a>"##, FileType::Html);
let uris = vec![RawUri::from("#fragment")];
let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
.iter()
.any(|r| r.uri.url.as_str() == "https://example.com/path/page.html#fragment"));
}
// Absolute URLs still work without any base configured.
#[test]
fn test_no_base_url_resolution() {
let base = None;
let input = create_input(
r#"<a href="https://example.com/page">Absolute Link</a>"#,
FileType::Html,
);
let uris = vec![RawUri::from("https://example.com/page")];
let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
.iter()
.any(|r| r.uri.url.as_str() == "https://example.com/page"));
}
// A relative link from a file input resolves under the local base directory.
#[test]
fn test_create_request_from_relative_file_path() {
let base = Some(Base::Local(PathBuf::from("/tmp/lychee")));
let input_source = InputSource::FsPath(PathBuf::from("page.html"));
let actual =
create_request(&RawUri::from("file.html"), &input_source, &base, &None).unwrap();
assert_eq!(
actual,
Request::new(
Uri {
url: Url::from_file_path("/tmp/lychee/file.html").unwrap()
},
input_source,
None,
None,
None,
)
);
}
// An absolute file link outside the base directory is kept verbatim:
// `Path::join` with an absolute component replaces the base path.
#[test]
fn test_create_request_from_absolute_file_path() {
let base = Some(Base::Local(PathBuf::from("/tmp/lychee")));
let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html"));
// Use an absolute path that's outside the base directory
let actual = create_request(
&RawUri::from("/usr/local/share/doc/example.html"),
&input_source,
&base,
&None,
)
.unwrap();
assert_eq!(
actual,
Request::new(
Uri {
url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap()
},
input_source,
None,
None,
None,
)
);
}
// A relative link with a local base resolves to a file URI under the base.
#[test]
fn test_parse_relative_path_into_uri() {
let base = Some(Base::Local(PathBuf::from("/tmp/lychee")));
let input = create_input(
r#"<a href="relative.html">Relative Link</a>"#,
FileType::Html,
);
let raw_uri = RawUri::from("relative.html");
let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap();
assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html");
}
// Reviewer note: this test previously parsed `RawUri::from("absolute.html")`
// — a *relative* path — which duplicated `test_parse_relative_path_into_uri`
// and never exercised the absolute-path case its name (and its own HTML
// fixture, `/absolute.html`) promises.
#[test]
fn test_parse_absolute_path_into_uri() {
    let base = Some(Base::Local(PathBuf::from("/tmp/lychee")));
    let input = create_input(
        r#"<a href="/absolute.html">Absolute Link</a>"#,
        FileType::Html,
    );
    // Use the absolute form so the test matches its name and the HTML above.
    let raw_uri = RawUri::from("/absolute.html");
    let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap();
    // `Path::join` with an absolute component discards the local base, so the
    // resulting file URI points at the absolute path itself — consistent with
    // the `/another.html` expectation in the collector's base test.
    assert_eq!(uri.url.as_str(), "file:///absolute.html");
}
}