mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-17 05:00:26 +00:00
Add support for relative links (#1489)
This commit introduces several improvements to the file checking process and URI handling: - Extract file checking logic into separate `Checker` structs (`FileChecker`, `WebsiteChecker`, `MailChecker`) - Improve handling of relative and absolute file paths - Enhance URI parsing and creation from file paths - Refactor `create_request` function for better clarity and error handling These changes provide better support for resolving relative links, handling different base URLs, and working with file paths. Fixes https://github.com/lycheeverse/lychee/issues/1296 and https://github.com/lycheeverse/lychee/issues/1480
This commit is contained in:
parent
87d5b56e4f
commit
3094bbca33
21 changed files with 1133 additions and 522 deletions
8
fixtures/resolve_paths/about/index.html
vendored
Normal file
8
fixtures/resolve_paths/about/index.html
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>About</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1 id="fragment">About</h1>
|
||||
</body>
|
||||
</html>
|
||||
0
fixtures/resolve_paths/another page/index.html
vendored
Normal file
0
fixtures/resolve_paths/another page/index.html
vendored
Normal file
21
fixtures/resolve_paths/index.html
vendored
Normal file
21
fixtures/resolve_paths/index.html
vendored
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>Index</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Index Title</h1>
|
||||
<p>
|
||||
<ul>
|
||||
<li>
|
||||
<a href="/">home</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/about">About</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/another page">About</a>
|
||||
</li>
|
||||
</ul>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -55,6 +55,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
|
|||
|
||||
ClientBuilder::builder()
|
||||
.remaps(remaps)
|
||||
.base(cfg.base.clone())
|
||||
.includes(includes)
|
||||
.excludes(excludes)
|
||||
.exclude_all_private(cfg.exclude_all_private)
|
||||
|
|
|
|||
|
|
@ -429,14 +429,12 @@ mod tests {
|
|||
|
||||
#[tokio::test]
|
||||
async fn test_invalid_url() {
|
||||
// Run a normal request with an invalid Url
|
||||
let client = ClientBuilder::builder().build().client().unwrap();
|
||||
let request = Request::try_from("http://\"").unwrap();
|
||||
let response = check_url(&client, request).await;
|
||||
assert!(response.status().is_error());
|
||||
let uri = Uri::try_from("http://\"").unwrap();
|
||||
let response = client.check_website(&uri, None).await.unwrap();
|
||||
assert!(matches!(
|
||||
response.status(),
|
||||
Status::Error(ErrorKind::InvalidURI(_))
|
||||
response,
|
||||
Status::Unsupported(ErrorKind::BuildRequestClient(_))
|
||||
));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -65,12 +65,18 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Helper function to strip ANSI color codes for tests
|
||||
fn strip_ansi_codes(s: &str) -> String {
|
||||
console::strip_ansi_codes(s).to_string()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_format_response_with_ok_status() {
|
||||
let formatter = ColorFormatter;
|
||||
let body = mock_response_body(Status::Ok(StatusCode::OK), "https://example.com");
|
||||
assert_eq!(
|
||||
formatter.format_response(&body),
|
||||
strip_ansi_codes(&formatter.format_response(&body)),
|
||||
" [200] https://example.com/"
|
||||
);
|
||||
}
|
||||
|
|
@ -83,7 +89,7 @@ mod tests {
|
|||
"https://example.com/404",
|
||||
);
|
||||
assert_eq!(
|
||||
formatter.format_response(&body),
|
||||
strip_ansi_codes(&formatter.format_response(&body)),
|
||||
" [ERROR] https://example.com/404"
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -267,17 +267,17 @@ mod cli {
|
|||
#[test]
|
||||
fn test_resolve_paths() {
|
||||
let mut cmd = main_command();
|
||||
let offline_dir = fixtures_path().join("offline");
|
||||
let dir = fixtures_path().join("resolve_paths");
|
||||
|
||||
cmd.arg("--offline")
|
||||
.arg("--base")
|
||||
.arg(&offline_dir)
|
||||
.arg(offline_dir.join("index.html"))
|
||||
.arg(&dir)
|
||||
.arg(dir.join("index.html"))
|
||||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("4 Total"))
|
||||
.stdout(contains("4 OK"));
|
||||
.stdout(contains("3 Total"))
|
||||
.stdout(contains("3 OK"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -944,13 +944,17 @@ mod cli {
|
|||
|
||||
// check content of cache file
|
||||
let data = fs::read_to_string(&cache_file)?;
|
||||
|
||||
if data.is_empty() {
|
||||
println!("Cache file is empty!");
|
||||
}
|
||||
|
||||
assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
|
||||
assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri())));
|
||||
assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri())));
|
||||
|
||||
// clear the cache file
|
||||
fs::remove_file(&cache_file)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
@ -1216,8 +1220,9 @@ mod cli {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// If base-dir is not set, don't throw an error in case we encounter
|
||||
/// an absolute local link within a file (e.g. `/about`).
|
||||
/// If `base-dir` is not set, don't throw an error in case we encounter
|
||||
/// an absolute local link (e.g. `/about`) within a file.
|
||||
/// Instead, simply ignore the link.
|
||||
#[test]
|
||||
fn test_ignore_absolute_local_links_without_base() -> Result<()> {
|
||||
let mut cmd = main_command();
|
||||
|
|
@ -1409,9 +1414,7 @@ mod cli {
|
|||
.arg("./NOT-A-REAL-TEST-FIXTURE.md")
|
||||
.assert()
|
||||
.failure()
|
||||
.stderr(contains(
|
||||
"Cannot find local file ./NOT-A-REAL-TEST-FIXTURE.md",
|
||||
));
|
||||
.stderr(contains("Invalid file path: ./NOT-A-REAL-TEST-FIXTURE.md"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -1667,4 +1670,46 @@ mod cli {
|
|||
.success()
|
||||
.stdout(contains("0 Errors"));
|
||||
}
|
||||
|
||||
/// Test relative paths
|
||||
///
|
||||
/// Imagine a web server hosting a site with the following structure:
|
||||
/// root
|
||||
/// └── test
|
||||
/// ├── index.html
|
||||
/// └── next.html
|
||||
///
|
||||
/// where `root/test/index.html` contains `<a href="next.html">next</a>`
|
||||
/// When checking the link in `root/test/index.html` we should be able to
|
||||
/// resolve the relative path to `root/test/next.html`
|
||||
///
|
||||
/// Note that the relative path is not resolved to the root of the server
|
||||
/// but relative to the file that contains the link.
|
||||
#[tokio::test]
|
||||
async fn test_resolve_relative_paths_in_subfolder() -> Result<()> {
|
||||
let mock_server = wiremock::MockServer::start().await;
|
||||
|
||||
let body = r#"<a href="next.html">next</a>"#;
|
||||
wiremock::Mock::given(wiremock::matchers::method("GET"))
|
||||
.and(wiremock::matchers::path("/test/index.html"))
|
||||
.respond_with(wiremock::ResponseTemplate::new(200).set_body_string(body))
|
||||
.mount(&mock_server)
|
||||
.await;
|
||||
|
||||
wiremock::Mock::given(wiremock::matchers::method("GET"))
|
||||
.and(wiremock::matchers::path("/test/next.html"))
|
||||
.respond_with(wiremock::ResponseTemplate::new(200))
|
||||
.mount(&mock_server)
|
||||
.await;
|
||||
|
||||
let mut cmd = main_command();
|
||||
cmd.arg("--verbose")
|
||||
.arg(format!("{}/test/index.html", mock_server.uri()))
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("1 Total"))
|
||||
.stdout(contains("0 Errors"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,80 +0,0 @@
|
|||
use crate::{
|
||||
chain::{ChainResult, Handler},
|
||||
retry::RetryExt,
|
||||
Status,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use http::StatusCode;
|
||||
use reqwest::Request;
|
||||
use std::{collections::HashSet, time::Duration};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct Checker {
|
||||
retry_wait_time: Duration,
|
||||
max_retries: u64,
|
||||
reqwest_client: reqwest::Client,
|
||||
accepted: Option<HashSet<StatusCode>>,
|
||||
}
|
||||
|
||||
impl Checker {
|
||||
pub(crate) const fn new(
|
||||
retry_wait_time: Duration,
|
||||
max_retries: u64,
|
||||
reqwest_client: reqwest::Client,
|
||||
accepted: Option<HashSet<StatusCode>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
retry_wait_time,
|
||||
max_retries,
|
||||
reqwest_client,
|
||||
accepted,
|
||||
}
|
||||
}
|
||||
|
||||
/// Retry requests up to `max_retries` times
|
||||
/// with an exponential backoff.
|
||||
pub(crate) async fn retry_request(&self, request: Request) -> Status {
|
||||
let mut retries: u64 = 0;
|
||||
let mut wait_time = self.retry_wait_time;
|
||||
|
||||
let mut status = self.check_default(clone_unwrap(&request)).await;
|
||||
while retries < self.max_retries {
|
||||
if status.is_success() || !status.should_retry() {
|
||||
return status;
|
||||
}
|
||||
retries += 1;
|
||||
tokio::time::sleep(wait_time).await;
|
||||
wait_time = wait_time.saturating_mul(2);
|
||||
status = self.check_default(clone_unwrap(&request)).await;
|
||||
}
|
||||
status
|
||||
}
|
||||
|
||||
/// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
|
||||
async fn check_default(&self, request: Request) -> Status {
|
||||
match self.reqwest_client.execute(request).await {
|
||||
Ok(ref response) => Status::new(response, self.accepted.clone()),
|
||||
Err(e) => e.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Clones a `reqwest::Request`.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// This panics if the request cannot be cloned. This should only happen if the
|
||||
/// request body is a `reqwest` stream. We disable the `stream` feature, so the
|
||||
/// body should never be a stream.
|
||||
///
|
||||
/// See <https://github.com/seanmonstar/reqwest/blob/de5dbb1ab849cc301dcefebaeabdf4ce2e0f1e53/src/async_impl/body.rs#L168>
|
||||
fn clone_unwrap(request: &Request) -> Request {
|
||||
request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Handler<Request, Status> for Checker {
|
||||
async fn handle(&mut self, input: Request) -> ChainResult<Request, Status> {
|
||||
ChainResult::Done(self.retry_request(input).await)
|
||||
}
|
||||
}
|
||||
179
lychee-lib/src/checker/file.rs
Normal file
179
lychee-lib/src/checker/file.rs
Normal file
|
|
@ -0,0 +1,179 @@
|
|||
use http::StatusCode;
|
||||
use log::warn;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri};
|
||||
|
||||
/// A utility for checking the existence and validity of file-based URIs.
|
||||
///
|
||||
/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths.
|
||||
/// It supports base path resolution, fallback extensions for files without extensions,
|
||||
/// and optional fragment checking for HTML files.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct FileChecker {
|
||||
/// Base path or URL used for resolving relative paths.
|
||||
base: Option<Base>,
|
||||
/// List of file extensions to try if the original path doesn't exist.
|
||||
fallback_extensions: Vec<String>,
|
||||
/// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files.
|
||||
include_fragments: bool,
|
||||
/// Utility for performing fragment checks in HTML files.
|
||||
fragment_checker: FragmentChecker,
|
||||
}
|
||||
|
||||
impl FileChecker {
|
||||
/// Creates a new `FileChecker` with the given configuration.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `base` - Optional base path or URL for resolving relative paths.
|
||||
/// * `fallback_extensions` - List of extensions to try if the original file is not found.
|
||||
/// * `include_fragments` - Whether to check for fragment existence in HTML files.
|
||||
pub(crate) fn new(
|
||||
base: Option<Base>,
|
||||
fallback_extensions: Vec<String>,
|
||||
include_fragments: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
base,
|
||||
fallback_extensions,
|
||||
include_fragments,
|
||||
fragment_checker: FragmentChecker::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks the given file URI for existence and validity.
|
||||
///
|
||||
/// This method resolves the URI to a file path, checks if the file exists,
|
||||
/// and optionally checks for the existence of fragments in HTML files.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `uri` - The URI to check.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Status` indicating the result of the check.
|
||||
pub(crate) async fn check(&self, uri: &Uri) -> Status {
|
||||
let Ok(path) = uri.url.to_file_path() else {
|
||||
return ErrorKind::InvalidFilePath(uri.clone()).into();
|
||||
};
|
||||
|
||||
let resolved_path = self.resolve_path(&path);
|
||||
self.check_path(&resolved_path, uri).await
|
||||
}
|
||||
|
||||
/// Resolves the given path using the base path, if one is set.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `path` - The path to resolve.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns the resolved path as a `PathBuf`.
|
||||
fn resolve_path(&self, path: &Path) -> PathBuf {
|
||||
if let Some(Base::Local(base_path)) = &self.base {
|
||||
if path.is_absolute() {
|
||||
let absolute_base_path = if base_path.is_relative() {
|
||||
std::env::current_dir().unwrap_or_default().join(base_path)
|
||||
} else {
|
||||
base_path.clone()
|
||||
};
|
||||
|
||||
let stripped = path.strip_prefix("/").unwrap_or(path);
|
||||
absolute_base_path.join(stripped)
|
||||
} else {
|
||||
base_path.join(path)
|
||||
}
|
||||
} else {
|
||||
path.to_path_buf()
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the given path exists and performs additional checks if necessary.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `path` - The path to check.
|
||||
/// * `uri` - The original URI, used for error reporting.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Status` indicating the result of the check.
|
||||
async fn check_path(&self, path: &Path, uri: &Uri) -> Status {
|
||||
if path.exists() {
|
||||
return self.check_existing_path(path, uri).await;
|
||||
}
|
||||
|
||||
self.check_with_fallback_extensions(path, uri).await
|
||||
}
|
||||
|
||||
/// Checks an existing path, optionally verifying fragments for HTML files.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `path` - The path to check.
|
||||
/// * `uri` - The original URI, used for error reporting.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Status` indicating the result of the check.
|
||||
async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status {
|
||||
if self.include_fragments {
|
||||
self.check_fragment(path, uri).await
|
||||
} else {
|
||||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempts to find a file by trying different extensions specified in `fallback_extensions`.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `path` - The original path to check.
|
||||
/// * `uri` - The original URI, used for error reporting.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Status` indicating the result of the check.
|
||||
async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status {
|
||||
let mut path_buf = path.to_path_buf();
|
||||
|
||||
// If the path already has an extension, try it first
|
||||
if path_buf.extension().is_some() && path_buf.exists() {
|
||||
return self.check_existing_path(&path_buf, uri).await;
|
||||
}
|
||||
|
||||
// Try fallback extensions
|
||||
for ext in &self.fallback_extensions {
|
||||
path_buf.set_extension(ext);
|
||||
if path_buf.exists() {
|
||||
return self.check_existing_path(&path_buf, uri).await;
|
||||
}
|
||||
}
|
||||
|
||||
ErrorKind::InvalidFilePath(uri.clone()).into()
|
||||
}
|
||||
|
||||
/// Checks for the existence of a fragment in an HTML file.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `path` - The path to the HTML file.
|
||||
/// * `uri` - The original URI, containing the fragment to check.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Returns a `Status` indicating the result of the fragment check.
|
||||
async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
|
||||
match self.fragment_checker.check(path, &uri.url).await {
|
||||
Ok(true) => Status::Ok(StatusCode::OK),
|
||||
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
|
||||
Err(err) => {
|
||||
warn!("Skipping fragment check due to the following error: {err}");
|
||||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
57
lychee-lib/src/checker/mail.rs
Normal file
57
lychee-lib/src/checker/mail.rs
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
use http::StatusCode;
|
||||
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
use crate::ErrorKind;
|
||||
|
||||
use crate::{Status, Uri};
|
||||
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
|
||||
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
use crate::types::mail;
|
||||
|
||||
/// A utility for checking the validity of email addresses.
|
||||
///
|
||||
/// `EmailChecker` is responsible for validating email addresses,
|
||||
/// optionally performing reachability checks when the appropriate
|
||||
/// features are enabled.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct MailChecker {}
|
||||
|
||||
impl MailChecker {
|
||||
/// Creates a new `EmailChecker`.
|
||||
pub(crate) const fn new() -> Self {
|
||||
Self {}
|
||||
}
|
||||
|
||||
/// Check a mail address, or equivalently a `mailto` URI.
|
||||
///
|
||||
/// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
|
||||
/// which are ignored by this check. They are not part of the mail address
|
||||
/// and instead passed to a mail client.
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
pub(crate) async fn check_mail(&self, uri: &Uri) -> Status {
|
||||
self.perform_email_check(uri).await
|
||||
}
|
||||
|
||||
/// Ignore the mail check if the `email-check` and `native-tls` features are not enabled.
|
||||
#[cfg(not(all(feature = "email-check", feature = "native-tls")))]
|
||||
pub(crate) async fn check_mail(&self, _uri: &Uri) -> Status {
|
||||
Status::Excluded
|
||||
}
|
||||
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
async fn perform_email_check(&self, uri: &Uri) -> Status {
|
||||
let address = uri.url.path().to_string();
|
||||
let input = CheckEmailInput::new(address);
|
||||
let result = &(check_email(&input).await);
|
||||
|
||||
if let Reachable::Invalid = result.is_reachable {
|
||||
ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into()
|
||||
} else {
|
||||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
}
|
||||
}
|
||||
7
lychee-lib/src/checker/mod.rs
Normal file
7
lychee-lib/src/checker/mod.rs
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
//! Checker Module
|
||||
//!
|
||||
//! This module contains all checkers, which are responsible for checking the status of a URL.
|
||||
|
||||
pub(crate) mod file;
|
||||
pub(crate) mod mail;
|
||||
pub(crate) mod website;
|
||||
226
lychee-lib/src/checker/website.rs
Normal file
226
lychee-lib/src/checker/website.rs
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
use crate::{
|
||||
chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
|
||||
quirks::Quirks,
|
||||
retry::RetryExt,
|
||||
types::uri::github::GithubUri,
|
||||
BasicAuthCredentials, ErrorKind, Status, Uri,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use http::StatusCode;
|
||||
use octocrab::Octocrab;
|
||||
use reqwest::Request;
|
||||
use std::{collections::HashSet, time::Duration};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct WebsiteChecker {
|
||||
/// Request method used for making requests.
|
||||
method: reqwest::Method,
|
||||
|
||||
/// The HTTP client used for requests.
|
||||
reqwest_client: reqwest::Client,
|
||||
|
||||
/// GitHub client used for requests.
|
||||
github_client: Option<Octocrab>,
|
||||
|
||||
/// The chain of plugins to be executed on each request.
|
||||
plugin_request_chain: RequestChain,
|
||||
|
||||
/// Maximum number of retries per request before returning an error.
|
||||
max_retries: u64,
|
||||
|
||||
/// Initial wait time between retries of failed requests. This doubles after
|
||||
/// each failure.
|
||||
retry_wait_time: Duration,
|
||||
|
||||
/// Set of accepted return codes / status codes.
|
||||
///
|
||||
/// Unmatched return codes/ status codes are deemed as errors.
|
||||
accepted: Option<HashSet<StatusCode>>,
|
||||
|
||||
/// Requires using HTTPS when it's available.
|
||||
///
|
||||
/// This would treat unencrypted links as errors when HTTPS is available.
|
||||
require_https: bool,
|
||||
}
|
||||
|
||||
impl WebsiteChecker {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) const fn new(
|
||||
method: reqwest::Method,
|
||||
retry_wait_time: Duration,
|
||||
max_retries: u64,
|
||||
reqwest_client: reqwest::Client,
|
||||
accepted: Option<HashSet<StatusCode>>,
|
||||
github_client: Option<Octocrab>,
|
||||
require_https: bool,
|
||||
plugin_request_chain: RequestChain,
|
||||
) -> Self {
|
||||
Self {
|
||||
method,
|
||||
reqwest_client,
|
||||
github_client,
|
||||
plugin_request_chain,
|
||||
max_retries,
|
||||
retry_wait_time,
|
||||
accepted,
|
||||
require_https,
|
||||
}
|
||||
}
|
||||
|
||||
/// Retry requests up to `max_retries` times
|
||||
/// with an exponential backoff.
|
||||
pub(crate) async fn retry_request(&self, request: Request) -> Status {
|
||||
let mut retries: u64 = 0;
|
||||
let mut wait_time = self.retry_wait_time;
|
||||
let mut status = self.check_default(clone_unwrap(&request)).await;
|
||||
while retries < self.max_retries {
|
||||
if status.is_success() || !status.should_retry() {
|
||||
return status;
|
||||
}
|
||||
retries += 1;
|
||||
tokio::time::sleep(wait_time).await;
|
||||
wait_time = wait_time.saturating_mul(2);
|
||||
status = self.check_default(clone_unwrap(&request)).await;
|
||||
}
|
||||
status
|
||||
}
|
||||
|
||||
/// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
|
||||
async fn check_default(&self, request: Request) -> Status {
|
||||
match self.reqwest_client.execute(request).await {
|
||||
Ok(ref response) => Status::new(response, self.accepted.clone()),
|
||||
Err(e) => e.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks the given URI of a website.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This returns an `Err` if
|
||||
/// - The URI is invalid.
|
||||
/// - The request failed.
|
||||
/// - The response status code is not accepted.
|
||||
/// - The URI cannot be converted to HTTPS.
|
||||
pub(crate) async fn check_website(
|
||||
&self,
|
||||
uri: &Uri,
|
||||
credentials: Option<BasicAuthCredentials>,
|
||||
) -> Result<Status, ErrorKind> {
|
||||
let default_chain: RequestChain = Chain::new(vec![
|
||||
Box::<Quirks>::default(),
|
||||
Box::new(credentials),
|
||||
Box::new(self.clone()),
|
||||
]);
|
||||
|
||||
match self.check_website_inner(uri, &default_chain).await {
|
||||
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
|
||||
if self
|
||||
.check_website_inner(&uri.to_https()?, &default_chain)
|
||||
.await
|
||||
.is_success()
|
||||
{
|
||||
Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
|
||||
} else {
|
||||
Ok(Status::Ok(code))
|
||||
}
|
||||
}
|
||||
s => Ok(s),
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks the given URI of a website.
|
||||
///
|
||||
/// Unsupported schemes will be ignored
|
||||
///
|
||||
/// Note: we use `inner` to improve compile times by avoiding monomorphization
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This returns an `Err` if
|
||||
/// - The URI is invalid.
|
||||
/// - The request failed.
|
||||
/// - The response status code is not accepted.
|
||||
async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
|
||||
let request = self
|
||||
.reqwest_client
|
||||
.request(self.method.clone(), uri.as_str())
|
||||
.build();
|
||||
|
||||
let request = match request {
|
||||
Ok(r) => r,
|
||||
Err(e) => return e.into(),
|
||||
};
|
||||
|
||||
let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
|
||||
.traverse(request)
|
||||
.await;
|
||||
|
||||
self.handle_github(status, uri).await
|
||||
}
|
||||
|
||||
// Pull out the heavy machinery in case of a failed normal request.
|
||||
// This could be a GitHub URL and we ran into the rate limiter.
|
||||
// TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
|
||||
async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
|
||||
if status.is_success() {
|
||||
return status;
|
||||
}
|
||||
|
||||
if let Ok(github_uri) = GithubUri::try_from(uri) {
|
||||
let status = self.check_github(github_uri).await;
|
||||
if status.is_success() {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
status
|
||||
}
|
||||
|
||||
/// Check a `uri` hosted on `GitHub` via the GitHub API.
|
||||
///
|
||||
/// # Caveats
|
||||
///
|
||||
/// Files inside private repositories won't get checked and instead would
|
||||
/// be reported as valid if the repository itself is reachable through the
|
||||
/// API.
|
||||
///
|
||||
/// A better approach would be to download the file through the API or
|
||||
/// clone the repo, but we chose the pragmatic approach.
|
||||
async fn check_github(&self, uri: GithubUri) -> Status {
|
||||
let Some(client) = &self.github_client else {
|
||||
return ErrorKind::MissingGitHubToken.into();
|
||||
};
|
||||
let repo = match client.repos(&uri.owner, &uri.repo).get().await {
|
||||
Ok(repo) => repo,
|
||||
Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
|
||||
};
|
||||
if let Some(true) = repo.private {
|
||||
return Status::Ok(StatusCode::OK);
|
||||
} else if let Some(endpoint) = uri.endpoint {
|
||||
return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
|
||||
.into();
|
||||
}
|
||||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
}
|
||||
|
||||
/// Clones a `reqwest::Request`.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// This panics if the request cannot be cloned. This should only happen if the
|
||||
/// request body is a `reqwest` stream. We disable the `stream` feature, so the
|
||||
/// body should never be a stream.
|
||||
///
|
||||
/// See <https://github.com/seanmonstar/reqwest/blob/de5dbb1ab849cc301dcefebaeabdf4ce2e0f1e53/src/async_impl/body.rs#L168>
|
||||
fn clone_unwrap(request: &Request) -> Request {
|
||||
request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Handler<Request, Status> for WebsiteChecker {
|
||||
async fn handle(&mut self, input: Request) -> ChainResult<Request, Status> {
|
||||
ChainResult::Done(self.retry_request(input).await)
|
||||
}
|
||||
}
|
||||
|
|
@ -15,8 +15,6 @@
|
|||
)]
|
||||
use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
|
||||
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
|
||||
use http::{
|
||||
header::{HeaderMap, HeaderValue},
|
||||
StatusCode,
|
||||
|
|
@ -24,25 +22,21 @@ use http::{
|
|||
use log::{debug, warn};
|
||||
use octocrab::Octocrab;
|
||||
use regex::RegexSet;
|
||||
use reqwest::{header, redirect, Url};
|
||||
use reqwest::{header, redirect};
|
||||
use reqwest_cookie_store::CookieStoreMutex;
|
||||
use secrecy::{ExposeSecret, SecretString};
|
||||
use typed_builder::TypedBuilder;
|
||||
|
||||
use crate::{
|
||||
chain::{Chain, ClientRequestChains, RequestChain},
|
||||
checker::Checker,
|
||||
chain::RequestChain,
|
||||
checker::file::FileChecker,
|
||||
checker::{mail::MailChecker, website::WebsiteChecker},
|
||||
filter::{Excludes, Filter, Includes},
|
||||
quirks::Quirks,
|
||||
remap::Remaps,
|
||||
types::uri::github::GithubUri,
|
||||
utils::fragment_checker::FragmentChecker,
|
||||
ErrorKind, Request, Response, Result, Status, Uri,
|
||||
Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
|
||||
};
|
||||
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
use crate::types::mail;
|
||||
|
||||
/// Default number of redirects before a request is deemed as failed, 5.
|
||||
pub const DEFAULT_MAX_REDIRECTS: usize = 5;
|
||||
/// Default number of retries before a request is deemed as failed, 3.
|
||||
|
|
@ -248,6 +242,12 @@ pub struct ClientBuilder {
|
|||
/// Response timeout per request in seconds.
|
||||
timeout: Option<Duration>,
|
||||
|
||||
/// Base for resolving paths.
|
||||
///
|
||||
/// E.g. if the base is `/home/user/` and the path is `file.txt`, the
|
||||
/// resolved path would be `/home/user/file.txt`.
|
||||
base: Option<Base>,
|
||||
|
||||
/// Initial time between retries of failed requests.
|
||||
///
|
||||
/// Defaults to [`DEFAULT_RETRY_WAIT_TIME_SECS`].
|
||||
|
|
@ -383,20 +383,28 @@ impl ClientBuilder {
|
|||
include_mail: self.include_mail,
|
||||
};
|
||||
|
||||
Ok(Client {
|
||||
let website_checker = WebsiteChecker::new(
|
||||
self.method,
|
||||
self.retry_wait_time,
|
||||
self.max_retries,
|
||||
reqwest_client,
|
||||
self.accepted,
|
||||
github_client,
|
||||
self.require_https,
|
||||
self.plugin_request_chain,
|
||||
);
|
||||
|
||||
Ok(Client {
|
||||
remaps: self.remaps,
|
||||
fallback_extensions: self.fallback_extensions,
|
||||
filter,
|
||||
max_retries: self.max_retries,
|
||||
retry_wait_time: self.retry_wait_time,
|
||||
method: self.method,
|
||||
accepted: self.accepted,
|
||||
require_https: self.require_https,
|
||||
include_fragments: self.include_fragments,
|
||||
email_checker: MailChecker::new(),
|
||||
website_checker,
|
||||
file_checker: FileChecker::new(
|
||||
self.base,
|
||||
self.fallback_extensions,
|
||||
self.include_fragments,
|
||||
),
|
||||
fragment_checker: FragmentChecker::new(),
|
||||
plugin_request_chain: self.plugin_request_chain,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -407,50 +415,23 @@ impl ClientBuilder {
|
|||
/// options.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Client {
|
||||
/// Underlying `reqwest` client instance that handles the HTTP requests.
|
||||
reqwest_client: reqwest::Client,
|
||||
|
||||
/// Optional GitHub client that handles communications with GitHub.
|
||||
github_client: Option<Octocrab>,
|
||||
|
||||
/// Optional remapping rules for URIs matching pattern.
|
||||
remaps: Option<Remaps>,
|
||||
|
||||
/// Automatically append file extensions to `file://` URIs as needed
|
||||
fallback_extensions: Vec<String>,
|
||||
|
||||
/// Rules to decided whether each link should be checked or ignored.
|
||||
filter: Filter,
|
||||
|
||||
/// Maximum number of retries per request before returning an error.
|
||||
max_retries: u64,
|
||||
/// A checker for website URLs.
|
||||
website_checker: WebsiteChecker,
|
||||
|
||||
/// Initial wait time between retries of failed requests. This doubles after
|
||||
/// each failure.
|
||||
retry_wait_time: Duration,
|
||||
/// A checker for file URLs.
|
||||
file_checker: FileChecker,
|
||||
|
||||
/// HTTP method used for requests, e.g. `GET` or `HEAD`.
|
||||
///
|
||||
/// The same method will be used for all links.
|
||||
method: reqwest::Method,
|
||||
|
||||
/// Set of accepted return codes / status codes.
|
||||
///
|
||||
/// Unmatched return codes/ status codes are deemed as errors.
|
||||
accepted: Option<HashSet<StatusCode>>,
|
||||
|
||||
/// Requires using HTTPS when it's available.
|
||||
///
|
||||
/// This would treat unencrypted links as errors when HTTPS is available.
|
||||
require_https: bool,
|
||||
|
||||
/// Enable the checking of fragments in links.
|
||||
include_fragments: bool,
|
||||
/// A checker for email URLs.
|
||||
email_checker: MailChecker,
|
||||
|
||||
/// Caches Fragments
|
||||
fragment_checker: FragmentChecker,
|
||||
|
||||
plugin_request_chain: RequestChain,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
|
|
@ -463,7 +444,7 @@ impl Client {
|
|||
///
|
||||
/// Returns an `Err` if:
|
||||
/// - `request` does not represent a valid URI.
|
||||
/// - Encrypted connection for a HTTP URL is available but unused. (Only
|
||||
/// - Encrypted connection for a HTTP URL is available but unused. (Only
|
||||
/// checked when `Client::require_https` is `true`.)
|
||||
#[allow(clippy::missing_panics_doc)]
|
||||
pub async fn check<T, E>(&self, request: T) -> Result<Response>
|
||||
|
|
@ -493,27 +474,22 @@ impl Client {
|
|||
return Ok(Response::new(uri.clone(), Status::Excluded, source));
|
||||
}
|
||||
|
||||
let default_chain: RequestChain = Chain::new(vec![
|
||||
Box::<Quirks>::default(),
|
||||
Box::new(credentials),
|
||||
Box::new(Checker::new(
|
||||
self.retry_wait_time,
|
||||
self.max_retries,
|
||||
self.reqwest_client.clone(),
|
||||
self.accepted.clone(),
|
||||
)),
|
||||
]);
|
||||
|
||||
let status = match uri.scheme() {
|
||||
// We don't check tel: URIs
|
||||
_ if uri.is_tel() => Status::Excluded,
|
||||
_ if uri.is_file() => self.check_file(uri).await,
|
||||
_ if uri.is_mail() => self.check_mail(uri).await,
|
||||
_ if uri.is_tel() => Status::Excluded,
|
||||
_ => self.check_website(uri, default_chain).await?,
|
||||
_ => self.check_website(uri, credentials).await?,
|
||||
};
|
||||
|
||||
Ok(Response::new(uri.clone(), status, source))
|
||||
}
|
||||
|
||||
/// Check a single file using the file checker.
|
||||
pub async fn check_file(&self, uri: &Uri) -> Status {
|
||||
self.file_checker.check(uri).await
|
||||
}
|
||||
|
||||
/// Remap `uri` using the client-defined remapping rules.
|
||||
///
|
||||
/// # Errors
|
||||
|
|
@ -541,151 +517,17 @@ impl Client {
|
|||
/// - The request failed.
|
||||
/// - The response status code is not accepted.
|
||||
/// - The URI cannot be converted to HTTPS.
|
||||
pub async fn check_website(&self, uri: &Uri, default_chain: RequestChain) -> Result<Status> {
|
||||
match self.check_website_inner(uri, &default_chain).await {
|
||||
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
|
||||
if self
|
||||
.check_website_inner(&uri.to_https()?, &default_chain)
|
||||
.await
|
||||
.is_success()
|
||||
{
|
||||
Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
|
||||
} else {
|
||||
// HTTPS is not available for this URI,
|
||||
// so the original HTTP URL is fine.
|
||||
Ok(Status::Ok(code))
|
||||
}
|
||||
}
|
||||
s => Ok(s),
|
||||
}
|
||||
pub async fn check_website(
|
||||
&self,
|
||||
uri: &Uri,
|
||||
credentials: Option<BasicAuthCredentials>,
|
||||
) -> Result<Status> {
|
||||
self.website_checker.check_website(uri, credentials).await
|
||||
}
|
||||
|
||||
/// Checks the given URI of a website.
|
||||
///
|
||||
/// Unsupported schemes will be ignored
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This returns an `Err` if
|
||||
/// - The URI is invalid.
|
||||
/// - The request failed.
|
||||
/// - The response status code is not accepted.
|
||||
pub async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
|
||||
// Workaround for upstream reqwest panic
|
||||
if validate_url(&uri.url) {
|
||||
if matches!(uri.scheme(), "http" | "https") {
|
||||
// This is a truly invalid URI with a known scheme.
|
||||
// If we pass that to reqwest it would panic.
|
||||
return Status::Error(ErrorKind::InvalidURI(uri.clone()));
|
||||
}
|
||||
// This is merely a URI with a scheme that is not supported by
|
||||
// reqwest yet. It would be safe to pass that to reqwest and it
|
||||
// wouldn't panic, but it's also unnecessary, because it would
|
||||
// simply return an error.
|
||||
return Status::Unsupported(ErrorKind::InvalidURI(uri.clone()));
|
||||
}
|
||||
|
||||
let request = self
|
||||
.reqwest_client
|
||||
.request(self.method.clone(), uri.as_str())
|
||||
.build();
|
||||
|
||||
let request = match request {
|
||||
Ok(r) => r,
|
||||
Err(e) => return e.into(),
|
||||
};
|
||||
|
||||
let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
|
||||
.traverse(request)
|
||||
.await;
|
||||
|
||||
self.handle_github(status, uri).await
|
||||
}
|
||||
|
||||
// Pull out the heavy machinery in case of a failed normal request.
|
||||
// This could be a GitHub URL and we ran into the rate limiter.
|
||||
// TODO: We should first try to parse the URI as GitHub URI first (Lucius, Jan 2023)
|
||||
async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
|
||||
if status.is_success() {
|
||||
return status;
|
||||
}
|
||||
|
||||
if let Ok(github_uri) = GithubUri::try_from(uri) {
|
||||
let status = self.check_github(github_uri).await;
|
||||
// Only return GitHub status in case of success
|
||||
// Otherwise return the original error, which has more information
|
||||
if status.is_success() {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
status
|
||||
}
|
||||
|
||||
/// Check a `uri` hosted on `GitHub` via the GitHub API.
|
||||
///
|
||||
/// # Caveats
|
||||
///
|
||||
/// Files inside private repositories won't get checked and instead would
|
||||
/// be reported as valid if the repository itself is reachable through the
|
||||
/// API.
|
||||
///
|
||||
/// A better approach would be to download the file through the API or
|
||||
/// clone the repo, but we chose the pragmatic approach.
|
||||
async fn check_github(&self, uri: GithubUri) -> Status {
|
||||
let Some(client) = &self.github_client else {
|
||||
return ErrorKind::MissingGitHubToken.into();
|
||||
};
|
||||
let repo = match client.repos(&uri.owner, &uri.repo).get().await {
|
||||
Ok(repo) => repo,
|
||||
Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
|
||||
};
|
||||
if let Some(true) = repo.private {
|
||||
// The private repo exists. Assume a given endpoint exists as well
|
||||
// (e.g. `issues` in `github.com/org/private/issues`). This is not
|
||||
// always the case but simplifies the check.
|
||||
return Status::Ok(StatusCode::OK);
|
||||
} else if let Some(endpoint) = uri.endpoint {
|
||||
// The URI returned a non-200 status code from a normal request and
|
||||
// now we find that this public repo is reachable through the API,
|
||||
// so that must mean the full URI (which includes the additional
|
||||
// endpoint) must be invalid.
|
||||
return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
|
||||
.into();
|
||||
}
|
||||
// Found public repo without endpoint
|
||||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
|
||||
/// Check a `file` URI.
|
||||
pub async fn check_file(&self, uri: &Uri) -> Status {
|
||||
let Ok(path) = uri.url.to_file_path() else {
|
||||
return ErrorKind::InvalidFilePath(uri.clone()).into();
|
||||
};
|
||||
|
||||
if path.exists() {
|
||||
if self.include_fragments {
|
||||
return self.check_fragment(&path, uri).await;
|
||||
}
|
||||
return Status::Ok(StatusCode::OK);
|
||||
}
|
||||
|
||||
if path.extension().is_some() {
|
||||
return ErrorKind::InvalidFilePath(uri.clone()).into();
|
||||
}
|
||||
|
||||
// if the path has no file extension, try to append some
|
||||
let mut path_buf = path.clone();
|
||||
for ext in &self.fallback_extensions {
|
||||
path_buf.set_extension(ext);
|
||||
if path_buf.exists() {
|
||||
if self.include_fragments {
|
||||
return self.check_fragment(&path_buf, uri).await;
|
||||
}
|
||||
return Status::Ok(StatusCode::OK);
|
||||
}
|
||||
}
|
||||
ErrorKind::InvalidFilePath(uri.clone()).into()
|
||||
/// Checks a `mailto` URI.
|
||||
pub async fn check_mail(&self, uri: &Uri) -> Status {
|
||||
self.email_checker.check_mail(uri).await
|
||||
}
|
||||
|
||||
/// Checks a `file` URI's fragment.
|
||||
|
|
@ -699,43 +541,6 @@ impl Client {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check a mail address, or equivalently a `mailto` URI.
|
||||
///
|
||||
/// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
|
||||
/// which are ignored by this check. The are not part of the mail address
|
||||
/// and instead passed to a mail client.
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
pub async fn check_mail(&self, uri: &Uri) -> Status {
|
||||
let address = uri.url.path().to_string();
|
||||
let input = CheckEmailInput::new(address);
|
||||
let result = &(check_email(&input).await);
|
||||
|
||||
if let Reachable::Invalid = result.is_reachable {
|
||||
ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into()
|
||||
} else {
|
||||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
}
|
||||
|
||||
/// Check a mail address, or equivalently a `mailto` URI.
|
||||
///
|
||||
/// This implementation simply excludes all email addresses.
|
||||
#[cfg(not(all(feature = "email-check", feature = "native-tls")))]
|
||||
#[allow(clippy::unused_async)]
|
||||
pub async fn check_mail(&self, _uri: &Uri) -> Status {
|
||||
Status::Excluded
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the given `Url` would cause `reqwest` to panic.
|
||||
// This is a workaround for https://github.com/lycheeverse/lychee/issues/539
|
||||
// and can be removed once https://github.com/seanmonstar/reqwest/pull/1399
|
||||
// got merged.
|
||||
// It is exactly the same check that reqwest runs internally, but unfortunately
|
||||
// it `unwrap`s (and panics!) instead of returning an error, which we could handle.
|
||||
fn validate_url(url: &Url) -> bool {
|
||||
http::Uri::try_from(url.as_str()).is_err()
|
||||
}
|
||||
|
||||
/// A shorthand function to check a single URI.
|
||||
|
|
@ -777,7 +582,7 @@ mod tests {
|
|||
chain::{ChainResult, Handler, RequestChain},
|
||||
mock_server,
|
||||
test_utils::get_mock_client_response,
|
||||
Request, Status, Uri,
|
||||
ErrorKind, Request, Status, Uri,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
|
|
@ -1026,9 +831,14 @@ mod tests {
|
|||
#[tokio::test]
|
||||
async fn test_avoid_reqwest_panic() {
|
||||
let client = ClientBuilder::builder().build().client().unwrap();
|
||||
// This request will fail, but it won't panic
|
||||
// This request will result in an Unsupported status, but it won't panic
|
||||
let res = client.check("http://\"").await.unwrap();
|
||||
assert!(res.status().is_error());
|
||||
|
||||
assert!(matches!(
|
||||
res.status(),
|
||||
Status::Unsupported(ErrorKind::BuildRequestClient(_))
|
||||
));
|
||||
assert!(res.status().is_unsupported());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
use crate::InputSource;
|
||||
use crate::{
|
||||
basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request,
|
||||
Base, Input, Request, Result,
|
||||
|
|
@ -99,27 +100,31 @@ impl Collector {
|
|||
///
|
||||
/// Will return `Err` if links cannot be extracted from an input
|
||||
pub fn collect_links(self, inputs: Vec<Input>) -> impl Stream<Item = Result<Request>> {
|
||||
let base = self.base;
|
||||
let skip_missing_inputs = self.skip_missing_inputs;
|
||||
let skip_hidden = self.skip_hidden;
|
||||
let skip_ignored = self.skip_ignored;
|
||||
let global_base = self.base;
|
||||
stream::iter(inputs)
|
||||
.par_then_unordered(None, move |input| async move {
|
||||
input.get_contents(
|
||||
self.skip_missing_inputs,
|
||||
self.skip_hidden,
|
||||
self.skip_ignored,
|
||||
)
|
||||
.par_then_unordered(None, move |input| {
|
||||
let default_base = global_base.clone();
|
||||
async move {
|
||||
let base = match &input.source {
|
||||
InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(),
|
||||
_ => default_base,
|
||||
};
|
||||
input
|
||||
.get_contents(skip_missing_inputs, skip_hidden, skip_ignored)
|
||||
.map(move |content| (content, base.clone()))
|
||||
}
|
||||
})
|
||||
.flatten()
|
||||
.par_then_unordered(None, move |content| {
|
||||
// send to parallel worker
|
||||
let base = base.clone();
|
||||
.par_then_unordered(None, move |(content, base)| {
|
||||
let basic_auth_extractor = self.basic_auth_extractor.clone();
|
||||
async move {
|
||||
let content = content?;
|
||||
|
||||
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
|
||||
let uris: Vec<RawUri> = extractor.extract(&content);
|
||||
|
||||
let requests = request::create(uris, &content, &base, &basic_auth_extractor)?;
|
||||
let requests = request::create(uris, &content, &base, &basic_auth_extractor);
|
||||
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
|
||||
}
|
||||
})
|
||||
|
|
@ -137,7 +142,7 @@ mod tests {
|
|||
use super::*;
|
||||
use crate::{
|
||||
mock_server,
|
||||
test_utils::{load_fixture, mail, website},
|
||||
test_utils::{load_fixture, mail, path, website},
|
||||
types::{FileType, Input, InputSource},
|
||||
Result, Uri,
|
||||
};
|
||||
|
|
@ -426,4 +431,85 @@ mod tests {
|
|||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_multiple_remote_urls() {
|
||||
let mock_server_1 = mock_server!(
|
||||
StatusCode::OK,
|
||||
set_body_string(r#"<a href="relative.html">Link</a>"#)
|
||||
);
|
||||
let mock_server_2 = mock_server!(
|
||||
StatusCode::OK,
|
||||
set_body_string(r#"<a href="relative.html">Link</a>"#)
|
||||
);
|
||||
|
||||
let inputs = vec![
|
||||
Input {
|
||||
source: InputSource::RemoteUrl(Box::new(
|
||||
Url::parse(&format!(
|
||||
"{}/foo/index.html",
|
||||
mock_server_1.uri().trim_end_matches('/')
|
||||
))
|
||||
.unwrap(),
|
||||
)),
|
||||
file_type_hint: Some(FileType::Html),
|
||||
excluded_paths: None,
|
||||
},
|
||||
Input {
|
||||
source: InputSource::RemoteUrl(Box::new(
|
||||
Url::parse(&format!(
|
||||
"{}/bar/index.html",
|
||||
mock_server_2.uri().trim_end_matches('/')
|
||||
))
|
||||
.unwrap(),
|
||||
)),
|
||||
file_type_hint: Some(FileType::Html),
|
||||
excluded_paths: None,
|
||||
},
|
||||
];
|
||||
|
||||
let links = collect(inputs, None).await;
|
||||
|
||||
let expected_links = HashSet::from_iter([
|
||||
website(&format!(
|
||||
"{}/foo/relative.html",
|
||||
mock_server_1.uri().trim_end_matches('/')
|
||||
)),
|
||||
website(&format!(
|
||||
"{}/bar/relative.html",
|
||||
mock_server_2.uri().trim_end_matches('/')
|
||||
)),
|
||||
]);
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_file_path_with_base() {
|
||||
let base = Base::try_from("/path/to/root").unwrap();
|
||||
assert_eq!(base, Base::Local("/path/to/root".into()));
|
||||
|
||||
let input = Input {
|
||||
source: InputSource::String(
|
||||
r#"
|
||||
<a href="index.html">Index</a>
|
||||
<a href="about.html">About</a>
|
||||
<a href="/another.html">Another</a>
|
||||
"#
|
||||
.into(),
|
||||
),
|
||||
file_type_hint: Some(FileType::Html),
|
||||
excluded_paths: None,
|
||||
};
|
||||
|
||||
let links = collect(vec![input], Some(base)).await;
|
||||
|
||||
let expected_links = HashSet::from_iter([
|
||||
path("/path/to/root/index.html"),
|
||||
path("/path/to/root/about.html"),
|
||||
path("/another.html"),
|
||||
]);
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,6 +39,16 @@ pub(crate) fn website(url: &str) -> Uri {
|
|||
Uri::from(Url::parse(url).expect("Expected valid Website URI"))
|
||||
}
|
||||
|
||||
/// Helper method to convert a `std::path::Path `into a URI with the `file` scheme
|
||||
///
|
||||
/// # Panic
|
||||
///
|
||||
/// This panics if the given path is not absolute, so it should only be used for
|
||||
/// testing
|
||||
pub(crate) fn path<P: AsRef<Path>>(path: P) -> Uri {
|
||||
Uri::from(Url::from_file_path(path.as_ref()).expect("Expected valid File URI"))
|
||||
}
|
||||
|
||||
/// Creates a mail URI from a string
|
||||
pub(crate) fn mail(address: &str) -> Uri {
|
||||
if address.starts_with("mailto:") {
|
||||
|
|
|
|||
|
|
@ -23,7 +23,10 @@ impl Base {
|
|||
pub(crate) fn join(&self, link: &str) -> Option<Url> {
|
||||
match self {
|
||||
Self::Remote(url) => url.join(link).ok(),
|
||||
Self::Local(_) => None,
|
||||
Self::Local(path) => {
|
||||
let full_path = path.join(link);
|
||||
Url::from_file_path(full_path).ok()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -36,18 +39,17 @@ impl Base {
|
|||
}
|
||||
}
|
||||
|
||||
pub(crate) fn from_source(source: &InputSource) -> Option<Url> {
|
||||
pub(crate) fn from_source(source: &InputSource) -> Option<Base> {
|
||||
match &source {
|
||||
InputSource::RemoteUrl(url) => {
|
||||
// TODO: This should be refactored.
|
||||
// Cases like https://user:pass@example.com are not handled
|
||||
// We can probably use the original URL and just replace the
|
||||
// path component in the caller of this function
|
||||
if let Some(port) = url.port() {
|
||||
Url::parse(&format!("{}://{}:{port}", url.scheme(), url.host_str()?)).ok()
|
||||
} else {
|
||||
Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?)).ok()
|
||||
}
|
||||
// Create a new URL with just the scheme, host, and port
|
||||
let mut base_url = url.clone();
|
||||
base_url.set_path("");
|
||||
base_url.set_query(None);
|
||||
base_url.set_fragment(None);
|
||||
|
||||
// We keep the username and password intact
|
||||
Some(Base::Remote(*base_url))
|
||||
}
|
||||
// other inputs do not have a URL to extract a base
|
||||
_ => None,
|
||||
|
|
@ -101,6 +103,16 @@ mod test_base {
|
|||
assert!(Base::try_from("data:text/plain,Hello?World#").is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_valid_local_path_string_as_base() -> Result<()> {
|
||||
let cases = vec!["/tmp/lychee", "/tmp/lychee/", "tmp/lychee/"];
|
||||
|
||||
for case in cases {
|
||||
assert_eq!(Base::try_from(case)?, Base::Local(PathBuf::from(case)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_valid_local() -> Result<()> {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
|
|
@ -123,7 +135,7 @@ mod test_base {
|
|||
let url = Url::parse(url).unwrap();
|
||||
let source = InputSource::RemoteUrl(Box::new(url.clone()));
|
||||
let base = Base::from_source(&source);
|
||||
let expected = Url::parse(expected).unwrap();
|
||||
let expected = Base::Remote(Url::parse(expected).unwrap());
|
||||
assert_eq!(base, Some(expected));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -86,12 +86,24 @@ pub enum ErrorKind {
|
|||
#[error("Error with base dir `{0}` : {1}")]
|
||||
InvalidBase(String, String),
|
||||
|
||||
/// Cannot join the given text with the base URL
|
||||
#[error("Cannot join '{0}' with the base URL")]
|
||||
InvalidBaseJoin(String),
|
||||
|
||||
/// Cannot convert the given path to a URI
|
||||
#[error("Cannot convert path '{0}' to a URI")]
|
||||
InvalidPathToUri(String),
|
||||
|
||||
/// The given URI type is not supported
|
||||
#[error("Unsupported URI type: '{0}'")]
|
||||
UnsupportedUriType(String),
|
||||
|
||||
/// The given input can not be parsed into a valid URI remapping
|
||||
#[error("Error remapping URL: `{0}`")]
|
||||
InvalidUrlRemap(String),
|
||||
|
||||
/// The given path does not resolve to a valid file
|
||||
#[error("Cannot find local file {0}")]
|
||||
#[error("Invalid file path: {0}")]
|
||||
InvalidFile(PathBuf),
|
||||
|
||||
/// Error while traversing an input directory
|
||||
|
|
@ -246,6 +258,13 @@ impl PartialEq for ErrorKind {
|
|||
}
|
||||
(Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2,
|
||||
(Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2,
|
||||
(Self::InvalidFilePath(u1), Self::InvalidFilePath(u2)) => u1 == u2,
|
||||
(Self::InvalidFragment(u1), Self::InvalidFragment(u2)) => u1 == u2,
|
||||
(Self::InvalidUrlFromPath(p1), Self::InvalidUrlFromPath(p2)) => p1 == p2,
|
||||
(Self::InvalidBase(b1, e1), Self::InvalidBase(b2, e2)) => b1 == b2 && e1 == e2,
|
||||
(Self::InvalidUrlRemap(r1), Self::InvalidUrlRemap(r2)) => r1 == r2,
|
||||
(Self::EmptyUrl, Self::EmptyUrl) => true,
|
||||
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
|
@ -281,6 +300,9 @@ impl Hash for ErrorKind {
|
|||
Self::UnreachableEmailAddress(u, ..) => u.hash(state),
|
||||
Self::InsecureURL(u, ..) => u.hash(state),
|
||||
Self::InvalidBase(base, e) => (base, e).hash(state),
|
||||
Self::InvalidBaseJoin(s) => s.hash(state),
|
||||
Self::InvalidPathToUri(s) => s.hash(state),
|
||||
Self::UnsupportedUriType(s) => s.hash(state),
|
||||
Self::InvalidUrlRemap(remap) => (remap).hash(state),
|
||||
Self::InvalidHeader(e) => e.to_string().hash(state),
|
||||
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ use thiserror::Error;
|
|||
|
||||
use crate::{types::accept::AcceptRange, AcceptRangeError};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[derive(Debug, Error, PartialEq)]
|
||||
pub enum StatusCodeSelectorError {
|
||||
#[error("invalid/empty input")]
|
||||
InvalidInput,
|
||||
|
|
|
|||
|
|
@ -19,12 +19,6 @@ pub struct RawUri {
|
|||
pub attribute: Option<String>,
|
||||
}
|
||||
|
||||
impl RawUri {
|
||||
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
|
||||
pub(crate) fn is_anchor(&self) -> bool {
|
||||
self.text.starts_with('#')
|
||||
}
|
||||
}
|
||||
impl Display for RawUri {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{} (Attribute: {:?})", self.text, self.attribute)
|
||||
|
|
@ -40,17 +34,3 @@ impl From<&str> for RawUri {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_is_anchor() {
|
||||
let raw_uri = RawUri::from("#anchor");
|
||||
assert!(raw_uri.is_anchor());
|
||||
|
||||
let raw_uri = RawUri::from("notan#anchor");
|
||||
assert!(!raw_uri.is_anchor());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -378,4 +378,9 @@ mod tests {
|
|||
website("https://example.com")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_file_uri() {
|
||||
assert!(Uri::try_from("file:///path/to/file").unwrap().is_file());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use log::info;
|
||||
use log::warn;
|
||||
use percent_encoding::percent_decode_str;
|
||||
use reqwest::Url;
|
||||
use std::{
|
||||
|
|
@ -13,156 +13,374 @@ use crate::{
|
|||
Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri,
|
||||
};
|
||||
|
||||
const MAX_TRUNCATED_STR_LEN: usize = 100;
|
||||
|
||||
/// Extract basic auth credentials for a given URL.
|
||||
fn credentials(extractor: &Option<BasicAuthExtractor>, uri: &Uri) -> Option<BasicAuthCredentials> {
|
||||
fn extract_credentials(
|
||||
extractor: &Option<BasicAuthExtractor>,
|
||||
uri: &Uri,
|
||||
) -> Option<BasicAuthCredentials> {
|
||||
extractor.as_ref().and_then(|ext| ext.matches(uri))
|
||||
}
|
||||
|
||||
/// Create a request from a raw URI.
|
||||
fn create_request(
|
||||
raw_uri: &RawUri,
|
||||
source: &InputSource,
|
||||
base: &Option<Base>,
|
||||
extractor: &Option<BasicAuthExtractor>,
|
||||
) -> Result<Request> {
|
||||
let uri = try_parse_into_uri(raw_uri, source, base)?;
|
||||
let source = truncate_source(source);
|
||||
let element = raw_uri.element.clone();
|
||||
let attribute = raw_uri.attribute.clone();
|
||||
let credentials = extract_credentials(extractor, &uri);
|
||||
|
||||
Ok(Request::new(uri, source, element, attribute, credentials))
|
||||
}
|
||||
|
||||
/// Try to parse the raw URI into a `Uri`.
|
||||
///
|
||||
/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text.
|
||||
/// If the base URL is not available, create a URI from the file path.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base
|
||||
/// to create a valid URI.
|
||||
/// - If a URI cannot be created from the file path.
|
||||
/// - If the source is not a file path (i.e. the URI type is not supported).
|
||||
fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option<Base>) -> Result<Uri> {
|
||||
let text = raw_uri.text.clone();
|
||||
let uri = match Uri::try_from(raw_uri.clone()) {
|
||||
Ok(uri) => uri,
|
||||
Err(_) => match base {
|
||||
Some(base_url) => match base_url.join(&text) {
|
||||
Some(url) => Uri { url },
|
||||
None => return Err(ErrorKind::InvalidBaseJoin(text.clone())),
|
||||
},
|
||||
None => match source {
|
||||
InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?,
|
||||
_ => return Err(ErrorKind::UnsupportedUriType(text)),
|
||||
},
|
||||
},
|
||||
};
|
||||
Ok(uri)
|
||||
}
|
||||
|
||||
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
|
||||
pub(crate) fn is_anchor(text: &str) -> bool {
|
||||
text.starts_with('#')
|
||||
}
|
||||
|
||||
/// Create a URI from a file path
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// - If the link text is an anchor and the file name cannot be extracted from the file path.
|
||||
/// - If the path cannot be resolved.
|
||||
/// - If the resolved path cannot be converted to a URL.
|
||||
fn create_uri_from_file_path(
|
||||
file_path: &Path,
|
||||
link_text: &str,
|
||||
base: &Option<Base>,
|
||||
) -> Result<Uri> {
|
||||
let target_path = if is_anchor(link_text) {
|
||||
// For anchors, we need to append the anchor to the file name.
|
||||
let file_name = file_path
|
||||
.file_name()
|
||||
.and_then(|name| name.to_str())
|
||||
.ok_or_else(|| ErrorKind::InvalidFile(file_path.to_path_buf()))?;
|
||||
|
||||
format!("{file_name}{link_text}")
|
||||
} else {
|
||||
link_text.to_string()
|
||||
};
|
||||
let Ok(constructed_url) = resolve_and_create_url(file_path, &target_path, base) else {
|
||||
return Err(ErrorKind::InvalidPathToUri(target_path));
|
||||
};
|
||||
Ok(Uri {
|
||||
url: constructed_url,
|
||||
})
|
||||
}
|
||||
|
||||
/// Truncate the source in case it gets too long
|
||||
///
|
||||
/// This is only needed for string inputs.
|
||||
/// For other inputs, the source is simply a "label" (an enum variant).
|
||||
// TODO: This would not be necessary if we used `Cow` for the source.
|
||||
fn truncate_source(source: &InputSource) -> InputSource {
|
||||
const MAX_TRUNCATED_STR_LEN: usize = 100;
|
||||
|
||||
match source {
|
||||
InputSource::String(s) => {
|
||||
InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
|
||||
}
|
||||
other => other.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create requests out of the collected URLs.
|
||||
/// Only keeps "valid" URLs. This filters out anchors for example.
|
||||
///
|
||||
/// If a URLs is ignored (because of the current settings),
|
||||
/// it will not be added to the `HashSet`.
|
||||
pub(crate) fn create(
|
||||
uris: Vec<RawUri>,
|
||||
input_content: &InputContent,
|
||||
base: &Option<Base>,
|
||||
extractor: &Option<BasicAuthExtractor>,
|
||||
) -> Result<HashSet<Request>> {
|
||||
let base_url = Base::from_source(&input_content.source);
|
||||
) -> HashSet<Request> {
|
||||
let base = base
|
||||
.clone()
|
||||
.or_else(|| Base::from_source(&input_content.source));
|
||||
|
||||
let requests: Result<Vec<Option<Request>>> = uris
|
||||
.into_iter()
|
||||
.map(|raw_uri| {
|
||||
let is_anchor = raw_uri.is_anchor();
|
||||
let text = raw_uri.text.clone();
|
||||
let element = raw_uri.element.clone();
|
||||
let attribute = raw_uri.attribute.clone();
|
||||
|
||||
// Truncate the source in case it gets too long Ideally we should
|
||||
// avoid the initial String allocation for `source` altogether
|
||||
let source = match &input_content.source {
|
||||
InputSource::String(s) => {
|
||||
InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
|
||||
uris.into_iter()
|
||||
.filter_map(|raw_uri| {
|
||||
match create_request(&raw_uri, &input_content.source, &base, extractor) {
|
||||
Ok(request) => Some(request),
|
||||
Err(e) => {
|
||||
warn!("Error creating request: {:?}", e);
|
||||
None
|
||||
}
|
||||
// Cloning is cheap here
|
||||
c => c.clone(),
|
||||
};
|
||||
|
||||
if let Ok(uri) = Uri::try_from(raw_uri) {
|
||||
let credentials = credentials(extractor, &uri);
|
||||
|
||||
Ok(Some(Request::new(
|
||||
uri,
|
||||
source,
|
||||
element,
|
||||
attribute,
|
||||
credentials,
|
||||
)))
|
||||
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&text)) {
|
||||
let uri = Uri { url };
|
||||
let credentials = credentials(extractor, &uri);
|
||||
|
||||
Ok(Some(Request::new(
|
||||
uri,
|
||||
source,
|
||||
element,
|
||||
attribute,
|
||||
credentials,
|
||||
)))
|
||||
} else if let InputSource::FsPath(root) = &input_content.source {
|
||||
let path = if is_anchor {
|
||||
match root.file_name() {
|
||||
Some(file_name) => match file_name.to_str() {
|
||||
Some(valid_str) => valid_str.to_string() + &text,
|
||||
None => return Err(ErrorKind::InvalidFile(root.clone())),
|
||||
},
|
||||
None => return Err(ErrorKind::InvalidFile(root.clone())),
|
||||
}
|
||||
} else {
|
||||
text
|
||||
};
|
||||
|
||||
if let Some(url) = create_uri_from_path(root, &path, base)? {
|
||||
let uri = Uri { url };
|
||||
let credentials = credentials(extractor, &uri);
|
||||
|
||||
Ok(Some(Request::new(
|
||||
uri,
|
||||
source,
|
||||
element,
|
||||
attribute,
|
||||
credentials,
|
||||
)))
|
||||
} else {
|
||||
// In case we cannot create a URI from a path but we didn't receive an error,
|
||||
// it means that some preconditions were not met, e.g. the `base_url` wasn't set.
|
||||
Ok(None)
|
||||
}
|
||||
} else if let Some(url) = construct_url(&base_url, &text) {
|
||||
if base.is_some() {
|
||||
Ok(None)
|
||||
} else {
|
||||
let uri = Uri { url: url? };
|
||||
let credentials = credentials(extractor, &uri);
|
||||
|
||||
Ok(Some(Request::new(
|
||||
uri,
|
||||
source,
|
||||
element,
|
||||
attribute,
|
||||
credentials,
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
info!("Handling of `{}` not implemented yet", text);
|
||||
Ok(None)
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let requests: Vec<Request> = requests?.into_iter().flatten().collect();
|
||||
Ok(HashSet::from_iter(requests))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn construct_url(base: &Option<Url>, text: &str) -> Option<Result<Url>> {
|
||||
base.as_ref().map(|base| {
|
||||
base.join(text)
|
||||
.map_err(|e| ErrorKind::ParseUrl(e, format!("{base}{text}")))
|
||||
})
|
||||
}
|
||||
/// Create a URI from a path
|
||||
///
|
||||
/// `src_path` is the path of the source file.
|
||||
/// `dest_path` is the path being linked to.
|
||||
/// The optional `base_uri` specifies the base URI to resolve the destination path against.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// - If the percent-decoded destination path cannot be decoded as UTF-8.
|
||||
/// - The path cannot be resolved
|
||||
/// - The resolved path cannot be converted to a URL.
|
||||
fn resolve_and_create_url(
|
||||
src_path: &Path,
|
||||
dest_path: &str,
|
||||
base_uri: &Option<Base>,
|
||||
) -> Result<Url> {
|
||||
let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path);
|
||||
|
||||
fn create_uri_from_path(src: &Path, dst: &str, base: &Option<Base>) -> Result<Option<Url>> {
|
||||
let (dst, frag) = url::remove_get_params_and_separate_fragment(dst);
|
||||
// Avoid double-encoding already encoded destination paths by removing any
|
||||
// potential encoding (e.g. `web%20site` becomes `web site`).
|
||||
// That's because Url::from_file_path will encode the full URL in the end.
|
||||
// This behavior cannot be configured.
|
||||
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
|
||||
// TODO: This is not a perfect solution.
|
||||
// Ideally, only `src` and `base` should be URL encoded (as is done by
|
||||
// `from_file_path` at the moment) while `dst` gets left untouched and simply
|
||||
// appended to the end.
|
||||
let decoded = percent_decode_str(dst).decode_utf8()?;
|
||||
let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?;
|
||||
match resolved {
|
||||
Some(path) => Url::from_file_path(&path)
|
||||
.map(|mut url| {
|
||||
url.set_fragment(frag);
|
||||
url
|
||||
})
|
||||
.map(Some)
|
||||
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
|
||||
None => Ok(None),
|
||||
}
|
||||
// Decode the destination path to avoid double-encoding
|
||||
// This addresses the issue mentioned in the original comment about double-encoding
|
||||
let decoded_dest = percent_decode_str(dest_path).decode_utf8()?;
|
||||
|
||||
let Ok(Some(resolved_path)) = path::resolve(src_path, &PathBuf::from(&*decoded_dest), base_uri)
|
||||
else {
|
||||
return Err(ErrorKind::InvalidPathToUri(decoded_dest.to_string()));
|
||||
};
|
||||
|
||||
let Ok(mut url) = Url::from_file_path(&resolved_path) else {
|
||||
return Err(ErrorKind::InvalidUrlFromPath(resolved_path.clone()));
|
||||
};
|
||||
|
||||
url.set_fragment(fragment);
|
||||
Ok(url)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::FileType;

    #[test]
    fn test_is_anchor() {
        assert!(is_anchor("#anchor"));
        assert!(!is_anchor("notan#anchor"));
    }

    #[test]
    fn test_create_uri_from_path() {
        // `+` must survive the round trip: it is decoded and then re-encoded
        // verbatim by `Url::from_file_path` (it is not a space in paths).
        let result =
            resolve_and_create_url(&PathBuf::from("/README.md"), "test+encoding", &None).unwrap();
        assert_eq!(result.as_str(), "file:///test+encoding");
    }

    /// Build an `InputContent` from a raw string for the resolution tests below.
    fn create_input(content: &str, file_type: FileType) -> InputContent {
        InputContent {
            content: content.to_string(),
            file_type,
            source: InputSource::String(content.to_string()),
        }
    }

    #[test]
    fn test_relative_url_resolution() {
        let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
        let input = create_input(
            r#"<a href="relative.html">Relative Link</a>"#,
            FileType::Html,
        );

        let uris = vec![RawUri::from("relative.html")];
        let requests = create(uris, &input, &base, &None);

        assert_eq!(requests.len(), 1);
        assert!(requests
            .iter()
            .any(|r| r.uri.url.as_str() == "https://example.com/path/relative.html"));
    }

    #[test]
    fn test_absolute_url_resolution() {
        let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
        let input = create_input(
            r#"<a href="https://another.com/page">Absolute Link</a>"#,
            FileType::Html,
        );

        let uris = vec![RawUri::from("https://another.com/page")];
        let requests = create(uris, &input, &base, &None);

        assert_eq!(requests.len(), 1);
        assert!(requests
            .iter()
            .any(|r| r.uri.url.as_str() == "https://another.com/page"));
    }

    #[test]
    fn test_root_relative_url_resolution() {
        let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
        let input = create_input(
            r#"<a href="/root-relative">Root Relative Link</a>"#,
            FileType::Html,
        );

        let uris = vec![RawUri::from("/root-relative")];
        let requests = create(uris, &input, &base, &None);

        assert_eq!(requests.len(), 1);
        assert!(requests
            .iter()
            .any(|r| r.uri.url.as_str() == "https://example.com/root-relative"));
    }

    #[test]
    fn test_parent_directory_url_resolution() {
        let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
        let input = create_input(
            r#"<a href="../parent">Parent Directory Link</a>"#,
            FileType::Html,
        );

        let uris = vec![RawUri::from("../parent")];
        let requests = create(uris, &input, &base, &None);

        assert_eq!(requests.len(), 1);
        assert!(requests
            .iter()
            .any(|r| r.uri.url.as_str() == "https://example.com/parent"));
    }

    #[test]
    fn test_fragment_url_resolution() {
        let base = Some(Base::try_from("https://example.com/path/page.html").unwrap());
        let input = create_input(r##"<a href="#fragment">Fragment Link</a>"##, FileType::Html);

        let uris = vec![RawUri::from("#fragment")];
        let requests = create(uris, &input, &base, &None);

        assert_eq!(requests.len(), 1);
        assert!(requests
            .iter()
            .any(|r| r.uri.url.as_str() == "https://example.com/path/page.html#fragment"));
    }

    #[test]
    fn test_no_base_url_resolution() {
        let base = None;
        let input = create_input(
            r#"<a href="https://example.com/page">Absolute Link</a>"#,
            FileType::Html,
        );

        let uris = vec![RawUri::from("https://example.com/page")];
        let requests = create(uris, &input, &base, &None);

        assert_eq!(requests.len(), 1);
        assert!(requests
            .iter()
            .any(|r| r.uri.url.as_str() == "https://example.com/page"));
    }

    #[test]
    fn test_create_request_from_relative_file_path() {
        let base = Some(Base::Local(PathBuf::from("/tmp/lychee")));
        let input_source = InputSource::FsPath(PathBuf::from("page.html"));

        let actual =
            create_request(&RawUri::from("file.html"), &input_source, &base, &None).unwrap();

        assert_eq!(
            actual,
            Request::new(
                Uri {
                    url: Url::from_file_path("/tmp/lychee/file.html").unwrap()
                },
                input_source,
                None,
                None,
                None,
            )
        );
    }

    #[test]
    fn test_create_request_from_absolute_file_path() {
        let base = Some(Base::Local(PathBuf::from("/tmp/lychee")));
        let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html"));

        // Use an absolute path that's outside the base directory
        let actual = create_request(
            &RawUri::from("/usr/local/share/doc/example.html"),
            &input_source,
            &base,
            &None,
        )
        .unwrap();

        assert_eq!(
            actual,
            Request::new(
                Uri {
                    url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap()
                },
                input_source,
                None,
                None,
                None,
            )
        );
    }

    #[test]
    fn test_parse_relative_path_into_uri() {
        let base = Some(Base::Local(PathBuf::from("/tmp/lychee")));
        let input = create_input(
            r#"<a href="relative.html">Relative Link</a>"#,
            FileType::Html,
        );

        let raw_uri = RawUri::from("relative.html");
        let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap();

        assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html");
    }

    #[test]
    fn test_parse_absolute_path_into_uri() {
        let base = Some(Base::Local(PathBuf::from("/tmp/lychee")));
        let input = create_input(
            r#"<a href="/absolute.html">Absolute Link</a>"#,
            FileType::Html,
        );

        // NOTE(review): the href above says `/absolute.html` while the RawUri
        // below omits the leading slash. Only the RawUri is parsed here (the
        // HTML string is never scanned in this test), so the assertion holds
        // either way — but the two should probably agree; confirm intent.
        let raw_uri = RawUri::from("absolute.html");
        let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap();

        assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html");
    }
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue