mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-21 13:34:46 +00:00
This commit introduces several improvements to the file checking process and URI handling:
- Extract file checking logic into separate `Checker` structs (`FileChecker`, `WebsiteChecker`, `MailChecker`)
- Improve handling of relative and absolute file paths
- Enhance URI parsing and creation from file paths
- Refactor `create_request` function for better clarity and error handling

These changes provide better support for resolving relative links, handling different base URLs, and working with file paths.

Fixes https://github.com/lycheeverse/lychee/issues/1296 and https://github.com/lycheeverse/lychee/issues/1480
226 lines
7.4 KiB
Rust
226 lines
7.4 KiB
Rust
use crate::{
|
|
chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
|
|
quirks::Quirks,
|
|
retry::RetryExt,
|
|
types::uri::github::GithubUri,
|
|
BasicAuthCredentials, ErrorKind, Status, Uri,
|
|
};
|
|
use async_trait::async_trait;
|
|
use http::StatusCode;
|
|
use octocrab::Octocrab;
|
|
use reqwest::Request;
|
|
use std::{collections::HashSet, time::Duration};
|
|
|
|
#[derive(Debug, Clone)]
|
|
pub(crate) struct WebsiteChecker {
|
|
/// Request method used for making requests.
|
|
method: reqwest::Method,
|
|
|
|
/// The HTTP client used for requests.
|
|
reqwest_client: reqwest::Client,
|
|
|
|
/// GitHub client used for requests.
|
|
github_client: Option<Octocrab>,
|
|
|
|
/// The chain of plugins to be executed on each request.
|
|
plugin_request_chain: RequestChain,
|
|
|
|
/// Maximum number of retries per request before returning an error.
|
|
max_retries: u64,
|
|
|
|
/// Initial wait time between retries of failed requests. This doubles after
|
|
/// each failure.
|
|
retry_wait_time: Duration,
|
|
|
|
/// Set of accepted return codes / status codes.
|
|
///
|
|
/// Unmatched return codes/ status codes are deemed as errors.
|
|
accepted: Option<HashSet<StatusCode>>,
|
|
|
|
/// Requires using HTTPS when it's available.
|
|
///
|
|
/// This would treat unencrypted links as errors when HTTPS is available.
|
|
require_https: bool,
|
|
}
|
|
|
|
impl WebsiteChecker {
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub(crate) const fn new(
|
|
method: reqwest::Method,
|
|
retry_wait_time: Duration,
|
|
max_retries: u64,
|
|
reqwest_client: reqwest::Client,
|
|
accepted: Option<HashSet<StatusCode>>,
|
|
github_client: Option<Octocrab>,
|
|
require_https: bool,
|
|
plugin_request_chain: RequestChain,
|
|
) -> Self {
|
|
Self {
|
|
method,
|
|
reqwest_client,
|
|
github_client,
|
|
plugin_request_chain,
|
|
max_retries,
|
|
retry_wait_time,
|
|
accepted,
|
|
require_https,
|
|
}
|
|
}
|
|
|
|
/// Retry requests up to `max_retries` times
|
|
/// with an exponential backoff.
|
|
pub(crate) async fn retry_request(&self, request: Request) -> Status {
|
|
let mut retries: u64 = 0;
|
|
let mut wait_time = self.retry_wait_time;
|
|
let mut status = self.check_default(clone_unwrap(&request)).await;
|
|
while retries < self.max_retries {
|
|
if status.is_success() || !status.should_retry() {
|
|
return status;
|
|
}
|
|
retries += 1;
|
|
tokio::time::sleep(wait_time).await;
|
|
wait_time = wait_time.saturating_mul(2);
|
|
status = self.check_default(clone_unwrap(&request)).await;
|
|
}
|
|
status
|
|
}
|
|
|
|
/// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
|
|
async fn check_default(&self, request: Request) -> Status {
|
|
match self.reqwest_client.execute(request).await {
|
|
Ok(ref response) => Status::new(response, self.accepted.clone()),
|
|
Err(e) => e.into(),
|
|
}
|
|
}
|
|
|
|
/// Checks the given URI of a website.
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// This returns an `Err` if
|
|
/// - The URI is invalid.
|
|
/// - The request failed.
|
|
/// - The response status code is not accepted.
|
|
/// - The URI cannot be converted to HTTPS.
|
|
pub(crate) async fn check_website(
|
|
&self,
|
|
uri: &Uri,
|
|
credentials: Option<BasicAuthCredentials>,
|
|
) -> Result<Status, ErrorKind> {
|
|
let default_chain: RequestChain = Chain::new(vec![
|
|
Box::<Quirks>::default(),
|
|
Box::new(credentials),
|
|
Box::new(self.clone()),
|
|
]);
|
|
|
|
match self.check_website_inner(uri, &default_chain).await {
|
|
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
|
|
if self
|
|
.check_website_inner(&uri.to_https()?, &default_chain)
|
|
.await
|
|
.is_success()
|
|
{
|
|
Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
|
|
} else {
|
|
Ok(Status::Ok(code))
|
|
}
|
|
}
|
|
s => Ok(s),
|
|
}
|
|
}
|
|
|
|
/// Checks the given URI of a website.
|
|
///
|
|
/// Unsupported schemes will be ignored
|
|
///
|
|
/// Note: we use `inner` to improve compile times by avoiding monomorphization
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// This returns an `Err` if
|
|
/// - The URI is invalid.
|
|
/// - The request failed.
|
|
/// - The response status code is not accepted.
|
|
async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
|
|
let request = self
|
|
.reqwest_client
|
|
.request(self.method.clone(), uri.as_str())
|
|
.build();
|
|
|
|
let request = match request {
|
|
Ok(r) => r,
|
|
Err(e) => return e.into(),
|
|
};
|
|
|
|
let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
|
|
.traverse(request)
|
|
.await;
|
|
|
|
self.handle_github(status, uri).await
|
|
}
|
|
|
|
// Pull out the heavy machinery in case of a failed normal request.
|
|
// This could be a GitHub URL and we ran into the rate limiter.
|
|
// TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
|
|
async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
|
|
if status.is_success() {
|
|
return status;
|
|
}
|
|
|
|
if let Ok(github_uri) = GithubUri::try_from(uri) {
|
|
let status = self.check_github(github_uri).await;
|
|
if status.is_success() {
|
|
return status;
|
|
}
|
|
}
|
|
|
|
status
|
|
}
|
|
|
|
/// Check a `uri` hosted on `GitHub` via the GitHub API.
|
|
///
|
|
/// # Caveats
|
|
///
|
|
/// Files inside private repositories won't get checked and instead would
|
|
/// be reported as valid if the repository itself is reachable through the
|
|
/// API.
|
|
///
|
|
/// A better approach would be to download the file through the API or
|
|
/// clone the repo, but we chose the pragmatic approach.
|
|
async fn check_github(&self, uri: GithubUri) -> Status {
|
|
let Some(client) = &self.github_client else {
|
|
return ErrorKind::MissingGitHubToken.into();
|
|
};
|
|
let repo = match client.repos(&uri.owner, &uri.repo).get().await {
|
|
Ok(repo) => repo,
|
|
Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
|
|
};
|
|
if let Some(true) = repo.private {
|
|
return Status::Ok(StatusCode::OK);
|
|
} else if let Some(endpoint) = uri.endpoint {
|
|
return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
|
|
.into();
|
|
}
|
|
Status::Ok(StatusCode::OK)
|
|
}
|
|
}
|
|
|
|
/// Clones a `reqwest::Request`.
|
|
///
|
|
/// # Safety
|
|
///
|
|
/// This panics if the request cannot be cloned. This should only happen if the
|
|
/// request body is a `reqwest` stream. We disable the `stream` feature, so the
|
|
/// body should never be a stream.
|
|
///
|
|
/// See <https://github.com/seanmonstar/reqwest/blob/de5dbb1ab849cc301dcefebaeabdf4ce2e0f1e53/src/async_impl/body.rs#L168>
|
|
fn clone_unwrap(request: &Request) -> Request {
|
|
request.try_clone().expect("Failed to clone request: body was a stream, which should be impossible with `stream` feature disabled")
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Handler<Request, Status> for WebsiteChecker {
|
|
async fn handle(&mut self, input: Request) -> ChainResult<Request, Status> {
|
|
ChainResult::Done(self.retry_request(input).await)
|
|
}
|
|
}
|