mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-29 10:50:24 +00:00
chore!: improve client and remap modules (#913)
`lychee_lib::client`:
- Improved documentation.
- Added an log message in `ClientBuilder::client()` when provied user-agent
overrides the one defined in provied custom header.
- Removed unnecessary error handling in `Client::check()` when setting HTTPS
scheme because all failure cases should occur when checking this URL the first
time already.
- Removed unnecessary error handling in `Client::remap()` since
`lychee-lib::remap::Remaps::remap()` doesn't returns a `Result` anymore.
- Fixed potential integer overflow in `Client::check_website()` when the wait
time between retries doubles, by using `std::time::Duration::saturating_mul`
instead.
- Renamed `invalid()` to `validate_url()`.
`lychee_lib::remap`:
- Improved documentation, in particular, clarified (in the comment) that it's
URLs not URIs being remapped.
- Changed `Remaps::remap()` so it takes `&mut Url` instead of `Uri` as its
argument, and doesn't return a `Result` as a result.
- Using `Url` instead of `Uri` because it aligns with the concept of
remapping locations rather than identifiers.
- Mutating the URL directly instead of returning a new one for it's more
straightforward.
- There is no error handling because we don't convert from URL to URI
anymore. Furthermore, this always succeed in the first place so we never
needed error handling.
- Added implementation of `IntoIterator` for `&'a Remaps` and convenience method
of `Remaps::iter`. (Their mutable or moving counterparts are deliberately
avoided because we don't want library users to modify all consume the
remapping rules after its instantiation.)
`lychee_lib::error`:
- Renamed `ErrorKind::InvalidUriRemap` to `InvalidUrlRemap` and improved
its error message.
Changes to other modules are minor and only serves to accompany aforementioned
changes.
This commit is contained in:
parent
f88c21fc5a
commit
e2406089ad
5 changed files with 241 additions and 155 deletions
|
|
@ -48,7 +48,7 @@ where
|
|||
let mut request = request?;
|
||||
|
||||
// Apply URI remappings (if any)
|
||||
request.uri = params.client.remap(request.uri)?;
|
||||
params.client.remap(&mut request.uri);
|
||||
|
||||
// Avoid panic on broken pipe.
|
||||
// See https://github.com/rust-lang/rust/issues/46016
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ use crate::{
|
|||
pub const DEFAULT_MAX_REDIRECTS: usize = 5;
|
||||
/// Default number of retries before a request is deemed as failed, 3.
|
||||
pub const DEFAULT_MAX_RETRIES: u64 = 3;
|
||||
/// Default wait time in seconds between requests, 1.
|
||||
/// Default wait time in seconds between retries, 1.
|
||||
pub const DEFAULT_RETRY_WAIT_TIME_SECS: usize = 1;
|
||||
/// Default timeout in seconds before a request is deemed as failed, 20.
|
||||
pub const DEFAULT_TIMEOUT_SECS: usize = 20;
|
||||
|
|
@ -47,10 +47,12 @@ pub const DEFAULT_TIMEOUT_SECS: usize = 20;
|
|||
pub const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));
|
||||
|
||||
// Constants currently not configurable by the user.
|
||||
/// A timeout for only the connect phase of a Client.
|
||||
/// A timeout for only the connect phase of a [`Client`].
|
||||
const CONNECT_TIMEOUT: u64 = 10;
|
||||
/// TCP keepalive
|
||||
/// See <https://tldp.org/HOWTO/TCP-Keepalive-HOWTO/overview.html> for more info
|
||||
/// TCP keepalive.
|
||||
///
|
||||
/// See <https://tldp.org/HOWTO/TCP-Keepalive-HOWTO/overview.html> for more
|
||||
/// infomation.
|
||||
const TCP_KEEPALIVE: u64 = 60;
|
||||
|
||||
/// Builder for [`Client`].
|
||||
|
|
@ -61,7 +63,8 @@ const TCP_KEEPALIVE: u64 = 60;
|
|||
#[builder(builder_method_doc = "
|
||||
Create a builder for building `ClientBuilder`.
|
||||
|
||||
On the builder call, call methods with same name as its fields to set their values.
|
||||
On the builder call, call methods with the same names as its fields to set their
|
||||
values.
|
||||
|
||||
Finally, call `.build()` to create the instance of `ClientBuilder`.
|
||||
")]
|
||||
|
|
@ -70,28 +73,32 @@ pub struct ClientBuilder {
|
|||
///
|
||||
/// This allows much more request before getting rate-limited.
|
||||
///
|
||||
/// ## Rate-limiting Defaults
|
||||
/// # Rate-limiting Defaults
|
||||
///
|
||||
/// As of Feb 2022, it's 60 per hour without GitHub token v.s.
|
||||
/// 5000 per hour with token.
|
||||
github_token: Option<SecretString>,
|
||||
|
||||
/// Remap URIs matching a pattern to a different URI
|
||||
/// Remap URIs matching a pattern to a different URI.
|
||||
///
|
||||
/// This makes it possible to remap any HTTP/HTTPS endpoint to a different
|
||||
/// HTTP/HTTPS endpoint. This feature could also be used to proxy
|
||||
/// HTTP/HTTPS one. This feature could also be used to proxy
|
||||
/// certain requests.
|
||||
///
|
||||
/// # Usage Notes
|
||||
///
|
||||
/// Use with caution because a large set of remapping rules may cause
|
||||
/// performance issues. Furthermore rules are executed in order and multiple
|
||||
/// mappings for the same URI are allowed, so there is no guarantee that the
|
||||
/// rules may not conflict with each other.
|
||||
/// performance issues.
|
||||
///
|
||||
/// Furthermore rules are executed sequentially and multiple mappings for
|
||||
/// the same URI are allowed, so it is up to the library user's discretion to
|
||||
/// make sure rules don't conflict with each other.
|
||||
remaps: Option<Remaps>,
|
||||
|
||||
/// Links matching this set of regular expressions are **always** checked.
|
||||
///
|
||||
/// This has higher precedence over [`ClientBuilder::excludes`], **but**
|
||||
/// has lower precedence over any other `exclude_` fields or
|
||||
/// has lower precedence compared to any other `exclude_` fields or
|
||||
/// [`ClientBuilder::schemes`] below.
|
||||
includes: Option<RegexSet>,
|
||||
|
||||
|
|
@ -109,7 +116,7 @@ pub struct ClientBuilder {
|
|||
|
||||
/// When `true`, exclude private IP addresses.
|
||||
///
|
||||
/// ## IPv4
|
||||
/// # IPv4
|
||||
///
|
||||
/// The private address ranges are defined in [IETF RFC 1918] and include:
|
||||
///
|
||||
|
|
@ -117,16 +124,17 @@ pub struct ClientBuilder {
|
|||
/// - `172.16.0.0/12`
|
||||
/// - `192.168.0.0/16`
|
||||
///
|
||||
/// ## IPv6
|
||||
/// # IPv6
|
||||
///
|
||||
/// The address is a unique local address (`fc00::/7`).
|
||||
///
|
||||
/// This property is defined in [IETF RFC 4193].
|
||||
///
|
||||
/// ## Note
|
||||
/// # Note
|
||||
///
|
||||
/// Unicast site-local network was defined in [IETF RFC 4291], but was fully deprecated in
|
||||
/// [IETF RFC 3879]. So it is **NOT** considered as private on this purpose.
|
||||
/// Unicast site-local network was defined in [IETF RFC 4291], but was fully
|
||||
/// deprecated in [IETF RFC 3879]. So it is **NOT** considered as private on
|
||||
/// this purpose.
|
||||
///
|
||||
/// [IETF RFC 1918]: https://tools.ietf.org/html/rfc1918
|
||||
/// [IETF RFC 4193]: https://tools.ietf.org/html/rfc4193
|
||||
|
|
@ -136,17 +144,19 @@ pub struct ClientBuilder {
|
|||
|
||||
/// When `true`, exclude link-local IPs.
|
||||
///
|
||||
/// ## IPv4
|
||||
/// # IPv4
|
||||
///
|
||||
/// The address is `169.254.0.0/16`.
|
||||
///
|
||||
/// This property is defined by [IETF RFC 3927].
|
||||
///
|
||||
/// ## IPv6
|
||||
/// # IPv6
|
||||
///
|
||||
/// The address is a unicast address with link-local scope, as defined in [RFC 4291].
|
||||
/// The address is a unicast address with link-local scope, as defined in
|
||||
/// [RFC 4291].
|
||||
///
|
||||
/// A unicast address has link-local scope if it has the prefix `fe80::/10`, as per [RFC 4291 section 2.4].
|
||||
/// A unicast address has link-local scope if it has the prefix `fe80::/10`,
|
||||
/// as per [RFC 4291 section 2.4].
|
||||
///
|
||||
/// [IETF RFC 3927]: https://tools.ietf.org/html/rfc3927
|
||||
/// [RFC 4291]: https://tools.ietf.org/html/rfc4291
|
||||
|
|
@ -155,15 +165,16 @@ pub struct ClientBuilder {
|
|||
|
||||
/// When `true`, exclude loopback IP addresses.
|
||||
///
|
||||
/// ## IPv4
|
||||
/// # IPv4
|
||||
///
|
||||
/// This is a loopback address (`127.0.0.0/8`).
|
||||
///
|
||||
/// This property is defined by [IETF RFC 1122].
|
||||
///
|
||||
/// ## IPv6
|
||||
/// # IPv6
|
||||
///
|
||||
/// This is the loopback address (`::1`), as defined in [IETF RFC 4291 section 2.5.3].
|
||||
/// This is the loopback address (`::1`), as defined in
|
||||
/// [IETF RFC 4291 section 2.5.3].
|
||||
///
|
||||
/// [IETF RFC 1122]: https://tools.ietf.org/html/rfc1122
|
||||
/// [IETF RFC 4291 section 2.5.3]: https://tools.ietf.org/html/rfc4291#section-2.5.3
|
||||
|
|
@ -173,16 +184,24 @@ pub struct ClientBuilder {
|
|||
exclude_mail: bool,
|
||||
|
||||
/// Maximum number of redirects per request before returning an error.
|
||||
///
|
||||
/// Defaults to [`DEFAULT_MAX_REDIRECTS`].
|
||||
#[builder(default = DEFAULT_MAX_REDIRECTS)]
|
||||
max_redirects: usize,
|
||||
|
||||
/// Maximum number of retries per request before returning an error.
|
||||
///
|
||||
/// Defaults to [`DEFAULT_MAX_RETRIES`].
|
||||
#[builder(default = DEFAULT_MAX_RETRIES)]
|
||||
max_retries: u64,
|
||||
|
||||
/// User-agent used for checking links.
|
||||
///
|
||||
/// *NOTE*: This may be helpful for bypassing certain firewalls.
|
||||
/// Defaults to [`DEFAULT_USER_AGENT`].
|
||||
///
|
||||
/// # Notes
|
||||
///
|
||||
/// This may be helpful for bypassing certain firewalls.
|
||||
// Faking the user agent is necessary for some websites, unfortunately.
|
||||
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
|
||||
#[builder(default_code = "String::from(DEFAULT_USER_AGENT)")]
|
||||
|
|
@ -190,22 +209,25 @@ pub struct ClientBuilder {
|
|||
|
||||
/// When `true`, accept invalid SSL certificates.
|
||||
///
|
||||
/// ## Warning
|
||||
/// # Warning
|
||||
///
|
||||
/// You should think very carefully before using this method. If
|
||||
/// invalid certificates are trusted, any certificate for any site
|
||||
/// will be trusted for use. This includes expired certificates. This
|
||||
/// introduces significant vulnerabilities, and should only be used
|
||||
/// as a last resort.
|
||||
/// You should think very carefully before allowing invalid SSL
|
||||
/// certificates. It will accept any certificate for any site to be trusted
|
||||
/// including expired certificates. This introduces significant
|
||||
/// vulnerabilities, and should only be used as a last resort.
|
||||
// TODO: We should add a warning message in CLI. (Lucius, Jan 2023)
|
||||
allow_insecure: bool,
|
||||
|
||||
/// When non-empty, only links with matched URI schemes are checked.
|
||||
/// Otherwise, this has no effect.
|
||||
/// Set of accepted URL schemes.
|
||||
///
|
||||
/// Only links with matched URI schemes are checked. This has no effect when
|
||||
/// it's empty.
|
||||
schemes: HashSet<String>,
|
||||
|
||||
/// Sets the default [headers] for every request. See also [here].
|
||||
/// Default [headers] for every request.
|
||||
///
|
||||
/// This allows working around validation issues on some websites.
|
||||
/// This allows working around validation issues on some websites. See also
|
||||
/// [here] for usage examples.
|
||||
///
|
||||
/// [headers]: https://docs.rs/http/latest/http/header/struct.HeaderName.html
|
||||
/// [here]: https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.default_headers
|
||||
|
|
@ -220,17 +242,29 @@ pub struct ClientBuilder {
|
|||
/// Unmatched return codes/ status codes are deemed as errors.
|
||||
accepted: Option<HashSet<StatusCode>>,
|
||||
|
||||
/// Response timeout per request.
|
||||
/// Response timeout per request in seconds.
|
||||
timeout: Option<Duration>,
|
||||
|
||||
/// Initial time between retries of failed requests
|
||||
/// Initial time between retries of failed requests.
|
||||
///
|
||||
/// The wait time will increase using an exponential backoff mechanism
|
||||
retry_wait_time: Option<Duration>,
|
||||
/// Defaults to [`DEFAULT_RETRY_WAIT_TIME_SECS`].
|
||||
///
|
||||
/// # Notes
|
||||
///
|
||||
/// For each request, the wait time increases using an exponential backoff
|
||||
/// mechanism. For example, if the value is 1 second, then it waits for
|
||||
/// 2 ^ (N-1) seconds before the N-th retry.
|
||||
///
|
||||
/// This prevents spending too much system resources on slow responders and
|
||||
/// prioritizes other requests.
|
||||
#[builder(default_code = "Duration::from_secs(DEFAULT_RETRY_WAIT_TIME_SECS as u64)")]
|
||||
retry_wait_time: Duration,
|
||||
|
||||
/// Requires using HTTPS when it's available.
|
||||
/// When `true`, requires using HTTPS when it's available.
|
||||
///
|
||||
/// This would treat unencrypted links as errors when HTTPS is avaliable.
|
||||
/// It has no effect on non-HTTP schemes or if the URL doesn't support
|
||||
/// HTTPS.
|
||||
require_https: bool,
|
||||
}
|
||||
|
||||
|
|
@ -248,25 +282,31 @@ impl ClientBuilder {
|
|||
/// # Errors
|
||||
///
|
||||
/// Returns an `Err` if:
|
||||
/// - The user-agent is invalid.
|
||||
/// - The request client cannot be created.
|
||||
/// See [here](https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#errors).
|
||||
/// - The Github client cannot be created.
|
||||
/// - The user-agent contains characters other than ASCII 32-127.
|
||||
/// - The reqwest client cannot be instantiated. This occurs if a TLS
|
||||
/// backend cannot be initialized or the resolver fails to load the system
|
||||
/// configuration. See [here].
|
||||
/// - The Github client cannot be created. Since the implementation also
|
||||
/// uses reqwest under the hood, this errors in the same circumstances as
|
||||
/// the last one.
|
||||
///
|
||||
/// [here]: https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#errors
|
||||
pub fn client(self) -> Result<Client> {
|
||||
let Self {
|
||||
github_token,
|
||||
remaps,
|
||||
includes,
|
||||
excludes,
|
||||
user_agent,
|
||||
schemes,
|
||||
custom_headers: mut headers,
|
||||
method,
|
||||
accepted,
|
||||
..
|
||||
} = self;
|
||||
|
||||
headers.insert(header::USER_AGENT, HeaderValue::from_str(&user_agent)?);
|
||||
if let Some(prev_user_agent) =
|
||||
headers.insert(header::USER_AGENT, HeaderValue::try_from(&user_agent)?)
|
||||
{
|
||||
// TODO: make this configurable according to verbosity (Lucius, Jan 2023)
|
||||
println!(
|
||||
"Found user-agent in headers: {}. Overriding it with {user_agent}.",
|
||||
prev_user_agent.to_str().unwrap_or("<EFBFBD>"),
|
||||
);
|
||||
};
|
||||
|
||||
headers.insert(
|
||||
header::TRANSFER_ENCODING,
|
||||
|
|
@ -288,20 +328,22 @@ impl ClientBuilder {
|
|||
.build()
|
||||
.map_err(ErrorKind::NetworkRequest)?;
|
||||
|
||||
let github_client = match github_token.as_ref().map(ExposeSecret::expose_secret) {
|
||||
let github_client = match self.github_token.as_ref().map(ExposeSecret::expose_secret) {
|
||||
Some(token) if !token.is_empty() => Some(
|
||||
Octocrab::builder()
|
||||
.personal_token(token.clone())
|
||||
.build()
|
||||
// this is essentially the same reqwest::ClientBuilder::build error
|
||||
// see https://docs.rs/octocrab/0.18.1/src/octocrab/lib.rs.html#360-364
|
||||
.map_err(ErrorKind::BuildGithubClient)?,
|
||||
),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let filter = Filter {
|
||||
includes: includes.map(|regex| Includes { regex }),
|
||||
excludes: excludes.map(|regex| Excludes { regex }),
|
||||
schemes,
|
||||
includes: self.includes.map(|regex| Includes { regex }),
|
||||
excludes: self.excludes.map(|regex| Excludes { regex }),
|
||||
schemes: self.schemes,
|
||||
// exclude_all_private option turns on all "private" excludes,
|
||||
// including private IPs, link-local IPs and loopback IPs
|
||||
exclude_private_ips: self.exclude_all_private || self.exclude_private_ips,
|
||||
|
|
@ -310,21 +352,17 @@ impl ClientBuilder {
|
|||
exclude_mail: self.exclude_mail,
|
||||
};
|
||||
|
||||
let retry_wait_time = self
|
||||
.retry_wait_time
|
||||
.unwrap_or_else(|| Duration::from_secs(DEFAULT_RETRY_WAIT_TIME_SECS as u64));
|
||||
|
||||
let quirks = Quirks::default();
|
||||
|
||||
Ok(Client {
|
||||
reqwest_client,
|
||||
github_client,
|
||||
remaps,
|
||||
remaps: self.remaps,
|
||||
filter,
|
||||
max_retries: self.max_retries,
|
||||
retry_wait_time,
|
||||
method,
|
||||
accepted,
|
||||
retry_wait_time: self.retry_wait_time,
|
||||
method: self.method,
|
||||
accepted: self.accepted,
|
||||
require_https: self.require_https,
|
||||
quirks,
|
||||
})
|
||||
|
|
@ -333,16 +371,17 @@ impl ClientBuilder {
|
|||
|
||||
/// Handles incoming requests and returns responses.
|
||||
///
|
||||
/// See [`ClientBuilder`] which contains sane defaults for all configuration options.
|
||||
/// See [`ClientBuilder`] which contains sane defaults for all configuration
|
||||
/// options.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Client {
|
||||
/// Underlying `reqwest` client instance that handles the HTTP requests.
|
||||
reqwest_client: reqwest::Client,
|
||||
|
||||
/// Github client.
|
||||
/// Optional GitHub client that handles communications with GitHub.
|
||||
github_client: Option<Octocrab>,
|
||||
|
||||
/// Optional remapping rules for URIs matching pattern
|
||||
/// Optional remapping rules for URIs matching pattern.
|
||||
remaps: Option<Remaps>,
|
||||
|
||||
/// Rules to decided whether each link would be checked or ignored.
|
||||
|
|
@ -351,7 +390,8 @@ pub struct Client {
|
|||
/// Maximum number of retries per request before returning an error.
|
||||
max_retries: u64,
|
||||
|
||||
/// Initial time between retries of failed requests
|
||||
/// Initial wait time between retries of failed requests. This doubles after
|
||||
/// each failure.
|
||||
retry_wait_time: Duration,
|
||||
|
||||
/// HTTP method used for requests, e.g. `GET` or `HEAD`.
|
||||
|
|
@ -374,38 +414,48 @@ pub struct Client {
|
|||
}
|
||||
|
||||
impl Client {
|
||||
/// Check a single request
|
||||
/// Check a single request.
|
||||
///
|
||||
/// `request` can be either a [`Request`] or a type that can be converted
|
||||
/// into it. In any case, it must represent a valid URI.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This returns an `Err` if
|
||||
/// - `request` is invalid.
|
||||
/// - The URI of the request is invalid.
|
||||
/// - Encrypted connection for a HTTP URL is available but unused.
|
||||
/// (Only checked when `Client::require_https` is `true`.)
|
||||
/// Returns an `Err` if:
|
||||
/// - `request` does not represent a valid URI.
|
||||
/// - Encrypted connection for a HTTP URL is available but unused. (Only
|
||||
/// checked when `Client::require_https` is `true`.)
|
||||
#[allow(clippy::missing_panics_doc)]
|
||||
pub async fn check<T, E>(&self, request: T) -> Result<Response>
|
||||
where
|
||||
Request: TryFrom<T, Error = E>,
|
||||
ErrorKind: From<E>,
|
||||
{
|
||||
let Request { uri, source, .. } = request.try_into()?;
|
||||
let Request {
|
||||
ref mut uri,
|
||||
source,
|
||||
..
|
||||
} = request.try_into()?;
|
||||
|
||||
let uri = self.remap(uri)?;
|
||||
self.remap(uri);
|
||||
|
||||
// TODO: Allow filtering based on element and attribute
|
||||
let status = if self.filter.is_excluded(&uri) {
|
||||
let status = if self.is_excluded(uri) {
|
||||
Status::Excluded
|
||||
} else if uri.is_file() {
|
||||
self.check_file(&uri)
|
||||
self.check_file(uri)
|
||||
} else if uri.is_mail() {
|
||||
self.check_mail(&uri).await
|
||||
self.check_mail(uri).await
|
||||
} else {
|
||||
match self.check_website(&uri).await {
|
||||
match self.check_website(uri).await {
|
||||
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
|
||||
let mut https_uri = uri.clone();
|
||||
https_uri
|
||||
.set_scheme("https")
|
||||
.map_err(|_| ErrorKind::InvalidURI(uri.clone()))?;
|
||||
{
|
||||
// here `uri` must be valid, otherwise `check_website` won't
|
||||
// return `Ok`, thus `set_scheme` won't fail
|
||||
debug_assert!(!https_uri.url.cannot_be_a_base());
|
||||
https_uri.set_scheme("https").unwrap();
|
||||
}
|
||||
if self.check_website(&https_uri).await.is_success() {
|
||||
Status::Error(ErrorKind::InsecureURL(https_uri))
|
||||
} else {
|
||||
|
|
@ -419,15 +469,11 @@ impl Client {
|
|||
Ok(Response::new(uri.clone(), status, source))
|
||||
}
|
||||
|
||||
/// Remap URI using the client-defined remap patterns
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if the remapping value is not a URI
|
||||
pub fn remap(&self, uri: Uri) -> Result<Uri> {
|
||||
match self.remaps {
|
||||
Some(ref remaps) => remaps.remap(uri),
|
||||
None => Ok(uri),
|
||||
/// Remap `uri` using the client-defined remapping rules.
|
||||
pub fn remap(&self, uri: &mut Uri) {
|
||||
// TODO: this should be logged (Lucius, Jan 2023)
|
||||
if let Some(ref remaps) = self.remaps {
|
||||
remaps.remap(&mut uri.url);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -442,7 +488,7 @@ impl Client {
|
|||
/// Unsupported schemes will be ignored
|
||||
pub async fn check_website(&self, uri: &Uri) -> Status {
|
||||
// Workaround for upstream reqwest panic
|
||||
if invalid(&uri.url) {
|
||||
if validate_url(&uri.url) {
|
||||
if matches!(uri.scheme(), "http" | "https") {
|
||||
// This is a truly invalid URI with a known scheme.
|
||||
// If we pass that to reqwest it would panic.
|
||||
|
|
@ -456,21 +502,22 @@ impl Client {
|
|||
}
|
||||
|
||||
let mut retries: u64 = 0;
|
||||
let mut wait = self.retry_wait_time;
|
||||
let mut wait_time = self.retry_wait_time;
|
||||
|
||||
let mut status = self.check_default(uri).await;
|
||||
while retries < self.max_retries {
|
||||
if status.is_success() {
|
||||
return status;
|
||||
}
|
||||
sleep(wait).await;
|
||||
sleep(wait_time).await;
|
||||
retries += 1;
|
||||
wait *= 2;
|
||||
wait_time = wait_time.saturating_mul(2);
|
||||
status = self.check_default(uri).await;
|
||||
}
|
||||
|
||||
// Pull out the heavy machinery in case of a failed normal request.
|
||||
// This could be a GitHub URL and we ran into the rate limiter.
|
||||
// TODO: We should first try to parse the URI as GitHub URI first (Lucius, Jan 2023)
|
||||
if let Ok(github_uri) = GithubUri::try_from(uri) {
|
||||
let status = self.check_github(github_uri).await;
|
||||
// Only return Github status in case of success
|
||||
|
|
@ -494,10 +541,7 @@ impl Client {
|
|||
/// A better approach would be to download the file through the API or
|
||||
/// clone the repo, but we chose the pragmatic approach.
|
||||
async fn check_github(&self, uri: GithubUri) -> Status {
|
||||
let client = match &self.github_client {
|
||||
Some(client) => client,
|
||||
None => return ErrorKind::MissingGitHubToken.into(),
|
||||
};
|
||||
let Some(client) = &self.github_client else { return ErrorKind::MissingGitHubToken.into() };
|
||||
let repo = match client.repos(&uri.owner, &uri.repo).get().await {
|
||||
Ok(repo) => repo,
|
||||
Err(e) => return ErrorKind::GithubRequest(e).into(),
|
||||
|
|
@ -572,19 +616,21 @@ impl Client {
|
|||
// got merged.
|
||||
// It is exactly the same check that reqwest runs internally, but unfortunately
|
||||
// it `unwrap`s (and panics!) instead of returning an error, which we could handle.
|
||||
fn invalid(url: &Url) -> bool {
|
||||
url.as_str().parse::<http::Uri>().is_err()
|
||||
fn validate_url(url: &Url) -> bool {
|
||||
http::Uri::try_from(url.as_str()).is_err()
|
||||
}
|
||||
|
||||
/// A convenience function to check a single URI.
|
||||
///
|
||||
/// This provides the simplest link check utility without having to create a [`Client`].
|
||||
/// For more complex scenarios, see documentation of [`ClientBuilder`] instead.
|
||||
/// This provides the simplest link check utility without having to create a
|
||||
/// [`Client`]. For more complex scenarios, see documentation of
|
||||
/// [`ClientBuilder`] instead.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an `Err` if:
|
||||
/// - The request client cannot be built (see [`ClientBuilder::client`] for failure cases).
|
||||
/// - The request client cannot be built (see [`ClientBuilder::client`] for
|
||||
/// failure cases).
|
||||
/// - The request cannot be checked (see [`Client::check`] for failure cases).
|
||||
pub async fn check<T, E>(request: T) -> Result<Response>
|
||||
where
|
||||
|
|
@ -633,9 +679,9 @@ mod tests {
|
|||
|
||||
assert!(res.status().is_failure());
|
||||
|
||||
// on slow connections, this might take a bit longer than nominal backed-off timeout (7 secs)
|
||||
assert!(end.as_secs() >= 7);
|
||||
assert!(end.as_secs() <= 8);
|
||||
// on slow connections, this might take a bit longer than nominal
|
||||
// backed-off timeout (7 secs)
|
||||
assert!((7..=8).contains(&end.as_secs()));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
|||
|
|
@ -60,8 +60,6 @@ mod types;
|
|||
/// Functionality to extract URIs from inputs
|
||||
pub mod extract;
|
||||
|
||||
/// Remapping rules which allow to map URIs matching a pattern to a different
|
||||
/// URI. Use in moderation as there are no safety- or performance guarantees.
|
||||
pub mod remap;
|
||||
|
||||
/// Filters are a way to define behavior when encountering
|
||||
|
|
|
|||
|
|
@ -1,20 +1,39 @@
|
|||
//! Remapping rules which allow to map URLs matching a pattern to a different
|
||||
//! URL.
|
||||
//!
|
||||
//! # Notes
|
||||
//! Use in moderation as there are no sanity or performance guarantees.
|
||||
//!
|
||||
//! - There is no constraint on remapping rules upon instantiation or during
|
||||
//! remapping. In particular, rules are checked sequentially so later rules
|
||||
//! might contradict with earlier ones if they both match a URL.
|
||||
//! - A large rule set has a performance impact because the client needs to
|
||||
//! match every link against all rules.
|
||||
|
||||
// Notes on terminology:
|
||||
// The major difference between URI (Uniform Resource Identifier) and
|
||||
// URL (Uniform Resource Locator) is that the former is an indentifier for
|
||||
// resources and the latter is a locator.
|
||||
// We are not interested in differentiating resources by names and the purpose of
|
||||
// remapping is to provide an alternative **location** in certain
|
||||
// circumanstances. Thus the documentation should be about remapping URLs
|
||||
// (locations), not remapping URIs (identities).
|
||||
|
||||
use std::ops::Index;
|
||||
|
||||
use crate::{ErrorKind, Result};
|
||||
use regex::Regex;
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::Uri;
|
||||
use crate::ErrorKind;
|
||||
|
||||
/// Remaps allow mapping from a URI pattern to a specified URI
|
||||
/// Rules that remap matching URL patterns.
|
||||
///
|
||||
/// Some use-cases are
|
||||
/// - Testing URIs prior to production deployment
|
||||
/// - Testing URIs behind a proxy
|
||||
/// Some use-cases are:
|
||||
/// - Testing URLs prior to production deployment.
|
||||
/// - Testing URLs behind a proxy.
|
||||
///
|
||||
/// Be careful when using this feature because checking every link against a
|
||||
/// large set of regular expressions has a performance impact. Also there are no
|
||||
/// constraints on the URI mapping, so the rules might contradict each other.
|
||||
/// # Notes
|
||||
/// See module level documentation of usage notes.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Remaps(Vec<(Regex, Url)>);
|
||||
|
||||
|
|
@ -25,28 +44,28 @@ impl Remaps {
|
|||
Self(patterns)
|
||||
}
|
||||
|
||||
/// Remap URI using the client-defined remap patterns
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if the remapping value is not a valid URI
|
||||
pub fn remap(&self, uri: Uri) -> Result<Uri> {
|
||||
let mut uri = uri;
|
||||
for (pattern, new_uri) in &self.0 {
|
||||
if pattern.is_match(uri.as_str()) {
|
||||
uri = Uri::try_from(new_uri.clone())?;
|
||||
}
|
||||
}
|
||||
Ok(uri)
|
||||
/// Returns an iterator over the rules.
|
||||
// `iter_mut` is deliberately avoided.
|
||||
pub fn iter(&self) -> std::slice::Iter<(Regex, Url)> {
|
||||
self.0.iter()
|
||||
}
|
||||
|
||||
/// Returns `true` if there are no remappings defined.
|
||||
/// Remap URL against remapping rules.
|
||||
pub fn remap(&self, url: &mut Url) {
|
||||
for (pattern, new_url) in self {
|
||||
if pattern.is_match(url.as_str()) {
|
||||
*url = new_url.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if there is no remapping rule defined.
|
||||
#[must_use]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
/// Get the number of defined remap rules
|
||||
/// Get the number of remapping rules.
|
||||
#[must_use]
|
||||
pub fn len(&self) -> usize {
|
||||
self.0.len()
|
||||
|
|
@ -64,13 +83,24 @@ impl Index<usize> for Remaps {
|
|||
impl TryFrom<&[String]> for Remaps {
|
||||
type Error = ErrorKind;
|
||||
|
||||
/// Try to convert a slice of `String`s to remapping rules.
|
||||
///
|
||||
/// Each string should contain a Regex pattern and a URL, separated by
|
||||
/// whitespaces.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an `Err` if:
|
||||
/// - Any string in the slice is not of the form `REGEX URL`.
|
||||
/// - REGEX is not a valid regular expression.
|
||||
/// - URL is not a valid URL.
|
||||
fn try_from(remaps: &[String]) -> std::result::Result<Self, Self::Error> {
|
||||
let mut parsed = Vec::new();
|
||||
|
||||
for remap in remaps {
|
||||
let params: Vec<_> = remap.split_whitespace().collect();
|
||||
if params.len() != 2 {
|
||||
return Err(ErrorKind::InvalidUriRemap(remap.to_string()));
|
||||
return Err(ErrorKind::InvalidUrlRemap(remap.to_string()));
|
||||
}
|
||||
|
||||
let pattern = Regex::new(params[0])?;
|
||||
|
|
@ -83,6 +113,18 @@ impl TryFrom<&[String]> for Remaps {
|
|||
}
|
||||
}
|
||||
|
||||
// Implementation for mutable iterator and moving iterator are deliberately
|
||||
// avoided
|
||||
impl<'a> IntoIterator for &'a Remaps {
|
||||
type Item = &'a (Regex, Url);
|
||||
|
||||
type IntoIter = std::slice::Iter<'a, (Regex, Url)>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.0.iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
@ -90,37 +132,37 @@ mod tests {
|
|||
#[test]
|
||||
fn test_remap() {
|
||||
let pattern = Regex::new("https://example.com").unwrap();
|
||||
let uri = Uri::try_from("http://127.0.0.1:8080").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, uri.clone().url)]);
|
||||
let new_url = Url::try_from("http://127.0.0.1:8080").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, new_url.clone())]);
|
||||
|
||||
let input = Uri::try_from("https://example.com").unwrap();
|
||||
let remapped = remaps.remap(input).unwrap();
|
||||
let mut input = Url::try_from("https://example.com").unwrap();
|
||||
remaps.remap(&mut input);
|
||||
|
||||
assert_eq!(remapped, uri);
|
||||
assert_eq!(input, new_url);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_path() {
|
||||
let pattern = Regex::new("../../issues").unwrap();
|
||||
let uri = Uri::try_from("https://example.com").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, uri.clone().url)]);
|
||||
let new_url = Url::try_from("https://example.com").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, new_url.clone())]);
|
||||
|
||||
let input = Uri::try_from("file://../../issues").unwrap();
|
||||
let remapped = remaps.remap(input).unwrap();
|
||||
let mut input = Url::try_from("file://../../issues").unwrap();
|
||||
remaps.remap(&mut input);
|
||||
|
||||
assert_eq!(remapped, uri);
|
||||
assert_eq!(input, new_url);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_skip() {
|
||||
let pattern = Regex::new("https://example.com").unwrap();
|
||||
let uri = Uri::try_from("http://127.0.0.1:8080").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, uri.url)]);
|
||||
let new_url = Url::try_from("http://127.0.0.1:8080").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, new_url)]);
|
||||
|
||||
let input = Uri::try_from("https://unrelated.example.com").unwrap();
|
||||
let remapped = remaps.remap(input.clone()).unwrap();
|
||||
let mut input = Url::try_from("https://unrelated.example.com").unwrap();
|
||||
remaps.remap(&mut input);
|
||||
|
||||
// URI was not modified
|
||||
assert_eq!(remapped, input);
|
||||
// URL was not modified
|
||||
assert_eq!(input, input);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -67,8 +67,8 @@ pub enum ErrorKind {
|
|||
#[error("Error with base dir `{0}` : {1}")]
|
||||
InvalidBase(String, String),
|
||||
/// The given input can not be parsed into a valid URI remapping
|
||||
#[error("Error handling URI remap expression. Cannot parse into URI remapping: `{0}`")]
|
||||
InvalidUriRemap(String),
|
||||
#[error("Cannot parse into URI remapping, must be a Regex pattern and a URL separated by whitespaces: `{0}`")]
|
||||
InvalidUrlRemap(String),
|
||||
/// The given path does not resolve to a valid file
|
||||
#[error("Cannot find local file {0}")]
|
||||
FileNotFound(PathBuf),
|
||||
|
|
@ -201,7 +201,7 @@ impl Hash for ErrorKind {
|
|||
Self::UnreachableEmailAddress(u, ..) => u.hash(state),
|
||||
Self::InsecureURL(u, ..) => u.hash(state),
|
||||
Self::InvalidBase(base, e) => (base, e).hash(state),
|
||||
Self::InvalidUriRemap(remap) => (remap).hash(state),
|
||||
Self::InvalidUrlRemap(remap) => (remap).hash(state),
|
||||
Self::InvalidHeader(e) => e.to_string().hash(state),
|
||||
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
|
||||
Self::InvalidStatusCode(c) => c.hash(state),
|
||||
|
|
|
|||
Loading…
Reference in a new issue