From 05bd3817eeb0d00109823f00c71ffdc2168902b1 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 24 Feb 2022 12:24:57 +0100 Subject: [PATCH] Make retry wait time configurable (#525) --- README.md | 5 +++- lychee-bin/src/client.rs | 6 +++-- lychee-bin/src/options.rs | 19 +++++++++++---- lychee-bin/src/parse.rs | 4 ++-- lychee-lib/src/client.rs | 49 +++++++++++++++++++++++++++++++++++---- lychee-lib/src/lib.rs | 2 +- 6 files changed, 69 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index cc5b806..c5b7765 100644 --- a/README.md +++ b/README.md @@ -246,10 +246,13 @@ OPTIONS: --max-retries Maximum number of retries per request [default: 3] -X, --method Request method [default: get] -o, --output Output file of status report + -r, --retry-wait-time Minimum wait time in seconds between retries of failed requests [default: + 1] -s, --scheme ... Only test links with the given schemes (e.g. http and https) -T, --threads Number of threads to utilize. Defaults to number of cores available to the system - -t, --timeout Website timeout from connect to response finished [default: 20] + -t, --timeout Website timeout in seconds from connect to response finished [default: + 20] -u, --user-agent User agent [default: lychee/0.8.2] ARGS: diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 2d40948..c93f295 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -1,5 +1,5 @@ use crate::options::Config; -use crate::parse::{parse_basic_auth, parse_headers, parse_statuscodes, parse_timeout}; +use crate::parse::{parse_basic_auth, parse_duration_secs, parse_headers, parse_statuscodes}; use anyhow::{Context, Result}; use headers::HeaderMapExt; use lychee_lib::{Client, ClientBuilder}; @@ -15,7 +15,8 @@ pub(crate) fn create(cfg: &Config) -> Result { } let accepted = cfg.accept.clone().and_then(|a| parse_statuscodes(&a).ok()); - let timeout = parse_timeout(cfg.timeout); + let timeout = parse_duration_secs(cfg.timeout); + let retry_wait_time = 
parse_duration_secs(cfg.retry_wait_time); let method: reqwest::Method = reqwest::Method::from_str(&cfg.method.to_uppercase())?; let include = RegexSet::new(&cfg.include)?; let exclude = RegexSet::new(&cfg.exclude)?; @@ -41,6 +42,7 @@ pub(crate) fn create(cfg: &Config) -> Result { .custom_headers(headers) .method(method) .timeout(timeout) + .retry_wait_time(retry_wait_time) .github_token(cfg.github_token.clone()) .schemes(HashSet::from_iter(schemes)) .accepted(accepted) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index c8cf625..f4a08ac 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -3,7 +3,8 @@ use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr, time use anyhow::{anyhow, Error, Result}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT, + Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, + DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, }; use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; @@ -33,7 +34,8 @@ expressions supported; one pattern per line. Automatically excludes patterns from `{}` if file exists", LYCHEE_IGNORE_FILE, ); -const TIMEOUT_STR: &str = concatcp!(DEFAULT_TIMEOUT); +const TIMEOUT_STR: &str = concatcp!(DEFAULT_TIMEOUT_SECS); +const RETRY_WAIT_TIME_STR: &str = concatcp!(DEFAULT_RETRY_WAIT_TIME_SECS); #[derive(Debug, Deserialize)] pub(crate) enum Format { @@ -81,7 +83,8 @@ default_function! 
{ max_concurrency: usize = DEFAULT_MAX_CONCURRENCY; max_cache_age: Duration = humantime::parse_duration(DEFAULT_MAX_CACHE_AGE).unwrap(); user_agent: String = DEFAULT_USER_AGENT.to_string(); - timeout: usize = DEFAULT_TIMEOUT; + timeout: usize = DEFAULT_TIMEOUT_SECS; + retry_wait_time: usize = DEFAULT_RETRY_WAIT_TIME_SECS; method: String = DEFAULT_METHOD.to_string(); } @@ -260,11 +263,16 @@ pub(crate) struct Config { #[serde(default)] pub(crate) accept: Option, - /// Website timeout from connect to response finished + /// Website timeout in seconds from connect to response finished #[structopt(short, long, default_value = &TIMEOUT_STR)] #[serde(default = "timeout")] pub(crate) timeout: usize, + /// Minimum wait time in seconds between retries of failed requests + #[structopt(short, long, default_value = &RETRY_WAIT_TIME_STR)] + #[serde(default = "retry_wait_time")] + pub(crate) retry_wait_time: usize, + /// Request method // Using `-X` as a short param similar to curl #[structopt(short = "X", long, default_value = DEFAULT_METHOD)] @@ -361,7 +369,8 @@ impl Config { exclude_mail: false; headers: Vec::::new(); accept: None; - timeout: DEFAULT_TIMEOUT; + timeout: DEFAULT_TIMEOUT_SECS; + retry_wait_time: DEFAULT_RETRY_WAIT_TIME_SECS; method: DEFAULT_METHOD; base: None; basic_auth: None; diff --git a/lychee-bin/src/parse.rs b/lychee-bin/src/parse.rs index 344b0b0..9f368f5 100644 --- a/lychee-bin/src/parse.rs +++ b/lychee-bin/src/parse.rs @@ -14,8 +14,8 @@ fn read_header(input: &str) -> Result<(String, String)> { Ok((elements[0].into(), elements[1].into())) } -pub(crate) const fn parse_timeout(timeout: usize) -> Duration { - Duration::from_secs(timeout as u64) +pub(crate) const fn parse_duration_secs(secs: usize) -> Duration { + Duration::from_secs(secs as u64) } pub(crate) fn parse_headers>(headers: &[T]) -> Result { diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 13eb87d..66d6cb4 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs 
@@ -39,9 +39,9 @@ pub const DEFAULT_MAX_REDIRECTS: usize = 5; /// Default number of retries before a request is deemed as failed, 3. pub const DEFAULT_MAX_RETRIES: u64 = 3; /// Default wait time in seconds between requests, 1. -pub const DEFAULT_RETRY_WAIT_TIME: u64 = 1; +pub const DEFAULT_RETRY_WAIT_TIME_SECS: usize = 1; /// Default timeout in seconds before a request is deemed as failed, 20. -pub const DEFAULT_TIMEOUT: usize = 20; +pub const DEFAULT_TIMEOUT_SECS: usize = 20; /// Default user agent, `lychee-`. pub const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION")); @@ -67,15 +67,18 @@ pub struct ClientBuilder { /// As of Feb 2022, it's 60 per hour without GitHub token v.s. /// 5000 per hour with token. github_token: Option, + /// Links matching this set of regular expressions are **always** checked. /// /// This has higher precedence over [`ClientBuilder::excludes`], **but** /// has lower precedence over any other `exclude_` fields or /// [`ClientBuilder::schemes`] below. includes: Option, + /// Links matching this set of regular expressions are ignored, **except** /// when a link also matches against [`ClientBuilder::includes`]. excludes: Option, + /// When `true`, exclude all private network addresses. /// /// This effectively turns on the following fields: @@ -83,6 +86,7 @@ pub struct ClientBuilder { /// - [`ClientBuilder::exclude_link_local_ips`] /// - [`ClientBuilder::exclude_loopback_ips`] exclude_all_private: bool, + /// When `true`, exclude private IP addresses. /// /// ## IPv4 @@ -109,6 +113,7 @@ pub struct ClientBuilder { /// [IETF RFC 4291]: https://tools.ietf.org/html/rfc4291 /// [IETF RFC 3879]: https://tools.ietf.org/html/rfc3879 exclude_private_ips: bool, + /// When `true`, exclude link-local IPs. 
/// /// ## IPv4 @@ -127,6 +132,7 @@ pub struct ClientBuilder { /// [RFC 4291]: https://tools.ietf.org/html/rfc4291 /// [RFC 4291 section 2.4]: https://tools.ietf.org/html/rfc4291#section-2.4 exclude_link_local_ips: bool, + /// When `true`, exclude loopback IP addresses. /// /// ## IPv4 @@ -142,14 +148,18 @@ pub struct ClientBuilder { /// [IETF RFC 1122]: https://tools.ietf.org/html/rfc1122 /// [IETF RFC 4291 section 2.5.3]: https://tools.ietf.org/html/rfc4291#section-2.5.3 exclude_loopback_ips: bool, + /// When `true`, don't check mail addresses. exclude_mail: bool, + /// Maximum number of redirects per request before returning an error. #[builder(default = DEFAULT_MAX_REDIRECTS)] max_redirects: usize, + /// Maximum number of retries per request before returning an error. #[builder(default = DEFAULT_MAX_RETRIES)] max_retries: u64, + /// User-agent used for checking links. /// /// *NOTE*: This may be helpful for bypassing certain firewalls. @@ -157,6 +167,7 @@ pub struct ClientBuilder { // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). #[builder(default_code = "String::from(DEFAULT_USER_AGENT)")] user_agent: String, + /// When `true`, accept invalid SSL certificates. /// /// ## Warning @@ -167,9 +178,11 @@ pub struct ClientBuilder { /// introduces significant vulnerabilities, and should only be used /// as a last resort. allow_insecure: bool, + /// When non-empty, only links with matched URI schemes are checked. /// Otherwise, this has no effect. schemes: HashSet, + /// Sets the default [headers] for every request. See also [here]. /// /// This allows working around validation issues on some websites. @@ -177,15 +190,24 @@ pub struct ClientBuilder { /// [headers]: https://docs.rs/http/latest/http/header/struct.HeaderName.html /// [here]: https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.default_headers custom_headers: HeaderMap, + /// HTTP method used for requests, e.g. `GET` or `HEAD`. 
#[builder(default = reqwest::Method::GET)] method: reqwest::Method, + /// Set of accepted return codes / status codes. /// /// Unmatched return codes/ status codes are deemed as errors. accepted: Option>, + /// Response timeout per request. timeout: Option, + + /// Initial time between retries of failed requests + /// + /// The wait time will increase using an exponential backoff mechanism + retry_wait_time: Option, + /// Requires using HTTPS when it's available. /// /// This would treat unencrypted links as errors when HTTPS is avaliable. @@ -265,6 +287,10 @@ impl ClientBuilder { exclude_mail: self.exclude_mail, }; + let retry_wait_time = self + .retry_wait_time + .unwrap_or_else(|| Duration::from_secs(DEFAULT_RETRY_WAIT_TIME_SECS as u64)); + let quirks = Quirks::default(); Ok(Client { @@ -272,6 +298,7 @@ impl ClientBuilder { github_client, filter, max_retries: self.max_retries, + retry_wait_time, method, accepted, require_https: self.require_https, @@ -287,22 +314,34 @@ impl ClientBuilder { pub struct Client { /// Underlying `reqwest` client instance that handles the HTTP requests. reqwest_client: reqwest::Client, + /// Github client. github_client: Option, + /// Rules to decided whether each link would be checked or ignored. filter: Filter, + /// Maximum number of retries per request before returning an error. max_retries: u64, + + /// Initial time between retries of failed requests + retry_wait_time: Duration, + /// HTTP method used for requests, e.g. `GET` or `HEAD`. + /// + /// The same method will be used for all links. method: reqwest::Method, + /// Set of accepted return codes / status codes. /// /// Unmatched return codes/ status codes are deemed as errors. accepted: Option>, + /// Requires using HTTPS when it's available. /// - /// This would treat unecrypted links as errors when HTTPS is avaliable. + /// This would treat unencrypted links as errors when HTTPS is available.
require_https: bool, + /// Override behaviors for certain known issues with special URIs. quirks: Quirks, } @@ -362,14 +401,14 @@ impl Client { /// Here `uri` must has either `http` or `https` scheme. pub async fn check_website(&self, uri: &Uri) -> Status { let mut retries: u64 = 0; - let mut wait = DEFAULT_RETRY_WAIT_TIME; + let mut wait = self.retry_wait_time; let mut status = self.check_default(uri).await; while retries < self.max_retries { if status.is_success() { return status; } - sleep(Duration::from_secs(wait)).await; + sleep(wait).await; retries += 1; wait *= 2; status = self.check_default(uri).await; diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 6d60294..a066053 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -79,7 +79,7 @@ pub use crate::{ // Constants get exposed so that the CLI can use the same defaults as the library client::{ check, Client, ClientBuilder, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, - DEFAULT_RETRY_WAIT_TIME, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT, + DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, }, collector::Collector, filter::{Excludes, Filter, Includes},