Make retry wait time configurable (#525)

This commit is contained in:
Matthias 2022-02-24 12:24:57 +01:00 committed by GitHub
parent a5a56006dd
commit 05bd3817ee
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 69 additions and 16 deletions

View file

@ -246,10 +246,13 @@ OPTIONS:
--max-retries <max-retries> Maximum number of retries per request [default: 3]
-X, --method <method> Request method [default: get]
-o, --output <output> Output file of status report
-r, --retry-wait-time <retry-wait-time> Minimum wait time in seconds between retries of failed requests [default:
1]
-s, --scheme <scheme>... Only test links with the given schemes (e.g. http and https)
-T, --threads <threads> Number of threads to utilize. Defaults to number of cores available to
the system
-t, --timeout <timeout> Website timeout from connect to response finished [default: 20]
-t, --timeout <timeout> Website timeout in seconds from connect to response finished [default:
20]
-u, --user-agent <user-agent> User agent [default: lychee/0.8.2]
ARGS:

View file

@ -1,5 +1,5 @@
use crate::options::Config;
use crate::parse::{parse_basic_auth, parse_headers, parse_statuscodes, parse_timeout};
use crate::parse::{parse_basic_auth, parse_duration_secs, parse_headers, parse_statuscodes};
use anyhow::{Context, Result};
use headers::HeaderMapExt;
use lychee_lib::{Client, ClientBuilder};
@ -15,7 +15,8 @@ pub(crate) fn create(cfg: &Config) -> Result<Client> {
}
let accepted = cfg.accept.clone().and_then(|a| parse_statuscodes(&a).ok());
let timeout = parse_timeout(cfg.timeout);
let timeout = parse_duration_secs(cfg.timeout);
let retry_wait_time = parse_duration_secs(cfg.retry_wait_time);
let method: reqwest::Method = reqwest::Method::from_str(&cfg.method.to_uppercase())?;
let include = RegexSet::new(&cfg.include)?;
let exclude = RegexSet::new(&cfg.exclude)?;
@ -41,6 +42,7 @@ pub(crate) fn create(cfg: &Config) -> Result<Client> {
.custom_headers(headers)
.method(method)
.timeout(timeout)
.retry_wait_time(retry_wait_time)
.github_token(cfg.github_token.clone())
.schemes(HashSet::from_iter(schemes))
.accepted(accepted)

View file

@ -3,7 +3,8 @@ use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr, time
use anyhow::{anyhow, Error, Result};
use const_format::{concatcp, formatcp};
use lychee_lib::{
Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT,
Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS,
DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
};
use secrecy::{ExposeSecret, SecretString};
use serde::Deserialize;
@ -33,7 +34,8 @@ expressions supported; one pattern per line. Automatically excludes
patterns from `{}` if file exists",
LYCHEE_IGNORE_FILE,
);
const TIMEOUT_STR: &str = concatcp!(DEFAULT_TIMEOUT);
const TIMEOUT_STR: &str = concatcp!(DEFAULT_TIMEOUT_SECS);
const RETRY_WAIT_TIME_STR: &str = concatcp!(DEFAULT_RETRY_WAIT_TIME_SECS);
#[derive(Debug, Deserialize)]
pub(crate) enum Format {
@ -81,7 +83,8 @@ default_function! {
max_concurrency: usize = DEFAULT_MAX_CONCURRENCY;
max_cache_age: Duration = humantime::parse_duration(DEFAULT_MAX_CACHE_AGE).unwrap();
user_agent: String = DEFAULT_USER_AGENT.to_string();
timeout: usize = DEFAULT_TIMEOUT;
timeout: usize = DEFAULT_TIMEOUT_SECS;
retry_wait_time: usize = DEFAULT_RETRY_WAIT_TIME_SECS;
method: String = DEFAULT_METHOD.to_string();
}
@ -260,11 +263,16 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) accept: Option<String>,
/// Website timeout from connect to response finished
/// Website timeout in seconds from connect to response finished
#[structopt(short, long, default_value = &TIMEOUT_STR)]
#[serde(default = "timeout")]
pub(crate) timeout: usize,
/// Minimum wait time in seconds between retries of failed requests
#[structopt(short, long, default_value = &RETRY_WAIT_TIME_STR)]
#[serde(default = "retry_wait_time")]
pub(crate) retry_wait_time: usize,
/// Request method
// Using `-X` as a short param similar to curl
#[structopt(short = "X", long, default_value = DEFAULT_METHOD)]
@ -361,7 +369,8 @@ impl Config {
exclude_mail: false;
headers: Vec::<String>::new();
accept: None;
timeout: DEFAULT_TIMEOUT;
timeout: DEFAULT_TIMEOUT_SECS;
retry_wait_time: DEFAULT_RETRY_WAIT_TIME_SECS;
method: DEFAULT_METHOD;
base: None;
basic_auth: None;

View file

@ -14,8 +14,8 @@ fn read_header(input: &str) -> Result<(String, String)> {
Ok((elements[0].into(), elements[1].into()))
}
pub(crate) const fn parse_timeout(timeout: usize) -> Duration {
Duration::from_secs(timeout as u64)
pub(crate) const fn parse_duration_secs(secs: usize) -> Duration {
Duration::from_secs(secs as u64)
}
pub(crate) fn parse_headers<T: AsRef<str>>(headers: &[T]) -> Result<HeaderMap> {

View file

@ -39,9 +39,9 @@ pub const DEFAULT_MAX_REDIRECTS: usize = 5;
/// Default number of retries before a request is deemed as failed, 3.
pub const DEFAULT_MAX_RETRIES: u64 = 3;
/// Default wait time in seconds between requests, 1.
pub const DEFAULT_RETRY_WAIT_TIME: u64 = 1;
pub const DEFAULT_RETRY_WAIT_TIME_SECS: usize = 1;
/// Default timeout in seconds before a request is deemed as failed, 20.
pub const DEFAULT_TIMEOUT: usize = 20;
pub const DEFAULT_TIMEOUT_SECS: usize = 20;
/// Default user agent, `lychee-<PKG_VERSION>`.
pub const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));
@ -67,15 +67,18 @@ pub struct ClientBuilder {
/// As of Feb 2022, it's 60 per hour without GitHub token v.s.
/// 5000 per hour with token.
github_token: Option<SecretString>,
/// Links matching this set of regular expressions are **always** checked.
///
/// This has higher precedence over [`ClientBuilder::excludes`], **but**
/// has lower precedence over any other `exclude_` fields or
/// [`ClientBuilder::schemes`] below.
includes: Option<RegexSet>,
/// Links matching this set of regular expressions are ignored, **except**
/// when a link also matches against [`ClientBuilder::includes`].
excludes: Option<RegexSet>,
/// When `true`, exclude all private network addresses.
///
/// This effectively turns on the following fields:
@ -83,6 +86,7 @@ pub struct ClientBuilder {
/// - [`ClientBuilder::exclude_link_local_ips`]
/// - [`ClientBuilder::exclude_loopback_ips`]
exclude_all_private: bool,
/// When `true`, exclude private IP addresses.
///
/// ## IPv4
@ -109,6 +113,7 @@ pub struct ClientBuilder {
/// [IETF RFC 4291]: https://tools.ietf.org/html/rfc4291
/// [IETF RFC 3879]: https://tools.ietf.org/html/rfc3879
exclude_private_ips: bool,
/// When `true`, exclude link-local IPs.
///
/// ## IPv4
@ -127,6 +132,7 @@ pub struct ClientBuilder {
/// [RFC 4291]: https://tools.ietf.org/html/rfc4291
/// [RFC 4291 section 2.4]: https://tools.ietf.org/html/rfc4291#section-2.4
exclude_link_local_ips: bool,
/// When `true`, exclude loopback IP addresses.
///
/// ## IPv4
@ -142,14 +148,18 @@ pub struct ClientBuilder {
/// [IETF RFC 1122]: https://tools.ietf.org/html/rfc1122
/// [IETF RFC 4291 section 2.5.3]: https://tools.ietf.org/html/rfc4291#section-2.5.3
exclude_loopback_ips: bool,
/// When `true`, don't check mail addresses.
exclude_mail: bool,
/// Maximum number of redirects per request before returning an error.
#[builder(default = DEFAULT_MAX_REDIRECTS)]
max_redirects: usize,
/// Maximum number of retries per request before returning an error.
#[builder(default = DEFAULT_MAX_RETRIES)]
max_retries: u64,
/// User-agent used for checking links.
///
/// *NOTE*: This may be helpful for bypassing certain firewalls.
@ -157,6 +167,7 @@ pub struct ClientBuilder {
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
#[builder(default_code = "String::from(DEFAULT_USER_AGENT)")]
user_agent: String,
/// When `true`, accept invalid SSL certificates.
///
/// ## Warning
@ -167,9 +178,11 @@ pub struct ClientBuilder {
/// introduces significant vulnerabilities, and should only be used
/// as a last resort.
allow_insecure: bool,
/// When non-empty, only links with matched URI schemes are checked.
/// Otherwise, this has no effect.
schemes: HashSet<String>,
/// Sets the default [headers] for every request. See also [here].
///
/// This allows working around validation issues on some websites.
@ -177,15 +190,24 @@ pub struct ClientBuilder {
/// [headers]: https://docs.rs/http/latest/http/header/struct.HeaderName.html
/// [here]: https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.default_headers
custom_headers: HeaderMap,
/// HTTP method used for requests, e.g. `GET` or `HEAD`.
#[builder(default = reqwest::Method::GET)]
method: reqwest::Method,
/// Set of accepted return codes / status codes.
///
/// Unmatched return codes/ status codes are deemed as errors.
accepted: Option<HashSet<StatusCode>>,
/// Response timeout per request.
timeout: Option<Duration>,
/// Initial time between retries of failed requests
///
/// The wait time will increase using an exponential backoff mechanism
retry_wait_time: Option<Duration>,
/// Requires using HTTPS when it's available.
///
/// This would treat unencrypted links as errors when HTTPS is available.
@ -265,6 +287,10 @@ impl ClientBuilder {
exclude_mail: self.exclude_mail,
};
let retry_wait_time = self
.retry_wait_time
.unwrap_or_else(|| Duration::from_secs(DEFAULT_RETRY_WAIT_TIME_SECS as u64));
let quirks = Quirks::default();
Ok(Client {
@ -272,6 +298,7 @@ impl ClientBuilder {
github_client,
filter,
max_retries: self.max_retries,
retry_wait_time,
method,
accepted,
require_https: self.require_https,
@ -287,22 +314,34 @@ impl ClientBuilder {
pub struct Client {
/// Underlying `reqwest` client instance that handles the HTTP requests.
reqwest_client: reqwest::Client,
/// Github client.
github_client: Option<Octocrab>,
/// Rules to decide whether each link would be checked or ignored.
filter: Filter,
/// Maximum number of retries per request before returning an error.
max_retries: u64,
/// Initial time between retries of failed requests
retry_wait_time: Duration,
/// HTTP method used for requests, e.g. `GET` or `HEAD`.
///
/// The same method will be used for all links.
method: reqwest::Method,
/// Set of accepted return codes / status codes.
///
/// Unmatched return codes/ status codes are deemed as errors.
accepted: Option<HashSet<StatusCode>>,
/// Requires using HTTPS when it's available.
///
/// This would treat unecrypted links as errors when HTTPS is available.
/// This would treat unencrypted links as errors when HTTPS is available.
require_https: bool,
/// Override behaviors for certain known issues with special URIs.
quirks: Quirks,
}
@ -362,14 +401,14 @@ impl Client {
/// Here `uri` must have either an `http` or `https` scheme.
pub async fn check_website(&self, uri: &Uri) -> Status {
let mut retries: u64 = 0;
let mut wait = DEFAULT_RETRY_WAIT_TIME;
let mut wait = self.retry_wait_time;
let mut status = self.check_default(uri).await;
while retries < self.max_retries {
if status.is_success() {
return status;
}
sleep(Duration::from_secs(wait)).await;
sleep(wait).await;
retries += 1;
wait *= 2;
status = self.check_default(uri).await;

View file

@ -79,7 +79,7 @@ pub use crate::{
// Constants get exposed so that the CLI can use the same defaults as the library
client::{
check, Client, ClientBuilder, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES,
DEFAULT_RETRY_WAIT_TIME, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT,
DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
},
collector::Collector,
filter::{Excludes, Filter, Includes},