Make retries configurable; align constants (#446)

The library and the binary now use the same default values. The values
were also tweaked a bit for slightly faster performance.
This commit is contained in:
Matthias 2022-01-07 01:03:10 +01:00 committed by GitHub
parent 8df50cf501
commit 21f3160b71
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 50 additions and 24 deletions

View file

@ -233,11 +233,12 @@ OPTIONS:
-f, --format <format> Output format of final status report (compact, detailed, json, markdown)
[default: compact]
--github-token <github-token> GitHub API token to use when checking github.com links, to avoid rate
limiting [env: GITHUB_TOKEN=]
limiting [env: GITHUB_TOKEN]
-h, --headers <headers>... Custom request headers
--include <include>... URLs to check (supports regex). Has preference over all excludes
--max-concurrency <max-concurrency> Maximum number of concurrent network requests [default: 128]
-m, --max-redirects <max-redirects> Maximum number of allowed redirects [default: 10]
-m, --max-redirects <max-redirects> Maximum number of allowed redirects [default: 5]
--max-retries <max-retries> Maximum number of retries per request [default: 3]
-X, --method <method> Request method [default: get]
-o, --output <output> Output file of status report
-s, --scheme <scheme>... Only test links with the given schemes (e.g. http and https)

View file

@ -2,22 +2,22 @@ use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr};
use anyhow::{anyhow, Error, Result};
use lazy_static::lazy_static;
use lychee_lib::{Base, Input};
use lychee_lib::{
Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT,
};
use serde::Deserialize;
use structopt::{clap::crate_version, StructOpt};
use structopt::StructOpt;
const METHOD: &str = "get";
const TIMEOUT: usize = 20;
const MAX_CONCURRENCY: usize = 128;
const MAX_REDIRECTS: usize = 10;
const USER_AGENT: &str = concat!("lychee/", crate_version!());
// this exists because structopt requires `&str` type values for defaults
// (we can't use e.g. `TIMEOUT` or `timeout()` which gets created for serde)
lazy_static! {
static ref TIMEOUT_STR: String = TIMEOUT.to_string();
static ref TIMEOUT_STR: String = DEFAULT_TIMEOUT.to_string();
static ref MAX_CONCURRENCY_STR: String = MAX_CONCURRENCY.to_string();
static ref MAX_REDIRECTS_STR: String = MAX_REDIRECTS.to_string();
static ref MAX_REDIRECTS_STR: String = DEFAULT_MAX_REDIRECTS.to_string();
static ref MAX_RETRIES_STR: String = DEFAULT_MAX_RETRIES.to_string();
}
#[derive(Debug, Deserialize)]
@ -61,10 +61,11 @@ macro_rules! default_function {
// Generate the functions for serde defaults
default_function! {
max_redirects: usize = MAX_REDIRECTS;
max_redirects: usize = DEFAULT_MAX_REDIRECTS;
max_retries: u64 = DEFAULT_MAX_RETRIES;
max_concurrency: usize = MAX_CONCURRENCY;
user_agent: String = USER_AGENT.to_string();
timeout: usize = TIMEOUT;
user_agent: String = DEFAULT_USER_AGENT.to_string();
timeout: usize = DEFAULT_TIMEOUT;
method: String = METHOD.to_string();
}
@ -142,6 +143,11 @@ pub(crate) struct Config {
#[serde(default = "max_redirects")]
pub(crate) max_redirects: usize,
/// Maximum number of retries per request
#[structopt(long, default_value = &MAX_RETRIES_STR)]
#[serde(default = "max_retries")]
pub(crate) max_retries: u64,
/// Maximum number of concurrent network requests
#[structopt(long, default_value = &MAX_CONCURRENCY_STR)]
#[serde(default = "max_concurrency")]
@ -154,7 +160,7 @@ pub(crate) struct Config {
pub(crate) threads: Option<usize>,
/// User agent
#[structopt(short, long, default_value = USER_AGENT)]
#[structopt(short, long, default_value = DEFAULT_USER_AGENT)]
#[serde(default = "user_agent")]
pub(crate) user_agent: String,
@ -308,10 +314,11 @@ impl Config {
// Keys with defaults to assign
verbose: false;
no_progress: false;
max_redirects: MAX_REDIRECTS;
max_redirects: DEFAULT_MAX_REDIRECTS;
max_retries: DEFAULT_MAX_RETRIES;
max_concurrency: MAX_CONCURRENCY;
threads: None;
user_agent: USER_AGENT;
user_agent: DEFAULT_USER_AGENT;
insecure: false;
scheme: Vec::<String>::new();
include: Vec::<String>::new();
@ -324,7 +331,7 @@ impl Config {
exclude_mail: false;
headers: Vec::<String>::new();
accept: None;
timeout: TIMEOUT;
timeout: DEFAULT_TIMEOUT;
method: METHOD;
base: None;
basic_auth: None;

View file

@ -180,7 +180,7 @@ mod cli {
// Currently getting a 429 with Googlebot.
// See https://github.com/lycheeverse/lychee/issues/448
// See https://twitter.com/matthiasendler/status/1479224185125748737
// TODO: Remove this exlusion in the future
// TODO: Remove this exclusion in the future
"--exclude",
"twitter"
)

View file

@ -23,8 +23,16 @@ use crate::{
ErrorKind, Request, Response, Result, Status, Uri,
};
const DEFAULT_MAX_REDIRECTS: usize = 5;
const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));
/// Default lychee user agent
pub const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));
/// Number of redirects until a request gets declared as failed
pub const DEFAULT_MAX_REDIRECTS: usize = 5;
/// Number of retries until a request gets declared as failed
pub const DEFAULT_MAX_RETRIES: u64 = 3;
/// Wait time in seconds between retries (will be doubled after every failure)
pub const DEFAULT_RETRY_WAIT_TIME: u64 = 1;
/// Total timeout per request until a request gets declared as failed
pub const DEFAULT_TIMEOUT: usize = 20;
/// Handles incoming requests and returns responses. Usually you would not
/// initialize a `Client` yourself, but use the `ClientBuilder` because it
@ -37,6 +45,8 @@ pub struct Client {
github_client: Option<Github>,
/// Filtered domain handling.
filter: Filter,
/// Maximum number of retries
max_retries: u64,
/// Default request HTTP method to use.
method: reqwest::Method,
/// The set of accepted HTTP status codes for valid URIs.
@ -74,6 +84,9 @@ pub struct ClientBuilder {
/// Maximum number of redirects before returning error
#[builder(default = DEFAULT_MAX_REDIRECTS)]
max_redirects: usize,
/// Maximum number of retries before returning error
#[builder(default = DEFAULT_MAX_RETRIES)]
max_retries: u64,
/// User agent used for checking links
// Faking the user agent is necessary for some websites, unfortunately.
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
@ -169,6 +182,7 @@ impl ClientBuilder {
reqwest_client,
github_client: github_token,
filter,
max_retries: self.max_retries,
method: self.method.clone(),
accepted: self.accepted.clone(),
require_https: self.require_https,
@ -233,16 +247,16 @@ impl Client {
/// Check a website URI
pub async fn check_website(&self, uri: &Uri) -> Status {
let mut retries: i64 = 3;
let mut wait: u64 = 1;
let mut retries: u64 = 0;
let mut wait = DEFAULT_RETRY_WAIT_TIME;
let mut status = self.check_default(uri).await;
while retries > 0 {
while retries < self.max_retries {
if status.is_success() {
return status;
}
retries -= 1;
sleep(Duration::from_secs(wait)).await;
retries += 1;
wait *= 2;
status = self.check_default(uri).await;
}

View file

@ -72,7 +72,11 @@ use ring as _; // required for apple silicon
#[doc(inline)]
pub use crate::{
client::{check, Client, ClientBuilder},
// Constants get exposed so that the CLI can use the same defaults as the library
client::{
check, Client, ClientBuilder, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES,
DEFAULT_RETRY_WAIT_TIME, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT,
},
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{