mirror of
https://github.com/Hopiu/lychee.git
synced 2026-03-17 05:00:26 +00:00
Remove cache from collector and remove custom reqwest client pool
* Reqwest comes with its own request pool, so there's no need in adding another layer of indirection. This also gets rid of a lot of allocs. * Remove cache from collector * Improve error handling and documentation * Add back test for request caching in single file Signed-off-by: MichaIng <micha@dietpi.com> Co-authored-by: Matthias <matthias-endler@gmx.net>
This commit is contained in:
parent
a7f809612d
commit
961f12e58e
18 changed files with 112 additions and 99 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -1579,7 +1579,6 @@ version = "0.7.2"
|
|||
dependencies = [
|
||||
"cached",
|
||||
"check-if-email-exists",
|
||||
"deadpool",
|
||||
"doc-comment",
|
||||
"fast_chemail",
|
||||
"glob",
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ async fn main() -> Result<()> {
|
|||
.build()
|
||||
.client()?;
|
||||
|
||||
let response = client.check("http://example.org").await?;
|
||||
let response = client.check("https://example.org").await?;
|
||||
dbg!(&response);
|
||||
assert!(response.status().is_success());
|
||||
Ok(())
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use lychee_lib::{ClientBuilder, ClientPool, Input, Request, Result, Uri};
|
||||
use lychee_lib::{ClientBuilder, Input, Request, Result, Uri};
|
||||
use std::convert::TryFrom;
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
|
|
@ -9,7 +9,7 @@ const CONCURRENT_REQUESTS: usize = 4;
|
|||
async fn main() -> Result<()> {
|
||||
// These channels are used to send requests and receive responses to and
|
||||
// from the lychee client pool
|
||||
let (send_req, recv_req) = mpsc::channel(CONCURRENT_REQUESTS);
|
||||
let (send_req, mut recv_req) = mpsc::channel(CONCURRENT_REQUESTS);
|
||||
let (send_resp, mut recv_resp) = mpsc::channel(CONCURRENT_REQUESTS);
|
||||
|
||||
// Add as many requests as you like
|
||||
|
|
@ -29,13 +29,17 @@ async fn main() -> Result<()> {
|
|||
// Create a default lychee client
|
||||
let client = ClientBuilder::default().client()?;
|
||||
|
||||
// Create a pool with four lychee clients
|
||||
let clients = vec![client; CONCURRENT_REQUESTS];
|
||||
let mut clients = ClientPool::new(send_resp, recv_req, clients);
|
||||
|
||||
// Handle requests in a client pool
|
||||
tokio::spawn(async move {
|
||||
clients.listen().await;
|
||||
while let Some(req) = recv_req.recv().await {
|
||||
// Client::check() may fail only because Request::try_from() may fail
|
||||
// here request is already Request, so it never fails
|
||||
let resp = client.check(req).await.unwrap();
|
||||
send_resp
|
||||
.send(resp)
|
||||
.await
|
||||
.expect("Cannot send response to channel");
|
||||
}
|
||||
});
|
||||
|
||||
// Finally, listen to incoming responses from lychee
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
These links are all the same and should only be checked once:
|
||||
https://example.org/
|
||||
https://example.org/
|
||||
https://example.org
|
||||
5
fixtures/TEST_REPETITION_1.txt
Normal file
5
fixtures/TEST_REPETITION_1.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
These links are all the same and match those of TEST_REPETITION_2.txt.
|
||||
All links in both files should be counted as one and checked once only.
|
||||
https://example.org/
|
||||
https://example.org/
|
||||
https://example.org
|
||||
5
fixtures/TEST_REPETITION_2.txt
Normal file
5
fixtures/TEST_REPETITION_2.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
These links are all the same and match those of TEST_REPETITION_1.txt.
|
||||
All links in both files should be counted as one and checked once only.
|
||||
https://example.org/
|
||||
https://example.org/
|
||||
https://example.org
|
||||
|
|
@ -69,7 +69,7 @@ use std::{collections::HashSet, fs, str::FromStr};
|
|||
use anyhow::{anyhow, Context, Result};
|
||||
use headers::HeaderMapExt;
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Request, Response};
|
||||
use lychee_lib::{ClientBuilder, Collector, Input, Request, Response};
|
||||
use openssl_sys as _; // required for vendored-openssl feature
|
||||
use regex::RegexSet;
|
||||
use ring as _; // required for apple silicon
|
||||
|
|
@ -228,7 +228,7 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
|
|||
Some(bar)
|
||||
};
|
||||
|
||||
let (send_req, recv_req) = mpsc::channel(max_concurrency);
|
||||
let (send_req, mut recv_req) = mpsc::channel(max_concurrency);
|
||||
let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency);
|
||||
|
||||
let mut stats = ResponseStats::new();
|
||||
|
|
@ -245,9 +245,15 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
|
|||
|
||||
// Start receiving requests
|
||||
tokio::spawn(async move {
|
||||
let clients = vec![client; max_concurrency];
|
||||
let mut clients = ClientPool::new(send_resp, recv_req, clients);
|
||||
clients.listen().await;
|
||||
while let Some(req) = recv_req.recv().await {
|
||||
// `Client::check()` may fail only because `Request::try_from()` may
|
||||
// fail. Here `req` is already a valid `Request`, so it never fails.
|
||||
let resp = client.check(req).await.unwrap();
|
||||
send_resp
|
||||
.send(resp)
|
||||
.await
|
||||
.expect("Cannot send response to channel");
|
||||
}
|
||||
});
|
||||
|
||||
while let Some(response) = recv_resp.recv().await {
|
||||
|
|
|
|||
|
|
@ -174,7 +174,7 @@ mod test {
|
|||
stats.add(Response(
|
||||
Input::Stdin,
|
||||
ResponseBody {
|
||||
uri: website("http://example.org/ok"),
|
||||
uri: website("https://example.org/ok"),
|
||||
status: Status::Ok(StatusCode::OK),
|
||||
},
|
||||
));
|
||||
|
|
|
|||
|
|
@ -216,11 +216,35 @@ mod cli {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_repetition() {
|
||||
fn test_caching_single_file() {
|
||||
let mut cmd = main_command();
|
||||
let test_schemes_path = fixtures_path().join("TEST_REPETITION.txt");
|
||||
// Repetitions in one file shall all be checked and counted only once.
|
||||
let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt");
|
||||
|
||||
cmd.arg(&test_schemes_path)
|
||||
cmd.arg(&test_schemes_path_1)
|
||||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("Total............1"))
|
||||
.stdout(contains("Successful.......1"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
// Test that two identical requests don't get executed twice.
|
||||
// Note: This currently fails, because we currently don't cache responses. We
|
||||
// used to, but there were issues.
|
||||
// See https://github.com/lycheeverse/lychee/pull/349.
|
||||
// We're planning to add back caching support at a later point in time,
|
||||
// which is why we keep the test around.
|
||||
fn test_caching_across_files() {
|
||||
let mut cmd = main_command();
|
||||
// Repetitions across multiple files shall all be checked and counted only once.
|
||||
let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt");
|
||||
let test_schemes_path_2 = fixtures_path().join("TEST_REPETITION_2.txt");
|
||||
|
||||
cmd.arg(&test_schemes_path_1)
|
||||
.arg(&test_schemes_path_2)
|
||||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
|
|
|
|||
|
|
@ -18,7 +18,6 @@ version = "0.7.2"
|
|||
|
||||
[dependencies]
|
||||
check-if-email-exists = "0.8.25"
|
||||
deadpool = "0.7.0"
|
||||
fast_chemail = "0.9.6"
|
||||
glob = "0.3.0"
|
||||
html5ever = "0.25.1"
|
||||
|
|
|
|||
|
|
@ -26,6 +26,9 @@ use crate::{
|
|||
const DEFAULT_MAX_REDIRECTS: usize = 5;
|
||||
const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));
|
||||
|
||||
/// Handles incoming requests and returns responses. Usually you would not
|
||||
/// initialize a `Client` yourself, but use the `ClientBuilder` because it
|
||||
/// provides sane defaults for all configuration options.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Client {
|
||||
/// Underlying reqwest client instance that handles the HTTP requests.
|
||||
|
|
@ -123,7 +126,13 @@ impl ClientBuilder {
|
|||
}
|
||||
|
||||
/// The build method instantiates the client.
|
||||
#[allow(clippy::missing_errors_doc)]
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an `Err` if:
|
||||
/// - The user agent cannot be parsed
|
||||
/// - The request client cannot be created
|
||||
/// - The Github client cannot be created
|
||||
pub fn client(&self) -> Result<Client> {
|
||||
let mut headers = self.custom_headers.clone();
|
||||
headers.insert(header::USER_AGENT, HeaderValue::from_str(&self.user_agent)?);
|
||||
|
|
@ -169,6 +178,13 @@ impl ClientBuilder {
|
|||
}
|
||||
|
||||
impl Client {
|
||||
/// Check a single request
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This returns an `Err` if
|
||||
/// - The request cannot be parsed
|
||||
/// - An HTTP website with an invalid URI format gets checked
|
||||
pub async fn check<T, E>(&self, request: T) -> Result<Response>
|
||||
where
|
||||
Request: TryFrom<T, Error = E>,
|
||||
|
|
@ -185,7 +201,10 @@ impl Client {
|
|||
match self.check_website(&uri).await {
|
||||
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
|
||||
let mut https_uri = uri.clone();
|
||||
https_uri.url.set_scheme("https").unwrap();
|
||||
https_uri
|
||||
.url
|
||||
.set_scheme("https")
|
||||
.map_err(|_| ErrorKind::InvalidURI(uri.clone()))?;
|
||||
if self.check_website(&https_uri).await.is_success() {
|
||||
Status::Error(Box::new(ErrorKind::InsecureURL(https_uri)))
|
||||
} else {
|
||||
|
|
@ -200,10 +219,12 @@ impl Client {
|
|||
}
|
||||
|
||||
/// Check if the given URI is filtered by the client
|
||||
#[must_use]
|
||||
pub fn filtered(&self, uri: &Uri) -> bool {
|
||||
self.filter.is_excluded(uri)
|
||||
}
|
||||
|
||||
/// Check a website URI
|
||||
pub async fn check_website(&self, uri: &Uri) -> Status {
|
||||
let mut retries: i64 = 3;
|
||||
let mut wait: u64 = 1;
|
||||
|
|
@ -256,6 +277,7 @@ impl Client {
|
|||
}
|
||||
}
|
||||
|
||||
/// Check a file URI
|
||||
pub async fn check_file(&self, uri: &Uri) -> Status {
|
||||
if let Ok(path) = uri.url.to_file_path() {
|
||||
if path.exists() {
|
||||
|
|
@ -265,6 +287,7 @@ impl Client {
|
|||
ErrorKind::InvalidFilePath(uri.clone()).into()
|
||||
}
|
||||
|
||||
/// Check a mail address
|
||||
pub async fn check_mail(&self, uri: &Uri) -> Status {
|
||||
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
|
||||
let result = &(check_email(&input).await)[0];
|
||||
|
|
|
|||
|
|
@ -1,45 +0,0 @@
|
|||
use client::Client;
|
||||
use deadpool::unmanaged::Pool;
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
use crate::{client, types};
|
||||
|
||||
#[allow(missing_debug_implementations)]
|
||||
/// Manages a channel for incoming requests
|
||||
/// and a pool of lychee clients to handle them
|
||||
pub struct ClientPool {
|
||||
tx: mpsc::Sender<types::Response>,
|
||||
rx: mpsc::Receiver<types::Request>,
|
||||
pool: deadpool::unmanaged::Pool<client::Client>,
|
||||
}
|
||||
|
||||
impl ClientPool {
|
||||
#[must_use]
|
||||
/// Creates a new client pool
|
||||
pub fn new(
|
||||
tx: mpsc::Sender<types::Response>,
|
||||
rx: mpsc::Receiver<types::Request>,
|
||||
clients: Vec<Client>,
|
||||
) -> Self {
|
||||
let pool = Pool::from(clients);
|
||||
ClientPool { tx, rx, pool }
|
||||
}
|
||||
|
||||
#[allow(clippy::missing_panics_doc)]
|
||||
/// Start listening for incoming requests and send each of them
|
||||
/// asynchronously to a client from the pool
|
||||
pub async fn listen(&mut self) {
|
||||
while let Some(req) = self.rx.recv().await {
|
||||
let client = self.pool.get().await;
|
||||
let tx = self.tx.clone();
|
||||
tokio::spawn(async move {
|
||||
// Client::check() may fail only because Request::try_from() may fail
|
||||
// here request is already Request, so it never fails
|
||||
let resp = client.check(req).await.unwrap();
|
||||
tx.send(resp)
|
||||
.await
|
||||
.expect("Cannot send response to channel");
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
use crate::{extract::Extractor, Base, Input, Request, Result, Uri};
|
||||
use crate::{extract::Extractor, Base, Input, Request, Result};
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Collector keeps the state of link collection
|
||||
|
|
@ -7,18 +7,20 @@ pub struct Collector {
|
|||
base: Option<Base>,
|
||||
skip_missing_inputs: bool,
|
||||
max_concurrency: usize,
|
||||
cache: HashSet<Uri>,
|
||||
}
|
||||
|
||||
impl Collector {
|
||||
/// Create a new collector with an empty cache
|
||||
#[must_use]
|
||||
pub fn new(base: Option<Base>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
|
||||
pub const fn new(
|
||||
base: Option<Base>,
|
||||
skip_missing_inputs: bool,
|
||||
max_concurrency: usize,
|
||||
) -> Self {
|
||||
Collector {
|
||||
base,
|
||||
skip_missing_inputs,
|
||||
max_concurrency,
|
||||
cache: HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -29,7 +31,7 @@ impl Collector {
|
|||
/// # Errors
|
||||
///
|
||||
/// Will return `Err` if links cannot be extracted from an input
|
||||
pub async fn collect_links(mut self, inputs: &[Input]) -> Result<HashSet<Request>> {
|
||||
pub async fn collect_links(self, inputs: &[Input]) -> Result<HashSet<Request>> {
|
||||
let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(self.max_concurrency);
|
||||
|
||||
// extract input contents
|
||||
|
|
@ -71,17 +73,8 @@ impl Collector {
|
|||
links.extend(new_links?);
|
||||
}
|
||||
|
||||
// Filter out already cached links (duplicates)
|
||||
links.retain(|l| !self.cache.contains(&l.uri));
|
||||
|
||||
self.update_cache(&links);
|
||||
Ok(links)
|
||||
}
|
||||
|
||||
/// Update internal link cache
|
||||
fn update_cache(&mut self, links: &HashSet<Request>) {
|
||||
self.cache.extend(links.iter().cloned().map(|l| l.uri));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
|
|
@ -154,7 +154,7 @@ impl Extractor {
|
|||
match resolved {
|
||||
Some(path) => Url::from_file_path(&path)
|
||||
.map(Some)
|
||||
.map_err(|_e| ErrorKind::InvalidUrl(path)),
|
||||
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
|
@ -239,7 +239,7 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn test_extract_link_at_end_of_line() {
|
||||
let input = "http://www.apache.org/licenses/LICENSE-2.0\n";
|
||||
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
|
||||
let link = input.trim_end();
|
||||
|
||||
let mut extractor = Extractor::new(None);
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ pub use includes::Includes;
|
|||
use crate::Uri;
|
||||
|
||||
/// Pre-defined exclusions for known false-positives
|
||||
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
|
||||
const FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
|
||||
|
||||
#[inline]
|
||||
#[must_use]
|
||||
|
|
@ -298,7 +298,7 @@ mod test {
|
|||
..Filter::default()
|
||||
};
|
||||
|
||||
assert!(filter.is_excluded(&website("http://github.com")));
|
||||
assert!(filter.is_excluded(&website("https://github.com")));
|
||||
assert!(filter.is_excluded(&website("http://exclude.org")));
|
||||
assert!(filter.is_excluded(&mail("mail@example.org")));
|
||||
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@
|
|||
doc_comment::doctest!("../../README.md");
|
||||
|
||||
mod client;
|
||||
mod client_pool;
|
||||
/// A pool of clients, to handle concurrent checks
|
||||
pub mod collector;
|
||||
mod helpers;
|
||||
|
|
@ -73,8 +72,7 @@ use ring as _; // required for apple silicon
|
|||
|
||||
#[doc(inline)]
|
||||
pub use crate::{
|
||||
client::{check, ClientBuilder},
|
||||
client_pool::ClientPool,
|
||||
client::{check, Client, ClientBuilder},
|
||||
collector::Collector,
|
||||
filter::{Excludes, Filter, Includes},
|
||||
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ pub enum ErrorKind {
|
|||
/// The given URI cannot be converted to a file path
|
||||
InvalidFilePath(Uri),
|
||||
/// The given path cannot be converted to a URI
|
||||
InvalidUrl(PathBuf),
|
||||
InvalidUrlFromPath(PathBuf),
|
||||
/// The given mail address is unreachable
|
||||
UnreachableEmailAddress(Uri),
|
||||
/// The given header could not be parsed.
|
||||
|
|
@ -40,8 +40,10 @@ pub enum ErrorKind {
|
|||
InvalidGlobPattern(glob::PatternError),
|
||||
/// The Github API could not be called because of a missing Github token.
|
||||
MissingGitHubToken,
|
||||
/// The website is available in HTTPS protocol, but HTTP scheme is used.
|
||||
/// The website is available through HTTPS, but HTTP scheme is used.
|
||||
InsecureURL(Uri),
|
||||
/// Invalid URI
|
||||
InvalidURI(Uri),
|
||||
}
|
||||
|
||||
impl PartialEq for ErrorKind {
|
||||
|
|
@ -76,7 +78,8 @@ impl Hash for ErrorKind {
|
|||
Self::HubcapsError(e) => e.to_string().hash(state),
|
||||
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
|
||||
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
|
||||
Self::InvalidUrl(p) => p.hash(state),
|
||||
Self::InvalidURI(u) => u.hash(state),
|
||||
Self::InvalidUrlFromPath(p) => p.hash(state),
|
||||
Self::Utf8Error(e) => e.to_string().hash(state),
|
||||
Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
|
||||
u.hash(state);
|
||||
|
|
@ -113,7 +116,10 @@ impl Display for ErrorKind {
|
|||
write!(f, "Cannot parse {} as website url ({})", s, url_err)
|
||||
}
|
||||
Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u),
|
||||
Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()),
|
||||
Self::InvalidURI(u) => write!(f, "Invalid URI: {}", u),
|
||||
Self::InvalidUrlFromPath(p) => {
|
||||
write!(f, "Invalid path to URL conversion: {}", p.display())
|
||||
}
|
||||
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
|
||||
Self::InvalidHeader(e) => e.fmt(f),
|
||||
Self::InvalidGlobPattern(e) => e.fmt(f),
|
||||
|
|
|
|||
|
|
@ -160,12 +160,12 @@ mod test {
|
|||
fn test_uri_from_str() {
|
||||
assert!(Uri::try_from("").is_err());
|
||||
assert_eq!(
|
||||
Uri::try_from("http://example.org"),
|
||||
Ok(website("http://example.org"))
|
||||
Uri::try_from("https://example.org"),
|
||||
Ok(website("https://example.org"))
|
||||
);
|
||||
assert_eq!(
|
||||
Uri::try_from("http://example.org/@test/testing"),
|
||||
Ok(website("http://example.org/@test/testing"))
|
||||
Uri::try_from("https://example.org/@test/testing"),
|
||||
Ok(website("https://example.org/@test/testing"))
|
||||
);
|
||||
assert_eq!(
|
||||
Uri::try_from("mail@example.org"),
|
||||
|
|
|
|||
Loading…
Reference in a new issue