Remove cache from collector and remove custom reqwest client pool

* Reqwest comes with its own connection pool, so there's no need to add
another layer of indirection. This also gets rid of a lot of allocations.
* Remove cache from collector
* Improve error handling and documentation
* Add back test for request caching in single file

Signed-off-by: MichaIng <micha@dietpi.com>
Co-authored-by: Matthias <matthias-endler@gmx.net>
This commit is contained in:
MichaIng 2021-10-07 18:07:18 +02:00 committed by GitHub
parent a7f809612d
commit 961f12e58e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 112 additions and 99 deletions

1
Cargo.lock generated
View file

@ -1579,7 +1579,6 @@ version = "0.7.2"
dependencies = [
"cached",
"check-if-email-exists",
"deadpool",
"doc-comment",
"fast_chemail",
"glob",

View file

@ -40,7 +40,7 @@ async fn main() -> Result<()> {
.build()
.client()?;
let response = client.check("http://example.org").await?;
let response = client.check("https://example.org").await?;
dbg!(&response);
assert!(response.status().is_success());
Ok(())

View file

@ -1,4 +1,4 @@
use lychee_lib::{ClientBuilder, ClientPool, Input, Request, Result, Uri};
use lychee_lib::{ClientBuilder, Input, Request, Result, Uri};
use std::convert::TryFrom;
use tokio::sync::mpsc;
@ -9,7 +9,7 @@ const CONCURRENT_REQUESTS: usize = 4;
async fn main() -> Result<()> {
// These channels are used to send requests and receive responses to and
// from the lychee client pool
let (send_req, recv_req) = mpsc::channel(CONCURRENT_REQUESTS);
let (send_req, mut recv_req) = mpsc::channel(CONCURRENT_REQUESTS);
let (send_resp, mut recv_resp) = mpsc::channel(CONCURRENT_REQUESTS);
// Add as many requests as you like
@ -29,13 +29,17 @@ async fn main() -> Result<()> {
// Create a default lychee client
let client = ClientBuilder::default().client()?;
// Create a pool with four lychee clients
let clients = vec![client; CONCURRENT_REQUESTS];
let mut clients = ClientPool::new(send_resp, recv_req, clients);
// Handle requests in a client pool
tokio::spawn(async move {
clients.listen().await;
while let Some(req) = recv_req.recv().await {
// `Client::check()` may fail only because `Request::try_from()` may fail.
// Here `req` is already a valid `Request`, so it never fails.
let resp = client.check(req).await.unwrap();
send_resp
.send(resp)
.await
.expect("Cannot send response to channel");
}
});
// Finally, listen to incoming responses from lychee

View file

@ -1,4 +0,0 @@
These links are all the same and should only be checked once:
https://example.org/
https://example.org/
https://example.org

View file

@ -0,0 +1,5 @@
These links are all the same and match those of TEST_REPETITION_2.txt.
All links in both files should be counted as one and checked once only.
https://example.org/
https://example.org/
https://example.org

View file

@ -0,0 +1,5 @@
These links are all the same and match those of TEST_REPETITION_1.txt.
All links in both files should be counted as one and checked once only.
https://example.org/
https://example.org/
https://example.org

View file

@ -69,7 +69,7 @@ use std::{collections::HashSet, fs, str::FromStr};
use anyhow::{anyhow, Context, Result};
use headers::HeaderMapExt;
use indicatif::{ProgressBar, ProgressStyle};
use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Request, Response};
use lychee_lib::{ClientBuilder, Collector, Input, Request, Response};
use openssl_sys as _; // required for vendored-openssl feature
use regex::RegexSet;
use ring as _; // required for apple silicon
@ -228,7 +228,7 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
Some(bar)
};
let (send_req, recv_req) = mpsc::channel(max_concurrency);
let (send_req, mut recv_req) = mpsc::channel(max_concurrency);
let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency);
let mut stats = ResponseStats::new();
@ -245,9 +245,15 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
// Start receiving requests
tokio::spawn(async move {
let clients = vec![client; max_concurrency];
let mut clients = ClientPool::new(send_resp, recv_req, clients);
clients.listen().await;
while let Some(req) = recv_req.recv().await {
// `Client::check()` may fail only because `Request::try_from()` may
// fail. Here `req` is already a valid `Request`, so it never fails.
let resp = client.check(req).await.unwrap();
send_resp
.send(resp)
.await
.expect("Cannot send response to channel");
}
});
while let Some(response) = recv_resp.recv().await {

View file

@ -174,7 +174,7 @@ mod test {
stats.add(Response(
Input::Stdin,
ResponseBody {
uri: website("http://example.org/ok"),
uri: website("https://example.org/ok"),
status: Status::Ok(StatusCode::OK),
},
));

View file

@ -216,11 +216,35 @@ mod cli {
}
#[test]
fn test_repetition() {
fn test_caching_single_file() {
let mut cmd = main_command();
let test_schemes_path = fixtures_path().join("TEST_REPETITION.txt");
// Repetitions in one file shall all be checked and counted only once.
let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt");
cmd.arg(&test_schemes_path)
cmd.arg(&test_schemes_path_1)
.env_clear()
.assert()
.success()
.stdout(contains("Total............1"))
.stdout(contains("Successful.......1"));
}
#[test]
#[ignore]
// Test that two identical requests don't get executed twice.
// Note: This currently fails, because we don't cache responses. We
// used to, but there were issues.
// See https://github.com/lycheeverse/lychee/pull/349.
// We're planning to add back caching support at a later point in time,
// which is why we keep the test around.
fn test_caching_across_files() {
let mut cmd = main_command();
// Repetitions across multiple files shall all be checked and counted only once.
let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt");
let test_schemes_path_2 = fixtures_path().join("TEST_REPETITION_2.txt");
cmd.arg(&test_schemes_path_1)
.arg(&test_schemes_path_2)
.env_clear()
.assert()
.success()

View file

@ -18,7 +18,6 @@ version = "0.7.2"
[dependencies]
check-if-email-exists = "0.8.25"
deadpool = "0.7.0"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"

View file

@ -26,6 +26,9 @@ use crate::{
const DEFAULT_MAX_REDIRECTS: usize = 5;
const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));
/// Handles incoming requests and returns responses. Usually you would not
/// initialize a `Client` yourself, but use the `ClientBuilder` because it
/// provides sane defaults for all configuration options.
#[derive(Debug, Clone)]
pub struct Client {
/// Underlying reqwest client instance that handles the HTTP requests.
@ -123,7 +126,13 @@ impl ClientBuilder {
}
/// The build method instantiates the client.
#[allow(clippy::missing_errors_doc)]
///
/// # Errors
///
/// Returns an `Err` if:
/// - The user agent cannot be parsed
/// - The request client cannot be created
/// - The Github client cannot be created
pub fn client(&self) -> Result<Client> {
let mut headers = self.custom_headers.clone();
headers.insert(header::USER_AGENT, HeaderValue::from_str(&self.user_agent)?);
@ -169,6 +178,13 @@ impl ClientBuilder {
}
impl Client {
/// Check a single request
///
/// # Errors
///
/// This returns an `Err` if
/// - The request cannot be parsed
/// - An HTTP website with an invalid URI format gets checked
pub async fn check<T, E>(&self, request: T) -> Result<Response>
where
Request: TryFrom<T, Error = E>,
@ -185,7 +201,10 @@ impl Client {
match self.check_website(&uri).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
let mut https_uri = uri.clone();
https_uri.url.set_scheme("https").unwrap();
https_uri
.url
.set_scheme("https")
.map_err(|_| ErrorKind::InvalidURI(uri.clone()))?;
if self.check_website(&https_uri).await.is_success() {
Status::Error(Box::new(ErrorKind::InsecureURL(https_uri)))
} else {
@ -200,10 +219,12 @@ impl Client {
}
/// Check if the given URI is filtered by the client
#[must_use]
pub fn filtered(&self, uri: &Uri) -> bool {
self.filter.is_excluded(uri)
}
/// Check a website URI
pub async fn check_website(&self, uri: &Uri) -> Status {
let mut retries: i64 = 3;
let mut wait: u64 = 1;
@ -256,6 +277,7 @@ impl Client {
}
}
/// Check a file URI
pub async fn check_file(&self, uri: &Uri) -> Status {
if let Ok(path) = uri.url.to_file_path() {
if path.exists() {
@ -265,6 +287,7 @@ impl Client {
ErrorKind::InvalidFilePath(uri.clone()).into()
}
/// Check a mail address
pub async fn check_mail(&self, uri: &Uri) -> Status {
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
let result = &(check_email(&input).await)[0];

View file

@ -1,45 +0,0 @@
use client::Client;
use deadpool::unmanaged::Pool;
use tokio::sync::mpsc;
use crate::{client, types};
#[allow(missing_debug_implementations)]
/// Manages a channel for incoming requests
/// and a pool of lychee clients to handle them
pub struct ClientPool {
tx: mpsc::Sender<types::Response>,
rx: mpsc::Receiver<types::Request>,
pool: deadpool::unmanaged::Pool<client::Client>,
}
impl ClientPool {
#[must_use]
/// Creates a new client pool
pub fn new(
tx: mpsc::Sender<types::Response>,
rx: mpsc::Receiver<types::Request>,
clients: Vec<Client>,
) -> Self {
let pool = Pool::from(clients);
ClientPool { tx, rx, pool }
}
#[allow(clippy::missing_panics_doc)]
/// Start listening for incoming requests and send each of them
/// asynchronously to a client from the pool
pub async fn listen(&mut self) {
while let Some(req) = self.rx.recv().await {
let client = self.pool.get().await;
let tx = self.tx.clone();
tokio::spawn(async move {
// `Client::check()` may fail only because `Request::try_from()` may fail.
// Here `req` is already a valid `Request`, so it never fails.
let resp = client.check(req).await.unwrap();
tx.send(resp)
.await
.expect("Cannot send response to channel");
});
}
}
}

View file

@ -1,4 +1,4 @@
use crate::{extract::Extractor, Base, Input, Request, Result, Uri};
use crate::{extract::Extractor, Base, Input, Request, Result};
use std::collections::HashSet;
/// Collector keeps the state of link collection
@ -7,18 +7,20 @@ pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
max_concurrency: usize,
cache: HashSet<Uri>,
}
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub fn new(base: Option<Base>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
pub const fn new(
base: Option<Base>,
skip_missing_inputs: bool,
max_concurrency: usize,
) -> Self {
Collector {
base,
skip_missing_inputs,
max_concurrency,
cache: HashSet::new(),
}
}
@ -29,7 +31,7 @@ impl Collector {
/// # Errors
///
/// Will return `Err` if links cannot be extracted from an input
pub async fn collect_links(mut self, inputs: &[Input]) -> Result<HashSet<Request>> {
pub async fn collect_links(self, inputs: &[Input]) -> Result<HashSet<Request>> {
let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(self.max_concurrency);
// extract input contents
@ -71,17 +73,8 @@ impl Collector {
links.extend(new_links?);
}
// Filter out already cached links (duplicates)
links.retain(|l| !self.cache.contains(&l.uri));
self.update_cache(&links);
Ok(links)
}
/// Update internal link cache
fn update_cache(&mut self, links: &HashSet<Request>) {
self.cache.extend(links.iter().cloned().map(|l| l.uri));
}
}
#[cfg(test)]

View file

@ -154,7 +154,7 @@ impl Extractor {
match resolved {
Some(path) => Url::from_file_path(&path)
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrl(path)),
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
None => Ok(None),
}
}
@ -239,7 +239,7 @@ mod test {
#[test]
fn test_extract_link_at_end_of_line() {
let input = "http://www.apache.org/licenses/LICENSE-2.0\n";
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();
let mut extractor = Extractor::new(None);

View file

@ -9,7 +9,7 @@ pub use includes::Includes;
use crate::Uri;
/// Pre-defined exclusions for known false-positives
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
const FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
#[inline]
#[must_use]
@ -298,7 +298,7 @@ mod test {
..Filter::default()
};
assert!(filter.is_excluded(&website("http://github.com")));
assert!(filter.is_excluded(&website("https://github.com")));
assert!(filter.is_excluded(&website("http://exclude.org")));
assert!(filter.is_excluded(&mail("mail@example.org")));

View file

@ -47,7 +47,6 @@
doc_comment::doctest!("../../README.md");
mod client;
mod client_pool;
/// A pool of clients, to handle concurrent checks
pub mod collector;
mod helpers;
@ -73,8 +72,7 @@ use ring as _; // required for apple silicon
#[doc(inline)]
pub use crate::{
client::{check, ClientBuilder},
client_pool::ClientPool,
client::{check, Client, ClientBuilder},
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},

View file

@ -25,7 +25,7 @@ pub enum ErrorKind {
/// The given URI cannot be converted to a file path
InvalidFilePath(Uri),
/// The given path cannot be converted to a URI
InvalidUrl(PathBuf),
InvalidUrlFromPath(PathBuf),
/// The given mail address is unreachable
UnreachableEmailAddress(Uri),
/// The given header could not be parsed.
@ -40,8 +40,10 @@ pub enum ErrorKind {
InvalidGlobPattern(glob::PatternError),
/// The Github API could not be called because of a missing Github token.
MissingGitHubToken,
/// The website is available in HTTPS protocol, but HTTP scheme is used.
/// The website is available through HTTPS, but HTTP scheme is used.
InsecureURL(Uri),
/// Invalid URI
InvalidURI(Uri),
}
impl PartialEq for ErrorKind {
@ -76,7 +78,8 @@ impl Hash for ErrorKind {
Self::HubcapsError(e) => e.to_string().hash(state),
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
Self::InvalidUrl(p) => p.hash(state),
Self::InvalidURI(u) => u.hash(state),
Self::InvalidUrlFromPath(p) => p.hash(state),
Self::Utf8Error(e) => e.to_string().hash(state),
Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
u.hash(state);
@ -113,7 +116,10 @@ impl Display for ErrorKind {
write!(f, "Cannot parse {} as website url ({})", s, url_err)
}
Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u),
Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()),
Self::InvalidURI(u) => write!(f, "Invalid URI: {}", u),
Self::InvalidUrlFromPath(p) => {
write!(f, "Invalid path to URL conversion: {}", p.display())
}
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
Self::InvalidHeader(e) => e.fmt(f),
Self::InvalidGlobPattern(e) => e.fmt(f),

View file

@ -160,12 +160,12 @@ mod test {
fn test_uri_from_str() {
assert!(Uri::try_from("").is_err());
assert_eq!(
Uri::try_from("http://example.org"),
Ok(website("http://example.org"))
Uri::try_from("https://example.org"),
Ok(website("https://example.org"))
);
assert_eq!(
Uri::try_from("http://example.org/@test/testing"),
Ok(website("http://example.org/@test/testing"))
Uri::try_from("https://example.org/@test/testing"),
Ok(website("https://example.org/@test/testing"))
);
assert_eq!(
Uri::try_from("mail@example.org"),