Add simple, standalone client

Adds a new function `lychee::check()`, which removes
a lot of boilerplate for simple cases. Adjusted the code,
tests, and documentation.
The downside is that `check` now returns a Result, so
we have to use `?` to get to the response. That's because
we have to account for the case where the given string is
not a valid URI.
This commit is contained in:
Matthias Endler 2021-02-18 01:32:48 +01:00
parent ae2d02b8a0
commit 16cd67331a
8 changed files with 118 additions and 59 deletions

View file

@ -205,25 +205,35 @@ ARGS:
## Library usage
You can use lychee as a library for your own projects.
Simply add it as a dependency and build your client:
Here is a "hello world" example:
```rust
use lychee::{Request, Input, ClientBuilder, Status};
use lychee::Uri::Website;
use url::Url;
use std::error::Error;
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let response = lychee::check("https://github.com/lycheeverse/lychee").await?;
println!("{}", response);
Ok(())
}
```
This is equivalent to the following snippet, in which we build our own client:
```rust
use lychee::{ClientBuilder, Status};
use std::error::Error;
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let client = ClientBuilder::default().build()?;
let url = Url::parse("https://github.com/lycheeverse/lychee")?;
let response = client.check(Request::new(Website(url), Input::Stdin)).await;
let response = client.check("https://github.com/lycheeverse/lychee").await?;
assert!(matches!(response.status, Status::Ok(_)));
Ok(())
}
```
The client is very customizable, e.g.
The client builder is very customizable:
```rust
let client = lychee::ClientBuilder::default()
@ -242,11 +252,12 @@ let client = lychee::ClientBuilder::default()
.build()?;
```
All options that you set will be used for all link checks.
See the [builder documentation](https://docs.rs/lychee/latest/lychee/struct.ClientBuilder.html) for all options.
## GitHub Action usage
GitHub Action is available as a separate repository: [lycheeverse/lychee-action](https://github.com/lycheeverse/lychee-action)
A GitHub Action that uses lychee is available as a separate repository: [lycheeverse/lychee-action](https://github.com/lycheeverse/lychee-action)
which includes usage instructions.
## Troubleshooting and workarounds

View file

@ -77,7 +77,7 @@ fn show_progress(progress_bar: &Option<ProgressBar>, response: &Response, verbos
fn fmt(stats: &ResponseStats, format: &Format) -> Result<String> {
Ok(match format {
Format::String => stats.to_string(),
Format::JSON => serde_json::to_string_pretty(&stats)?,
Format::Json => serde_json::to_string_pretty(&stats)?,
})
}

View file

@ -16,7 +16,7 @@ const MAX_REDIRECTS: usize = 10;
#[derive(Debug, Deserialize)]
pub enum Format {
String,
JSON,
Json,
}
impl FromStr for Format {
@ -24,7 +24,7 @@ impl FromStr for Format {
fn from_str(format: &str) -> Result<Self, Self::Err> {
match format {
"string" => Ok(Format::String),
"json" => Ok(Format::JSON),
"json" => Ok(Format::Json),
_ => Err(anyhow!("Could not parse format {}", format)),
}
}

View file

@ -1,12 +1,12 @@
use anyhow::{anyhow, Context, Result};
use anyhow::{anyhow, bail, Context, Result};
use check_if_email_exists::{check_email, CheckEmailInput};
use derive_builder::Builder;
use headers::{HeaderMap, HeaderValue};
use hubcaps::{Credentials, Github};
use regex::{Regex, RegexSet};
use reqwest::header;
use std::net::IpAddr;
use std::{collections::HashSet, time::Duration};
use std::{convert::TryInto, net::IpAddr};
use tokio::time::sleep;
use url::Url;
@ -153,30 +153,32 @@ impl ClientBuilder {
}
impl Client {
pub async fn check(&self, request: Request) -> Response {
pub async fn check<T: TryInto<Request>>(&self, request: T) -> Result<Response> {
let request: Request = match request.try_into() {
Ok(request) => request,
Err(_e) => bail!("Invalid URI:"),
};
if self.excluded(&request) {
return Response::new(request.uri, Status::Excluded, request.source);
return Ok(Response::new(request.uri, Status::Excluded, request.source));
}
let status = match request.uri {
Uri::Website(ref url) => self.check_website(&url).await,
Uri::Mail(ref address) => {
let valid = self.valid_mail(&address).await;
if valid {
// TODO: We should not be using a HTTP status code for mail
Status::Ok(http::StatusCode::OK)
} else {
Status::Error(format!("Invalid mail address: {}", address))
// TODO: We should not be using a HTTP status code for mail
match self.valid_mail(&address).await {
true => Status::Ok(http::StatusCode::OK),
false => Status::Error(format!("Invalid mail address: {}", address)),
}
}
};
Response::new(request.uri, status, request.source)
Ok(Response::new(request.uri, status, request.source))
}
pub async fn check_website(&self, url: &Url) -> Status {
let mut retries: i64 = 3;
let mut wait: u64 = 1;
let status = loop {
let res = self.check_normal(&url).await;
let res = self.check_default(&url).await;
match res.is_success() {
true => return res,
false => {
@ -216,7 +218,7 @@ impl Client {
}
}
async fn check_normal(&self, url: &Url) -> Status {
async fn check_default(&self, url: &Url) -> Status {
let request = self
.reqwest_client
.request(self.method.clone(), url.as_str());
@ -317,6 +319,14 @@ impl Client {
}
}
/// Checks a single URI using a client with the default configuration.
///
/// This is the simplest way to run a link check, as it avoids having to
/// create a client manually. For more complex scenarios, use the
/// `ClientBuilder` instead.
///
/// # Errors
///
/// Returns an error if the default client cannot be built, or if `request`
/// cannot be converted into a `Request` (for example, because the given
/// string is not a valid URI).
pub async fn check<T: TryInto<Request>>(request: T) -> Result<Response> {
    let client = ClientBuilder::default().build()?;
    // `Client::check` already yields the same `Result<Response>`, so it is
    // returned directly instead of being unwrapped with `?` and re-wrapped
    // in `Ok`.
    client.check(request).await
}
#[cfg(test)]
mod test {
use crate::collector::Input;
@ -366,8 +376,9 @@ mod test {
let res = ClientBuilder::default()
.build()
.unwrap()
.check(website_url(&mock_server.uri()))
.await;
.check(mock_server.uri())
.await
.unwrap();
assert!(matches!(res.status, Status::Failed(_)));
}
@ -385,7 +396,8 @@ mod test {
.build()
.unwrap()
.check(website_url(&mock_server.uri()))
.await;
.await
.unwrap();
let end = start.elapsed();
assert!(matches!(res.status, Status::Failed(_)));
@ -414,6 +426,7 @@ mod test {
.unwrap()
.check(website_url("https://github.com/lycheeverse/lychee"))
.await
.unwrap()
.status,
Status::Ok(_)
));
@ -424,8 +437,9 @@ mod test {
let res = ClientBuilder::default()
.build()
.unwrap()
.check(website_url("https://github.com/lycheeverse/not-lychee"))
.check("https://github.com/lycheeverse/not-lychee")
.await
.unwrap()
.status;
assert!(matches!(res, Status::Error(_)));
}
@ -444,6 +458,7 @@ mod test {
.unwrap()
.check(website_url(&mock_server.uri()))
.await
.unwrap()
.status;
assert!(matches!(res, Status::Ok(_)));
}
@ -453,8 +468,9 @@ mod test {
let res = ClientBuilder::default()
.build()
.unwrap()
.check(website_url("https://expired.badssl.com/"))
.await;
.check("https://expired.badssl.com/")
.await
.unwrap();
assert!(matches!(res.status, Status::Error(_)));
// Same, but ignore certificate error
@ -462,8 +478,9 @@ mod test {
.allow_insecure(true)
.build()
.unwrap()
.check(website_url("https://expired.badssl.com/"))
.await;
.check("https://expired.badssl.com/")
.await
.unwrap();
assert!(matches!(res.status, Status::Ok(_)));
}
@ -473,7 +490,8 @@ mod test {
.build()
.unwrap()
.check(website_url("https://crates.io/crates/lychee"))
.await;
.await
.unwrap();
assert!(matches!(res.status, Status::Failed(StatusCode::NOT_FOUND)));
// Try again, but with a custom header.
@ -486,7 +504,8 @@ mod test {
.build()
.unwrap()
.check(website_url("https://crates.io/crates/lychee"))
.await;
.await
.unwrap();
assert!(matches!(res.status, Status::Ok(_)));
}
@ -511,7 +530,7 @@ mod test {
.build()
.unwrap();
let resp = client.check(website_url(&mock_server.uri())).await;
let resp = client.check(website_url(&mock_server.uri())).await.unwrap();
assert!(matches!(resp.status, Status::Timeout(_)));
}

View file

@ -25,8 +25,10 @@ impl ClientPool {
let client = self.pool.get().await;
let tx = self.tx.clone();
tokio::spawn(async move {
let resp = client.check(req).await;
tx.send(resp).await.unwrap();
let resp = client.check(req).await.expect("Invalid URI");
tx.send(resp)
.await
.expect("Cannot send response to channel");
});
}
}

View file

@ -11,7 +11,7 @@ use url::Url;
#[derive(Clone, Debug)]
pub enum FileType {
HTML,
Html,
Markdown,
Plaintext,
}
@ -29,7 +29,7 @@ impl<P: AsRef<Path>> From<P> for FileType {
match path.extension() {
Some(ext) => match ext {
_ if ext == "md" => FileType::Markdown,
_ if (ext == "htm" || ext == "html") => FileType::HTML,
_ if (ext == "htm" || ext == "html") => FileType::Html,
_ => FileType::Plaintext,
},
None => FileType::Plaintext,
@ -147,7 +147,7 @@ pub(crate) fn extract_links(
) -> HashSet<Request> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::HTML => extract_links_from_html(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
@ -234,7 +234,7 @@ mod test {
</html>"#;
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(input, FileType::HTML),
&InputContent::from_string(input, FileType::Html),
Some(Url::parse("https://github.com/lycheeverse/").unwrap()),
)
.into_iter()
@ -305,7 +305,7 @@ mod test {
fn test_extract_html5_not_valid_xml() {
let input = load_fixture("TEST_HTML5.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
@ -328,7 +328,7 @@ mod test {
fn test_extract_html5_not_valid_xml_relative_links() {
let input = load_fixture("TEST_HTML5.html");
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(&input, FileType::HTML),
&InputContent::from_string(&input, FileType::Html),
Some(Url::parse("https://example.com").unwrap()),
)
.into_iter()
@ -357,7 +357,7 @@ mod test {
// this has been problematic with previous XML based parser
let input = load_fixture("TEST_HTML5_LOWERCASE_DOCTYPE.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
@ -375,7 +375,7 @@ mod test {
// minified HTML with some quirky elements such as href attribute values specified without quotes
let input = load_fixture("TEST_HTML5_MINIFIED.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
@ -399,7 +399,7 @@ mod test {
// malformed links shouldn't stop the parser from further parsing
let input = load_fixture("TEST_HTML5_MALFORMED_LINKS.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();
@ -419,7 +419,7 @@ mod test {
// the element name shouldn't matter for attributes like href, src, cite etc
let input = load_fixture("TEST_HTML5_CUSTOM_ELEMENTS.html");
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
extract_links(&InputContent::from_string(&input, FileType::Html), None)
.into_iter()
.map(|r| r.uri)
.collect();

View file

@ -2,22 +2,30 @@
/**
* `lychee` is a library for checking links.
* The main struct of this crate is `ClientBuilder` which can be used to
* configure and run your own link checker.
*
* "Hello world" example:
* ```
* use std::error::Error;
*
* use lychee::{Request, Input, ClientBuilder, Status};
* use lychee::Uri::Website;
* use url::Url;
* #[tokio::main]
* async fn main() -> Result<(), Box<dyn Error>> {
* let response = lychee::check("https://github.com/lycheeverse/lychee").await?;
* println!("{}", response);
* Ok(())
* }
* ```
*
* For more specific use cases, you can build a lychee client yourself
* with the `ClientBuilder`, which lets you configure and run your own
* link checker with full flexibility:
*
* ```
* use lychee::{ClientBuilder, Status};
* use std::error::Error;
*
* #[tokio::main]
* async fn main() -> Result<(), Box<dyn Error>> {
* let client = ClientBuilder::default().build()?;
* let url = Url::parse("https://github.com/lycheeverse/lychee")?;
* let response = client.check(Request::new(Website(url), Input::Stdin)).await;
* let response = client.check("https://github.com/lycheeverse/lychee").await?;
* assert!(matches!(response.status, Status::Ok(_)));
* Ok(())
* }
@ -33,6 +41,7 @@ pub mod collector;
pub mod extract;
pub mod test_utils;
pub use client::check;
pub use client::ClientBuilder;
pub use client_pool::ClientPool;
pub use collector::Input;

View file

@ -21,18 +21,36 @@ impl Display for Request {
}
}
/// Fallible conversion from an owned string into a `Request`.
impl TryFrom<String> for Request {
    type Error = anyhow::Error;

    /// Parses `s` as a `Uri`. On success, the original string is moved into
    /// `Input::String` and both are handed to `Request::new`; a parse
    /// failure is propagated to the caller.
    fn try_from(s: String) -> Result<Self, Self::Error> {
        let parsed = Uri::try_from(s.as_str())?;
        let source = Input::String(s);
        Ok(Request::new(parsed, source))
    }
}
/// Fallible conversion from a string slice into a `Request`.
impl TryFrom<&str> for Request {
    type Error = anyhow::Error;

    /// Parses `s` as a `Uri`. On success, an owned copy of the string is
    /// wrapped in `Input::String` and both are handed to `Request::new`; a
    /// parse failure is propagated to the caller.
    fn try_from(s: &str) -> Result<Self, Self::Error> {
        let parsed = Uri::try_from(s)?;
        let source = Input::String(s.to_owned());
        Ok(Request::new(parsed, source))
    }
}
/// Specifies how requests to websites will be made
pub(crate) enum RequestMethod {
GET,
HEAD,
Get,
Head,
}
impl TryFrom<String> for RequestMethod {
type Error = anyhow::Error;
fn try_from(value: String) -> Result<Self, Self::Error> {
match value.to_lowercase().as_ref() {
"get" => Ok(RequestMethod::GET),
"head" => Ok(RequestMethod::HEAD),
"get" => Ok(RequestMethod::Get),
"head" => Ok(RequestMethod::Head),
_ => Err(anyhow!("Only `get` and `head` allowed, got {}", value)),
}
}