Merge pull request #143 from lycheeverse/input-source

Show input source in status output
This commit is contained in:
Matthias 2021-02-17 15:03:49 +01:00 committed by GitHub
commit ae2d02b8a0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 512 additions and 255 deletions

12
Cargo.lock generated
View file

@ -1392,7 +1392,7 @@ dependencies = [
[[package]]
name = "lychee"
version = "0.5.0"
version = "0.6.0"
dependencies = [
"anyhow",
"assert_cmd",
@ -1411,6 +1411,7 @@ dependencies = [
"markup5ever",
"markup5ever_rcdom",
"openssl-sys",
"pad",
"predicates",
"pulldown-cmark",
"regex",
@ -1690,6 +1691,15 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "pad"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2ad9b889f1b12e0b9ee24db044b5129150d5eada288edc800f789928dc8c0e3"
dependencies = [
"unicode-width",
]
[[package]]
name = "parking"
version = "2.0.0"

View file

@ -14,7 +14,7 @@ keywords = [
license = "Apache-2.0/MIT"
name = "lychee"
repository = "https://github.com/lycheeverse/lychee"
version = "0.5.0"
version = "0.6.0"
[dependencies]
anyhow = "1.0.38"
@ -51,6 +51,7 @@ serde_json = "1.0.62"
# This is necessary for the homebrew build
# https://github.com/Homebrew/homebrew-core/pull/70216
ring = "0.16.19"
pad = "0.1.6"
[dependencies.reqwest]
features = ["gzip"]

View file

@ -174,7 +174,7 @@ OPTIONS:
--basic-auth <basic-auth> Basic authentication support. E.g. `username:password`
-c, --config <config-file> Configuration file to use [default: ./lychee.toml]
--exclude <exclude>... Exclude URLs from checking (supports regex)
-f, --format <format> Output file format of status report [default: string]
-f, --format <format> Output file format of status report (json, string) [default: string]
--github-token <github-token> GitHub API token to use when checking github.com links, to avoid rate
limiting [env: GITHUB_TOKEN=]
-h, --headers <headers>... Custom request headers
@ -187,7 +187,7 @@ OPTIONS:
-T, --threads <threads> Number of threads to utilize. Defaults to number of cores available to
the system
-t, --timeout <timeout> Website timeout from connect to response finished [default: 20]
-u, --user-agent <user-agent> User agent [default: lychee/0.5.0]
-u, --user-agent <user-agent> User agent [default: lychee/0.6.0]
ARGS:
<inputs>... The inputs (where to get links to check from). These can be: files (e.g. `README.md`), file globs
@ -208,12 +208,19 @@ You can use lychee as a library for your own projects.
Simply add it as a dependency and build your client:
```rust
use http::StatusCode
use lychee::{Request, Input, ClientBuilder, Status};
use lychee::Uri::Website;
use url::Url;
use std::error::Error;
let client = lychee::ClientBuilder::default().build()?;
let url = Url::parse("https://github.com/lycheeverse/lychee")?;
let response = client.check(Website(url)).await?;
assert!(matches!(response.status, Status::Ok(_)));
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let client = ClientBuilder::default().build()?;
let url = Url::parse("https://github.com/lycheeverse/lychee")?;
let response = client.check(Request::new(Website(url), Input::Stdin)).await;
assert!(matches!(response.status, Status::Ok(_)));
Ok(())
}
```
The client is very customizable, e.g.

View file

@ -16,7 +16,7 @@ use crate::options::{Config, LycheeOptions};
use crate::stats::ResponseStats;
use lychee::collector::{self, Input};
use lychee::{ClientBuilder, ClientPool, Response, Status};
use lychee::{ClientBuilder, ClientPool, Response};
/// A C-like enum that can be cast to `i32` and used as process exit code.
enum ExitCode {
@ -62,22 +62,22 @@ fn run_main() -> Result<i32> {
}
fn show_progress(progress_bar: &Option<ProgressBar>, response: &Response, verbose: bool) {
let message = status_message(&response, verbose);
if (response.status.is_success() || response.status.is_excluded()) && !verbose {
return;
}
// Regular println! interferes with progress bar
if let Some(pb) = progress_bar {
pb.inc(1);
// regular println! interferes with progress bar
if let Some(message) = message {
pb.println(message);
}
} else if let Some(message) = message {
println!("{}", message);
};
pb.println(response.to_string());
} else {
println!("{}", response);
}
}
fn fmt(stats: &ResponseStats, format: &Format) -> Result<String> {
Ok(match format {
Format::String => stats.to_string(),
Format::JSON => serde_json::to_string(&stats)?,
Format::JSON => serde_json::to_string_pretty(&stats)?,
})
}
@ -120,6 +120,7 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
max_concurrency,
)
.await?;
let pb = if cfg.progress {
Some(
ProgressBar::new(links.len() as u64)
@ -166,13 +167,11 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
pb.finish_and_clear();
}
if cfg.verbose {
println!("\n{}", stats);
}
let stats_formatted = fmt(&stats, &cfg.format)?;
if let Some(output) = &cfg.output {
fs::write(output, fmt(&stats, &cfg.format)?)
.context("Cannot write status output to file")?;
fs::write(output, stats_formatted).context("Cannot write status output to file")?;
} else {
println!("\n{}", stats_formatted);
}
match stats.is_success() {
@ -228,18 +227,6 @@ fn parse_basic_auth(auth: &str) -> Result<Authorization<Basic>> {
Ok(Authorization::basic(params[0], params[1]))
}
fn status_message(response: &Response, verbose: bool) -> Option<String> {
match &response.status {
Status::Ok(code) if verbose => Some(format!("{} [{}]", response.uri, code)),
Status::Redirected if verbose => Some(format!("🔀️ {}", response.uri)),
Status::Excluded if verbose => Some(format!("👻 {}", response.uri)),
Status::Failed(code) => Some(format!("🚫 {} [{}]", response.uri, code)),
Status::Error(e) => Some(format!("{} ({})", response.uri, e)),
Status::Timeout => Some(format!("{}", response.uri)),
_ => None,
}
}
#[cfg(test)]
mod test {
use super::*;

View file

@ -239,7 +239,7 @@ pub struct Config {
#[serde(default)]
pub output: Option<PathBuf>,
/// Output file format of status report
/// Output file format of status report (json, string)
#[structopt(short, long, default_value = "string")]
#[serde(default)]
pub format: Format,

View file

@ -1,65 +1,153 @@
use serde::{Deserialize, Serialize};
use pad::{Alignment, PadStr};
use serde::Serialize;
use std::{
collections::HashSet,
collections::{HashMap, HashSet},
fmt::{self, Display},
};
use lychee::{Response, Status::*, Uri};
use lychee::{collector::Input, Response, Status::*};
#[derive(Serialize, Deserialize)]
// Maximum padding for each entry in the final statistics output
const MAX_PADDING: usize = 20;
#[derive(Serialize)]
pub struct ResponseStats {
total: usize,
successful: usize,
failures: HashSet<Uri>,
timeouts: HashSet<Uri>,
redirects: HashSet<Uri>,
excludes: HashSet<Uri>,
errors: HashSet<Uri>,
failures: usize,
timeouts: usize,
redirects: usize,
excludes: usize,
errors: usize,
fail_map: HashMap<Input, HashSet<Response>>,
}
impl ResponseStats {
pub fn new() -> Self {
let fail_map = HashMap::new();
ResponseStats {
total: 0,
successful: 0,
failures: HashSet::new(),
timeouts: HashSet::new(),
redirects: HashSet::new(),
excludes: HashSet::new(),
errors: HashSet::new(),
failures: 0,
timeouts: 0,
redirects: 0,
excludes: 0,
errors: 0,
fail_map,
}
}
pub fn add(&mut self, response: Response) {
self.total += 1;
let uri = response.uri;
if !match response.status {
Failed(_) => self.failures.insert(uri),
Timeout => self.timeouts.insert(uri),
Redirected => self.redirects.insert(uri),
Excluded => self.excludes.insert(uri),
Error(_) => self.errors.insert(uri),
_ => false,
} {
self.successful += 1;
match response.status {
Failed(_) => self.failures += 1,
Timeout(_) => self.timeouts += 1,
Redirected(_) => self.redirects += 1,
Excluded => self.excludes += 1,
Error(_) => self.errors += 1,
_ => self.successful += 1,
}
if matches!(
response.status,
Failed(_) | Timeout(_) | Redirected(_) | Error(_)
) {
let fail = self.fail_map.entry(response.source.clone()).or_default();
fail.insert(response);
};
}
pub fn is_success(&self) -> bool {
self.total == self.successful + self.excludes.len()
self.total == self.successful + self.excludes
}
}
/// Writes one row of the summary table followed by a newline.
///
/// The row consists of `title` and then `stat`, right-aligned and left-filled with `.`
/// to a width of `MAX_PADDING` minus the number of `char`s in `title`.
///
/// If `title` is longer than `MAX_PADDING` the width saturates at zero, so the number
/// is emitted without padding instead of underflowing the subtraction.
fn write_stat(f: &mut fmt::Formatter, title: &str, stat: usize) -> fmt::Result {
    let fill = title.chars().count();
    // `usize` subtraction would panic (debug) or wrap (release) for over-long titles.
    let width = MAX_PADDING.saturating_sub(fill);
    f.write_str(title)?;
    f.write_str(&stat.to_string().pad(width, '.', Alignment::Right, false))?;
    f.write_str("\n")
}
impl Display for ResponseStats {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let separator = "-".repeat(MAX_PADDING);
writeln!(f, "📝 Summary")?;
writeln!(f, "-------------------")?;
writeln!(f, "🔍 Total: {}", self.total)?;
writeln!(f, "✅ Successful: {}", self.successful)?;
writeln!(f, "⏳ Timeouts: {}", self.timeouts.len())?;
writeln!(f, "🔀 Redirected: {}", self.redirects.len())?;
writeln!(f, "👻 Excluded: {}", self.excludes.len())?;
writeln!(f, "🚫 Errors: {}", self.errors.len() + self.failures.len())
writeln!(f, "{}", separator)?;
write_stat(f, "🔍 Total", self.total)?;
write_stat(f, "✅ Successful", self.successful)?;
write_stat(f, "⏳ Timeouts", self.timeouts)?;
write_stat(f, "🔀 Redirected", self.redirects)?;
write_stat(f, "👻 Excluded", self.excludes)?;
write_stat(f, "🚫 Errors", self.errors + self.failures)?;
if !&self.fail_map.is_empty() {
writeln!(f)?;
}
for (input, responses) in &self.fail_map {
writeln!(f, "Input: {}", input)?;
for response in responses {
writeln!(
f,
" {} {}\n {}",
response.status.icon(),
response.uri,
response.status
)?
}
}
writeln!(f)
}
}
#[cfg(test)]
mod test_super {
    use lychee::{test_utils::website, Status};

    use super::*;

    /// Builds a response for `url` with the given `status`, attributed to stdin.
    fn stdin_response(url: &str, status: Status) -> Response {
        Response {
            uri: website(url),
            status,
            source: Input::Stdin,
        }
    }

    /// Only failed and redirected responses must end up in `fail_map`,
    /// grouped under the input they came from.
    #[test]
    fn test_stats() {
        let mut stats = ResponseStats::new();
        stats.add(stdin_response(
            "http://example.com/ok",
            Status::Ok(http::StatusCode::OK),
        ));
        stats.add(stdin_response(
            "http://example.com/failed",
            Status::Failed(http::StatusCode::BAD_GATEWAY),
        ));
        stats.add(stdin_response(
            "http://example.com/redirect",
            Status::Redirected(http::StatusCode::PERMANENT_REDIRECT),
        ));

        let mut expected_failures = HashSet::new();
        expected_failures.insert(stdin_response(
            "http://example.com/failed",
            Status::Failed(http::StatusCode::BAD_GATEWAY),
        ));
        expected_failures.insert(stdin_response(
            "http://example.com/redirect",
            Status::Redirected(http::StatusCode::PERMANENT_REDIRECT),
        ));

        let mut expected_map = HashMap::new();
        expected_map.insert(Input::Stdin, expected_failures);

        assert_eq!(stats.fail_map, expected_map);
    }
}

View file

@ -10,9 +10,9 @@ use std::{collections::HashSet, time::Duration};
use tokio::time::sleep;
use url::Url;
use crate::excludes::Excludes;
use crate::types::{Response, Status};
use crate::uri::Uri;
use crate::{excludes::Excludes, Request};
const VERSION: &str = env!("CARGO_PKG_VERSION");
const DEFAULT_MAX_REDIRECTS: usize = 5;
@ -153,6 +153,52 @@ impl ClientBuilder {
}
impl Client {
pub async fn check(&self, request: Request) -> Response {
if self.excluded(&request) {
return Response::new(request.uri, Status::Excluded, request.source);
}
let status = match request.uri {
Uri::Website(ref url) => self.check_website(&url).await,
Uri::Mail(ref address) => {
let valid = self.valid_mail(&address).await;
if valid {
// TODO: We should not be using a HTTP status code for mail
Status::Ok(http::StatusCode::OK)
} else {
Status::Error(format!("Invalid mail address: {}", address))
}
}
};
Response::new(request.uri, status, request.source)
}
/// Checks a website URL, retrying failed attempts with exponential backoff.
///
/// The first successful attempt is returned immediately. After the initial attempt
/// up to three retries are made, sleeping 1, 2 and 4 seconds in between. If all
/// attempts fail and the URL is recognised by `extract_github`, the result of
/// `check_github` is returned instead of the last failure.
pub async fn check_website(&self, url: &Url) -> Status {
    let mut retries_left: i64 = 3;
    let mut backoff_secs: u64 = 1;

    let last_status = loop {
        let attempt = self.check_normal(&url).await;
        if attempt.is_success() {
            return attempt;
        }
        if retries_left <= 0 {
            break attempt;
        }
        retries_left -= 1;
        sleep(Duration::from_secs(backoff_secs)).await;
        backoff_secs *= 2;
    };

    // Pull out the heavy weapons in case of a failed normal request.
    // This could be a GitHub URL for which we ran into the rate limiter.
    match self.extract_github(url.as_str()) {
        Ok((owner, repo)) => self.check_github(owner, repo).await,
        Err(_) => last_status,
    }
}
async fn check_github(&self, owner: String, repo: String) -> Status {
match &self.github {
Some(github) => {
@ -189,33 +235,6 @@ impl Client {
Ok((owner.as_str().into(), repo.as_str().into()))
}
pub async fn check_real(&self, url: &Url) -> Status {
let mut retries: i64 = 3;
let mut wait: u64 = 1;
let status = loop {
let res = self.check_normal(&url).await;
match res.is_success() {
true => return res,
false => {
if retries > 0 {
retries -= 1;
sleep(Duration::from_secs(wait)).await;
wait *= 2;
} else {
break res;
}
}
}
};
// Pull out the heavy weapons in case of a failed normal request.
// This could be a Github URL and we run into the rate limiter.
if let Ok((owner, repo)) = self.extract_github(url.as_str()) {
return self.check_github(owner, repo).await;
}
status
}
pub async fn valid_mail(&self, address: &str) -> bool {
let input = CheckEmailInput::new(vec![address.to_string()]);
let results = check_email(&input).await;
@ -269,9 +288,9 @@ impl Client {
self.excludes.mail
}
pub fn excluded(&self, uri: &Uri) -> bool {
pub fn excluded(&self, request: &Request) -> bool {
if let Some(includes) = &self.includes {
if includes.is_match(uri.as_str()) {
if includes.is_match(request.uri.as_str()) {
// Includes take precedence over excludes
return false;
} else {
@ -282,43 +301,26 @@ impl Client {
}
}
}
if self.in_regex_excludes(uri.as_str()) {
if self.in_regex_excludes(request.uri.as_str()) {
return true;
}
if matches!(uri, Uri::Mail(_)) {
if matches!(request.uri, Uri::Mail(_)) {
return self.is_mail_excluded();
}
if self.in_ip_excludes(&uri) {
if self.in_ip_excludes(&request.uri) {
return true;
}
if self.scheme.is_none() {
return false;
}
uri.scheme() != self.scheme
}
pub async fn check(&self, uri: Uri) -> Response {
if self.excluded(&uri) {
return Response::new(uri, Status::Excluded);
}
let status = match uri {
Uri::Website(ref url) => self.check_real(&url).await,
Uri::Mail(ref address) => {
let valid = self.valid_mail(&address).await;
if valid {
// TODO: We should not be using a HTTP status code for mail
Status::Ok(http::StatusCode::OK)
} else {
Status::Error(format!("Invalid mail address: {}", address))
}
}
};
Response::new(uri, status)
request.uri.scheme() != self.scheme
}
}
#[cfg(test)]
mod test {
use crate::collector::Input;
use super::*;
use http::StatusCode;
use std::time::{Duration, Instant};
@ -345,8 +347,11 @@ mod test {
const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";
fn website_url(s: &str) -> Uri {
Uri::Website(Url::parse(s).expect("Expected valid Website URI"))
fn website_url(s: &str) -> Request {
Request::new(
Uri::Website(Url::parse(s).expect("Expected valid Website URI")),
Input::Stdin,
)
}
#[tokio::test]
@ -507,7 +512,7 @@ mod test {
.unwrap();
let resp = client.check(website_url(&mock_server.uri())).await;
assert!(matches!(resp.status, Status::Timeout));
assert!(matches!(resp.status, Status::Timeout(_)));
}
#[tokio::test]
@ -558,11 +563,17 @@ mod test {
assert_eq!(client.excluded(&website_url("http://github.com")), true);
assert_eq!(client.excluded(&website_url("http://exclude.org")), true);
assert_eq!(
client.excluded(&Uri::Mail("mail@example.com".to_string())),
client.excluded(&Request::new(
Uri::Mail("mail@example.com".to_string()),
Input::Stdin,
)),
true
);
assert_eq!(
client.excluded(&Uri::Mail("foo@bar.dev".to_string())),
client.excluded(&Request::new(
Uri::Mail("foo@bar.dev".to_string()),
Input::Stdin,
)),
false
);
}

View file

@ -2,19 +2,18 @@ use client::Client;
use deadpool::unmanaged::Pool;
use tokio::sync::mpsc;
use crate::uri;
use crate::{client, types};
pub struct ClientPool {
tx: mpsc::Sender<types::Response>,
rx: mpsc::Receiver<uri::Uri>,
rx: mpsc::Receiver<types::Request>,
pool: deadpool::unmanaged::Pool<client::Client>,
}
impl ClientPool {
pub fn new(
tx: mpsc::Sender<types::Response>,
rx: mpsc::Receiver<uri::Uri>,
rx: mpsc::Receiver<types::Request>,
clients: Vec<Client>,
) -> Self {
let pool = Pool::from(clients);

View file

@ -1,18 +1,21 @@
use crate::extract::{extract_links, FileType};
use crate::uri::Uri;
use crate::{
extract::{extract_links, FileType},
Request,
};
use anyhow::{anyhow, Context, Result};
use glob::glob_with;
use reqwest::Url;
use serde::Serialize;
use shellexpand::tilde;
use std::collections::HashSet;
use std::path::Path;
use std::path::PathBuf;
use std::{collections::HashSet, fmt::Display};
use tokio::fs::read_to_string;
use tokio::io::{stdin, AsyncReadExt};
const STDIN: &str = "-";
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum Input {
RemoteUrl(Url),
@ -22,6 +25,40 @@ pub enum Input {
String(String),
}
impl Serialize for Input {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
/// Short, human-readable label for an input.
///
/// URLs, glob patterns and file paths are printed as-is; standard input is shown as
/// `stdin` and inline strings as `raw input string` (their content is not printed).
impl Display for Input {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Input::RemoteUrl(url) => write!(f, "{}", url),
            Input::FsGlob { pattern, .. } => write!(f, "{}", pattern),
            // `display()` renders non-UTF-8 paths lossily instead of collapsing them
            // to an empty string, which would make distinct inputs indistinguishable.
            Input::FsPath(path) => write!(f, "{}", path.display()),
            Input::Stdin => write!(f, "stdin"),
            Input::String(_) => write!(f, "raw input string"),
        }
    }
}
#[derive(Debug)]
pub struct InputContent {
pub input: Input,
@ -157,18 +194,6 @@ impl Input {
}
}
impl ToString for Input {
fn to_string(&self) -> String {
match self {
Self::RemoteUrl(url) => url.to_string(),
Self::FsGlob { pattern, .. } => pattern.clone(),
Self::FsPath(p) => p.to_str().unwrap_or_default().to_owned(),
Self::Stdin => STDIN.to_owned(),
Self::String(s) => s.clone(),
}
}
}
/// Fetch all unique links from a slice of inputs
/// All relative URLs get prefixed with `base_url` if given.
pub async fn collect_links(
@ -176,7 +201,7 @@ pub async fn collect_links(
base_url: Option<String>,
skip_missing_inputs: bool,
max_concurrency: usize,
) -> Result<HashSet<Uri>> {
) -> Result<HashSet<Request>> {
let base_url = match base_url {
Some(url) => Some(Url::parse(&url)?),
_ => None,
@ -213,7 +238,7 @@ pub async fn collect_links(
// instead of building a HashSet with all links.
// This optimization would speed up cases where there are
// a lot of inputs and/or the inputs are large (e.g. big files).
let mut collected_links = HashSet::new();
let mut collected_links: HashSet<Request> = HashSet::new();
for handle in extract_links_handles {
let links = handle.await?;
@ -226,7 +251,10 @@ pub async fn collect_links(
#[cfg(test)]
mod test {
use super::*;
use crate::test_utils::get_mock_server_with_content;
use crate::{
test_utils::{get_mock_server_with_content, website},
Uri,
};
use std::fs::File;
use std::io::Write;
use std::str::FromStr;
@ -264,13 +292,17 @@ mod test {
},
];
let links = collect_links(&inputs, None, false, 8).await?;
let responses = collect_links(&inputs, None, false, 8).await?;
let links = responses
.into_iter()
.map(|r| r.uri)
.collect::<HashSet<Uri>>();
let mut expected_links = HashSet::new();
expected_links.insert(Uri::Website(Url::from_str(TEST_STRING)?));
expected_links.insert(Uri::Website(Url::from_str(TEST_URL)?));
expected_links.insert(Uri::Website(Url::from_str(TEST_FILE)?));
expected_links.insert(Uri::Website(Url::from_str(TEST_GLOB_1)?));
let mut expected_links: HashSet<Uri> = HashSet::new();
expected_links.insert(website(TEST_STRING));
expected_links.insert(website(TEST_URL));
expected_links.insert(website(TEST_FILE));
expected_links.insert(website(TEST_GLOB_1));
expected_links.insert(Uri::Mail(TEST_GLOB_2_MAIL.to_string()));
assert_eq!(links, expected_links);

View file

@ -1,5 +1,5 @@
use crate::collector::InputContent;
use crate::uri::Uri;
use crate::{collector::InputContent, Request};
use html5ever::parse_document;
use html5ever::tendril::{StrTendril, TendrilSink};
use linkify::LinkFinder;
@ -141,7 +141,10 @@ fn extract_links_from_plaintext(input: &str) -> Vec<String> {
.collect()
}
pub(crate) fn extract_links(input_content: &InputContent, base_url: Option<Url>) -> HashSet<Uri> {
pub(crate) fn extract_links(
input_content: &InputContent,
base_url: Option<Url>,
) -> HashSet<Request> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::HTML => extract_links_from_html(&input_content.content),
@ -150,28 +153,33 @@ pub(crate) fn extract_links(input_content: &InputContent, base_url: Option<Url>)
// Only keep legit URLs. This sorts out things like anchors.
// Silently ignore the parse failures for now.
let mut uris = HashSet::new();
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
match Uri::try_from(link.as_str()) {
Ok(uri) => {
uris.insert(uri);
requests.insert(Request::new(uri, input_content.input.clone()));
}
Err(_) => {
if !Path::new(&link).exists() {
if let Some(base_url) = &base_url {
if let Ok(new_url) = base_url.join(&link) {
uris.insert(Uri::Website(new_url));
requests.insert(Request::new(
Uri::Website(new_url),
input_content.input.clone(),
));
}
}
}
}
};
}
uris
requests
}
#[cfg(test)]
mod test {
use crate::test_utils::website;
use super::*;
use std::fs::File;
use std::io::{BufReader, Read};
@ -197,17 +205,18 @@ mod test {
#[test]
fn test_extract_markdown_links() {
let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)";
let links = extract_links(
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(input, FileType::Markdown),
Some(Url::parse("https://github.com/hello-rust/lychee/").unwrap()),
);
)
.into_iter()
.map(|r| r.uri)
.collect();
assert_eq!(
links,
[
Uri::Website(Url::parse("https://endler.dev").unwrap()),
Uri::Website(
Url::parse("https://github.com/hello-rust/lychee/relative_link").unwrap()
)
website("https://endler.dev"),
website("https://github.com/hello-rust/lychee/relative_link"),
]
.iter()
.cloned()
@ -219,23 +228,28 @@ mod test {
fn test_extract_html_links() {
let input = r#"<html>
<div class="row">
<a href="https://github.com/hello-rust/lychee/">
<a href="https://github.com/lycheeverse/lychee/">
<a href="blob/master/README.md">README</a>
</div>
</html>"#;
let links = extract_links(
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(input, FileType::HTML),
Some(Url::parse("https://github.com/hello-rust/").unwrap()),
);
Some(Url::parse("https://github.com/lycheeverse/").unwrap()),
)
.into_iter()
.map(|r| r.uri)
.collect();
assert_eq!(
links
.get(&Uri::Website(
Url::parse("https://github.com/hello-rust/blob/master/README.md").unwrap()
))
.is_some(),
true
links,
[
website("https://github.com/lycheeverse/lychee/"),
website("https://github.com/lycheeverse/blob/master/README.md"),
]
.iter()
.cloned()
.collect::<HashSet<Uri>>(),
);
}
@ -257,15 +271,21 @@ mod test {
fn test_non_markdown_links() {
let input =
"https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
let links = extract_links(&InputContent::from_string(input, FileType::Plaintext), None);
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(input, FileType::Plaintext), None)
.into_iter()
.map(|r| r.uri)
.collect();
let expected = [
Uri::Website(Url::parse("https://endler.dev").unwrap()),
Uri::Website(Url::parse("https://hello-rust.show/foo/bar?lol=1").unwrap()),
website("https://endler.dev"),
website("https://hello-rust.show/foo/bar?lol=1"),
Uri::Mail("test@example.com".to_string()),
]
.iter()
.cloned()
.collect();
assert_eq!(links, expected)
}
@ -284,14 +304,18 @@ mod test {
#[test]
fn test_extract_html5_not_valid_xml() {
let input = load_fixture("TEST_HTML5.html");
let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
.into_iter()
.map(|r| r.uri)
.collect();
let expected_links = [
Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
website("https://example.com/head/home"),
website("https://example.com/css/style_full_url.css"),
// the body links wouldn't be present if the file was parsed strictly as XML
Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
website("https://example.com/body/a"),
website("https://example.com/body/div_empty_a"),
]
.iter()
.cloned()
@ -303,20 +327,23 @@ mod test {
#[test]
fn test_extract_html5_not_valid_xml_relative_links() {
let input = load_fixture("TEST_HTML5.html");
let links = extract_links(
let links: HashSet<Uri> = extract_links(
&InputContent::from_string(&input, FileType::HTML),
Some(Url::parse("https://example.com").unwrap()),
);
)
.into_iter()
.map(|r| r.uri)
.collect();
let expected_links = [
Uri::Website(Url::parse("https://example.com/head/home").unwrap()),
Uri::Website(Url::parse("https://example.com/images/icon.png").unwrap()),
Uri::Website(Url::parse("https://example.com/css/style_relative_url.css").unwrap()),
Uri::Website(Url::parse("https://example.com/css/style_full_url.css").unwrap()),
Uri::Website(Url::parse("https://example.com/js/script.js").unwrap()),
website("https://example.com/head/home"),
website("https://example.com/images/icon.png"),
website("https://example.com/css/style_relative_url.css"),
website("https://example.com/css/style_full_url.css"),
website("https://example.com/js/script.js"),
// the body links wouldn't be present if the file was parsed strictly as XML
Uri::Website(Url::parse("https://example.com/body/a").unwrap()),
Uri::Website(Url::parse("https://example.com/body/div_empty_a").unwrap()),
website("https://example.com/body/a"),
website("https://example.com/body/div_empty_a"),
]
.iter()
.cloned()
@ -329,14 +356,16 @@ mod test {
fn test_extract_html5_lowercase_doctype() {
// this has been problematic with previous XML based parser
let input = load_fixture("TEST_HTML5_LOWERCASE_DOCTYPE.html");
let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
.into_iter()
.map(|r| r.uri)
.collect();
let expected_links = [Uri::Website(
Url::parse("https://example.com/body/a").unwrap(),
)]
.iter()
.cloned()
.collect();
let expected_links = [website("https://example.com/body/a")]
.iter()
.cloned()
.collect();
assert_eq!(links, expected_links);
}
@ -345,14 +374,18 @@ mod test {
fn test_extract_html5_minified() {
// minified HTML with some quirky elements such as href attribute values specified without quotes
let input = load_fixture("TEST_HTML5_MINIFIED.html");
let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
.into_iter()
.map(|r| r.uri)
.collect();
let expected_links = [
Uri::Website(Url::parse("https://example.com/").unwrap()),
Uri::Website(Url::parse("https://example.com/favicon.ico").unwrap()),
Uri::Website(Url::parse("https://fonts.externalsite.com").unwrap()),
Uri::Website(Url::parse("https://example.com/docs/").unwrap()),
Uri::Website(Url::parse("https://example.com/forum").unwrap()),
website("https://example.com/"),
website("https://example.com/favicon.ico"),
website("https://fonts.externalsite.com"),
website("https://example.com/docs/"),
website("https://example.com/forum"),
]
.iter()
.cloned()
@ -365,7 +398,11 @@ mod test {
fn test_extract_html5_malformed() {
// malformed links shouldn't stop the parser from further parsing
let input = load_fixture("TEST_HTML5_MALFORMED_LINKS.html");
let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
.into_iter()
.map(|r| r.uri)
.collect();
let expected_links = [Uri::Website(
Url::parse("https://example.com/valid").unwrap(),
@ -381,13 +418,17 @@ mod test {
fn test_extract_html5_custom_elements() {
// the element name shouldn't matter for attributes like href, src, cite etc
let input = load_fixture("TEST_HTML5_CUSTOM_ELEMENTS.html");
let links = extract_links(&InputContent::from_string(&input, FileType::HTML), None);
let links: HashSet<Uri> =
extract_links(&InputContent::from_string(&input, FileType::HTML), None)
.into_iter()
.map(|r| r.uri)
.collect();
let expected_links = [
Uri::Website(Url::parse("https://example.com/some-weird-element").unwrap()),
Uri::Website(Url::parse("https://example.com/even-weirder-src").unwrap()),
Uri::Website(Url::parse("https://example.com/even-weirder-href").unwrap()),
Uri::Website(Url::parse("https://example.com/citations").unwrap()),
website("https://example.com/some-weird-element"),
website("https://example.com/even-weirder-src"),
website("https://example.com/even-weirder-href"),
website("https://example.com/citations"),
]
.iter()
.cloned()

View file

@ -8,7 +8,7 @@
* "Hello world" example:
* ```
*
* use lychee::{ClientBuilder, Status};
* use lychee::{Request, Input, ClientBuilder, Status};
* use lychee::Uri::Website;
* use url::Url;
* use std::error::Error;
@ -17,7 +17,7 @@
* async fn main() -> Result<(), Box<dyn Error>> {
* let client = ClientBuilder::default().build()?;
* let url = Url::parse("https://github.com/lycheeverse/lychee")?;
* let response = client.check(Website(url)).await;
* let response = client.check(Request::new(Website(url), Input::Stdin)).await;
* assert!(matches!(response.status, Status::Ok(_)));
* Ok(())
* }
@ -35,6 +35,7 @@ pub mod test_utils;
pub use client::ClientBuilder;
pub use client_pool::ClientPool;
pub use collector::Input;
pub use excludes::Excludes;
pub use types::*;
pub use uri::Uri;

View file

@ -1,7 +1,10 @@
use http::StatusCode;
use reqwest::Url;
use wiremock::matchers::path;
use wiremock::{Mock, MockServer, ResponseTemplate};
use crate::Uri;
#[allow(unused)]
pub async fn get_mock_server<S>(response_code: S) -> MockServer
where
@ -30,3 +33,7 @@ where
mock_server
}
/// Test helper that parses `url` and wraps it in `Uri::Website`.
///
/// # Panics
///
/// Panics if `url` cannot be parsed as a URL.
pub fn website(url: &str) -> Uri {
    Uri::Website(Url::parse(url).expect("Expected valid Website URI"))
}

View file

@ -1,6 +1,25 @@
use crate::uri::Uri;
use crate::{collector::Input, uri::Uri};
use anyhow::anyhow;
use std::{collections::HashSet, convert::TryFrom};
use serde::{Serialize, Serializer};
use std::{collections::HashSet, convert::TryFrom, fmt::Display};
/// A URI to check, paired with the input it was collected from.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub struct Request {
    /// The URI that should be checked.
    pub uri: Uri,
    /// The input in which `uri` was found.
    pub source: Input,
}
impl Request {
pub fn new(uri: Uri, source: Input) -> Self {
Request { uri, source }
}
}
impl Display for Request {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} ({})", self.uri, self.source)
}
}
/// Specifies how requests to websites will be made
pub(crate) enum RequestMethod {
@ -19,35 +38,80 @@ impl TryFrom<String> for RequestMethod {
}
}
#[derive(Debug)]
#[derive(Debug, PartialEq, Eq, Hash, Serialize)]
pub struct Response {
#[serde(flatten)]
pub uri: Uri,
pub status: Status,
#[serde(skip)]
pub source: Input,
}
impl Response {
pub fn new(uri: Uri, status: Status) -> Self {
Response { uri, status }
pub fn new(uri: Uri, status: Status, source: Input) -> Self {
Response {
uri,
status,
source,
}
}
}
/// Formats a response as `<icon> <uri><metadata>`.
///
/// The metadata suffix is ` [<code>]` when the status carries an HTTP status code,
/// ` (<message>)` for errors, and empty otherwise.
impl Display for Response {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let metadata = match &self.status {
            Status::Ok(code)
            | Status::Redirected(code)
            | Status::Failed(code)
            | Status::Timeout(Some(code)) => format!(" [{}]", code),
            Status::Error(e) => format!(" ({})", e),
            // Listed explicitly so that adding a variant forces a decision here.
            Status::Timeout(None) | Status::Excluded => String::new(),
        };
        write!(f, "{} {}{}", self.status.icon(), self.uri, metadata)
    }
}
/// Response status of the request
#[derive(Debug)]
#[derive(Debug, Hash, PartialEq, Eq)]
pub enum Status {
/// Request was successful
Ok(http::StatusCode),
/// Request failed with HTTP error code
Failed(http::StatusCode),
/// Request timed out; a status code is attached only when one is available
Timeout,
Timeout(Option<http::StatusCode>),
/// Got redirected to different resource; carries the redirection status code
Redirected,
Redirected(http::StatusCode),
/// Resource was excluded from checking
Excluded,
/// Low-level error while loading resource, kept as the error's message text
Error(String),
}
impl Display for Status {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let out = match self {
Status::Ok(c) => format!("OK ({})", c),
Status::Redirected(c) => format!("Redirect ({})", c),
Status::Excluded => "Excluded".to_string(),
Status::Failed(c) => format!("Failed ({})", c),
Status::Error(e) => format!("Runtime error ({})", e),
Status::Timeout(Some(c)) => format!("Timeout ({})", c),
Status::Timeout(None) => "Timeout".to_string(),
};
write!(f, "{}", out)
}
}
impl Serialize for Status {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.collect_str(self)
}
}
impl Status {
pub fn new(statuscode: http::StatusCode, accepted: Option<HashSet<http::StatusCode>>) -> Self {
if let Some(true) = accepted.map(|a| a.contains(&statuscode)) {
@ -55,7 +119,7 @@ impl Status {
} else if statuscode.is_success() {
Status::Ok(statuscode)
} else if statuscode.is_redirection() {
Status::Redirected
Status::Redirected(statuscode)
} else {
Status::Failed(statuscode)
}
@ -64,12 +128,27 @@ impl Status {
/// Returns `true` only when this status is the `Ok` variant.
pub fn is_success(&self) -> bool {
    matches!(*self, Status::Ok(_))
}
/// Returns `true` only when this status is the `Excluded` variant.
pub fn is_excluded(&self) -> bool {
    matches!(*self, Status::Excluded)
}
/// Returns a short symbol for this status; `Display for Response` prints it
/// in front of the URI.
// NOTE(review): the `Ok`, `Error` and `Timeout` arms return an empty string
// here — confirm this is intended and that no glyphs were lost.
pub fn icon(&self) -> &str {
match self {
Status::Ok(_) => "",
Status::Redirected(_) => "🔀️",
Status::Excluded => "👻",
Status::Failed(_) => "🚫",
Status::Error(_) => "",
Status::Timeout(_) => "",
}
}
}
impl From<reqwest::Error> for Status {
fn from(e: reqwest::Error) -> Self {
if e.is_timeout() {
Status::Timeout
Status::Timeout(e.status())
} else {
Status::Error(e.to_string())
}
@ -78,22 +157,19 @@ impl From<reqwest::Error> for Status {
#[cfg(test)]
mod test {
use super::*;
use crate::test_utils::website;
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
use url::Url;
// A website URI whose host is an IPv4 literal must report that address
// from `host_ip`.
#[test]
fn test_uri_host_ip_v4() {
let uri =
Uri::Website(Url::parse("http://127.0.0.1").expect("Expected URI with valid IPv4"));
let uri = website("http://127.0.0.1");
let ip = uri.host_ip().expect("Expected a valid IPv4");
assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)));
}
#[test]
fn test_uri_host_ip_v6() {
let uri =
Uri::Website(Url::parse("https://[2020::0010]").expect("Expected URI with valid IPv6"));
let uri = website("https://[2020::0010]");
let ip = uri.host_ip().expect("Expected a valid IPv6");
assert_eq!(
ip,
@ -103,7 +179,7 @@ mod test {
// A website URI whose host is a domain name (not an IP literal) must yield
// `None` from `host_ip`.
#[test]
fn test_uri_host_ip_no_ip() {
let uri = Uri::Website(Url::parse("https://some.cryptic/url").expect("Expected valid URI"));
let uri = website("https://some.cryptic/url");
let ip = uri.host_ip();
assert!(ip.is_none());
}

View file

@ -65,7 +65,7 @@ impl Display for Uri {
#[cfg(test)]
mod test {
use reqwest::Url;
use crate::test_utils::website;
use super::*;
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
@ -75,7 +75,7 @@ mod test {
assert!(matches!(Uri::try_from(""), Err(_)));
assert_eq!(
Uri::try_from("http://example.com").unwrap(),
Uri::Website(url::Url::parse("http://example.com").unwrap())
website("http://example.com")
);
assert_eq!(
Uri::try_from("mail@example.com").unwrap(),
@ -89,16 +89,14 @@ mod test {
// A website URI whose host is an IPv4 literal must report that address
// from `host_ip`.
#[test]
fn test_uri_host_ip_v4() {
let uri =
Uri::Website(Url::parse("http://127.0.0.1").expect("Expected URI with valid IPv4"));
let uri = website("http://127.0.0.1");
let ip = uri.host_ip().expect("Expected a valid IPv4");
assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)));
}
#[test]
fn test_uri_host_ip_v6() {
let uri =
Uri::Website(Url::parse("https://[2020::0010]").expect("Expected URI with valid IPv6"));
let uri = website("https://[2020::0010]");
let ip = uri.host_ip().expect("Expected a valid IPv6");
assert_eq!(
ip,
@ -108,15 +106,14 @@ mod test {
// A website URI whose host is a domain name (not an IP literal) must yield
// `None` from `host_ip`.
#[test]
fn test_uri_host_ip_no_ip() {
let uri = Uri::Website(Url::parse("https://some.cryptic/url").expect("Expected valid URI"));
let uri = website("https://some.cryptic/url");
let ip = uri.host_ip();
assert!(ip.is_none());
}
// NOTE(review): despite its name, this test parses an IPv4 website URL and
// repeats the assertions of `test_uri_host_ip_v4`; no mail address is
// exercised. Presumably it was meant to cover a mail URI — confirm and adjust.
#[test]
fn test_mail() {
let uri =
Uri::Website(Url::parse("http://127.0.0.1").expect("Expected URI with valid IPv4"));
let uri = website("http://127.0.0.1");
let ip = uri.host_ip().expect("Expected a valid IPv4");
assert_eq!(ip, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)));
}

View file

@ -30,10 +30,10 @@ mod cli {
.arg(test_all_private_path)
.assert()
.success()
.stdout(contains("Total: 7"))
.stdout(contains("Excluded: 7"))
.stdout(contains("Successful: 0"))
.stdout(contains("Errors: 0"));
.stdout(contains("Total............7"))
.stdout(contains("Excluded.........7"))
.stdout(contains("Successful.......0"))
.stdout(contains("Errors...........0"));
}
/// Test that a GitHub link can be checked without specifying the token.
@ -46,10 +46,10 @@ mod cli {
.arg(test_github_path)
.assert()
.success()
.stdout(contains("Total: 1"))
.stdout(contains("Excluded: 0"))
.stdout(contains("Successful: 1"))
.stdout(contains("Errors: 0"));
.stdout(contains("Total............1"))
.stdout(contains("Excluded.........0"))
.stdout(contains("Successful.......1"))
.stdout(contains("Errors...........0"));
}
#[tokio::test]
@ -164,7 +164,7 @@ mod cli {
.arg("--verbose")
.assert()
.success()
.stdout(contains("Total: 2"));
.stdout(contains("Total............2"));
Ok(())
}
@ -188,7 +188,7 @@ mod cli {
.arg("--glob-ignore-case")
.assert()
.success()
.stdout(contains("Total: 2"));
.stdout(contains("Total............2"));
Ok(())
}
@ -211,7 +211,7 @@ mod cli {
.arg("--verbose")
.assert()
.success()
.stdout(contains("Total: 1"));
.stdout(contains("Total............1"));
Ok(())
}
@ -231,9 +231,9 @@ mod cli {
.assert()
.success();
let expected = r##"{"total":11,"successful":11,"failures":[],"timeouts":[],"redirects":[],"excludes":[],"errors":[]}"##;
let expected = r##"{"total":11,"successful":11,"failures":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"##;
let output = fs::read_to_string(&outfile)?;
assert_eq!(output, expected);
assert_eq!(output.split_whitespace().collect::<String>(), expected);
fs::remove_file(outfile)?;
Ok(())
}