diff --git a/Cargo.toml b/Cargo.toml index d38ad69..ea8dfee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ linkify = "*" log = "0.4" pretty_env_logger = "0.4" pulldown-cmark = "0.7.2" -regex = "1.3.9" +regex = "*" serde_json = "1.0.56" url = "2.1.1" diff --git a/README.md b/README.md index 34a25eb..da317b9 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,10 @@ lychee can... - handle gzip compression - fake user agents (required for some firewalls) - skip non-links like anchors or relative URLs +- exclude some websites with regular expressions - SOON: automatically retry and backoff - SOON: optionally ignore SSL certificate errors - SOON: optionally handle redirects -- SOON: exclude some websites with regular expressions ## How? diff --git a/src/checker.rs b/src/checker.rs index c186b74..852f01d 100644 --- a/src/checker.rs +++ b/src/checker.rs @@ -1,7 +1,7 @@ use anyhow::{Context, Result}; use github_rs::client::{Executor, Github}; use github_rs::StatusCode; -use regex::Regex; +use regex::{Regex, RegexSet}; use reqwest::header::{self, HeaderValue}; use serde_json::Value; use url::Url; @@ -11,12 +11,13 @@ use url::Url; pub(crate) struct Checker { reqwest_client: reqwest::Client, gh_client: Github, + excludes: RegexSet, verbose: bool, } impl Checker { /// Creates a new link checker - pub fn try_new(token: String, verbose: bool) -> Result { + pub fn try_new(token: String, excludes: RegexSet, verbose: bool) -> Result { let mut headers = header::HeaderMap::new(); // Faking the user agent is necessary for some websites, unfortunately. // Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com). @@ -32,6 +33,7 @@ impl Checker { Ok(Checker { reqwest_client, gh_client, + excludes, verbose, }) } @@ -89,6 +91,12 @@ impl Checker { } pub async fn check(&self, url: &Url) -> bool { + // TODO: Indicate that the URL was skipped in the return value. + // (Perhaps we want to return an enum value here: Status::Skipped) + if self.excludes.is_match(url.as_str()) { + return true; + } + let ret = self.check_real(&url).await; match ret { true => { diff --git a/src/main.rs b/src/main.rs index b12afe1..b4002a5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ extern crate log; use anyhow::Result; +use regex::RegexSet; use std::env; use std::fs; @@ -27,7 +28,7 @@ struct LycheeOptions { // Accumulate all exclusions in a vector #[options(help = "Exclude URLs from checking (supports regex)")] - excludes: Vec, + exclude: Vec, } #[tokio::main] @@ -36,7 +37,9 @@ async fn main() -> Result<()> { let opts = LycheeOptions::parse_args_default_or_exit(); - let checker = Checker::try_new(env::var("GITHUB_TOKEN")?, opts.verbose)?; + let excludes = RegexSet::new(opts.exclude).unwrap(); + + let checker = Checker::try_new(env::var("GITHUB_TOKEN")?, excludes, opts.verbose)?; let md = fs::read_to_string(opts.input.unwrap_or_else(|| "README.md".into()))?; let links = extract_links(&md);