Add support for excluding URLs

This commit is contained in:
Matthias Endler 2020-08-11 22:48:50 +02:00
parent 1d235b578b
commit d4a3b09790
4 changed files with 17 additions and 6 deletions

View file

@ -18,7 +18,7 @@ linkify = "*"
log = "0.4"
pretty_env_logger = "0.4"
pulldown-cmark = "0.7.2"
regex = "1.3.9"
regex = "*"
serde_json = "1.0.56"
url = "2.1.1"

View file

@ -22,10 +22,10 @@ lychee can...
- handle gzip compression
- fake user agents (required for some firewalls)
- skip non-links like anchors or relative URLs
- exclude some websites with regular expressions
- SOON: automatically retry and backoff
- SOON: optionally ignore SSL certificate errors
- SOON: optionally handle redirects
- SOON: exclude some websites with regular expressions
## How?

View file

@ -1,7 +1,7 @@
use anyhow::{Context, Result};
use github_rs::client::{Executor, Github};
use github_rs::StatusCode;
use regex::Regex;
use regex::{Regex, RegexSet};
use reqwest::header::{self, HeaderValue};
use serde_json::Value;
use url::Url;
@ -11,12 +11,13 @@ use url::Url;
/// Link checker: bundles the clients and settings that `try_new` assembles
/// and that `check` uses when validating a URL.
pub(crate) struct Checker {
/// `reqwest` HTTP client stored by `try_new`.
reqwest_client: reqwest::Client,
/// `github_rs` client stored by `try_new`.
gh_client: Github,
/// Set of regular expressions for URLs to exclude; `check` returns `true`
/// for any URL matching one of them without performing the real check.
excludes: RegexSet,
/// Verbosity flag supplied by the caller of `try_new`.
verbose: bool,
}
impl Checker {
/// Creates a new link checker
pub fn try_new(token: String, verbose: bool) -> Result<Self> {
pub fn try_new(token: String, excludes: RegexSet, verbose: bool) -> Result<Self> {
let mut headers = header::HeaderMap::new();
// Faking the user agent is necessary for some websites, unfortunately.
// Otherwise we get a 403 from the firewall (e.g. Sucuri/Cloudproxy on ldra.com).
@ -32,6 +33,7 @@ impl Checker {
Ok(Checker {
reqwest_client,
gh_client,
excludes,
verbose,
})
}
@ -89,6 +91,12 @@ impl Checker {
}
pub async fn check(&self, url: &Url) -> bool {
// TODO: Indicate that the URL was skipped in the return value.
// (Perhaps we want to return an enum value here: Status::Skipped)
if self.excludes.is_match(url.as_str()) {
return true;
}
let ret = self.check_real(&url).await;
match ret {
true => {

View file

@ -2,6 +2,7 @@
extern crate log;
use anyhow::Result;
use regex::RegexSet;
use std::env;
use std::fs;
@ -27,7 +28,7 @@ struct LycheeOptions {
// Accumulate all exclusions in a vector
#[options(help = "Exclude URLs from checking (supports regex)")]
excludes: Vec<String>,
exclude: Vec<String>,
}
#[tokio::main]
@ -36,7 +37,9 @@ async fn main() -> Result<()> {
let opts = LycheeOptions::parse_args_default_or_exit();
let checker = Checker::try_new(env::var("GITHUB_TOKEN")?, opts.verbose)?;
let excludes = RegexSet::new(opts.exclude).unwrap();
let checker = Checker::try_new(env::var("GITHUB_TOKEN")?, excludes, opts.verbose)?;
let md = fs::read_to_string(opts.input.unwrap_or_else(|| "README.md".into()))?;
let links = extract_links(&md);