mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-07 06:51:00 +00:00
Merge pull request #153 from lycheeverse/filetype
Add "Filter" module that combines includes and excludes
This commit is contained in:
commit
27709d25e3
20 changed files with 393 additions and 338 deletions
6
.github/workflows/links.yml
vendored
6
.github/workflows/links.yml
vendored
|
|
@ -11,12 +11,12 @@ jobs:
|
|||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
|
||||
- name: Link Checker
|
||||
uses: lycheeverse/lychee-action@master
|
||||
with:
|
||||
args: --exclude https://example.com/README.md
|
||||
|
||||
args: --verbose --no-progress --exclude 'https://example.org/README.md'
|
||||
|
||||
- name: Create Issue From File
|
||||
uses: peter-evans/create-issue-from-file@v2
|
||||
with:
|
||||
|
|
|
|||
|
|
@ -163,7 +163,8 @@ FLAGS:
|
|||
--glob-ignore-case Ignore case when expanding filesystem path glob inputs
|
||||
--help Prints help information
|
||||
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
|
||||
-p, --progress Show progress
|
||||
-n, --no-progress Do not show progress bar. This is recommended for non-interactive shells (e.g. for
|
||||
continuous integration)
|
||||
--skip-missing Skip missing input files (default is to error if they don't exist)
|
||||
-V, --version Prints version information
|
||||
-v, --verbose Verbose program output
|
||||
|
|
@ -191,7 +192,7 @@ OPTIONS:
|
|||
|
||||
ARGS:
|
||||
<inputs>... The inputs (where to get links to check from). These can be: files (e.g. `README.md`), file globs
|
||||
(e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.com/README.md`) or standard
|
||||
(e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.org/README.md`) or standard
|
||||
input (`-`). Prefix with `--` to separate inputs from options that allow multiple arguments
|
||||
[default: README.md]
|
||||
```
|
||||
|
|
|
|||
|
|
@ -16,10 +16,10 @@ Some more complex formatting to test that Markdown parsing works.
|
|||
[](https://creativecommons.org/publicdomain/zero/1.0/)
|
||||
|
||||
Test HTTP and HTTPS for the same site.
|
||||
http://example.com
|
||||
https://example.com
|
||||
http://example.org
|
||||
https://example.org
|
||||
|
||||
https://www.peerlyst.com/posts/a-list-of-static-analysis-tools-for-c-c-peerlyst
|
||||
|
||||
test@example.com
|
||||
mailto:test2@example.com
|
||||
test@example.org
|
||||
mailto:test2@example.org
|
||||
|
|
|
|||
|
|
@ -2,13 +2,13 @@
|
|||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<link rel="home" href="https://example.com/head/home">
|
||||
<link rel="home" href="https://example.org/head/home">
|
||||
<title>Test</title>
|
||||
<meta name="description" content="Test HTML5 parsing (not valid XML)">
|
||||
|
||||
<!-- The links below have no closing tags (not valid XML) -->
|
||||
<link rel="icon" type="image/png" sizes="32x32" href="images/icon.png">
|
||||
<link rel="stylesheet" type="text/css" href="https://example.com/css/style_full_url.css">
|
||||
<link rel="stylesheet" type="text/css" href="https://example.org/css/style_full_url.css">
|
||||
<link rel="stylesheet" type="text/css" href="css/style_relative_url.css">
|
||||
|
||||
<!-- The defer attribute has no value (not valid XML) -->
|
||||
|
|
@ -16,8 +16,8 @@
|
|||
</head>
|
||||
<body>
|
||||
Hello world.
|
||||
<a href="https://example.com/body/a">Link in body</a>
|
||||
<a href="https://example.org/body/a">Link in body</a>
|
||||
<!-- Empty a tag might be problematic (in terms of browser support), but should still be parsed -->
|
||||
<div><a href="https://example.com/body/div_empty_a"/></div>
|
||||
<div><a href="https://example.org/body/div_empty_a"/></div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@
|
|||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<some-weird-element href="https://example.com/some-weird-element"></some-weird-element>
|
||||
<even-weirder fake-attr src="https://example.com/even-weirder-src" href="https://example.com/even-weirder-href"></even-weirder>
|
||||
<citations cite="https://example.com/citations"></citations>
|
||||
<some-weird-element href="https://example.org/some-weird-element"></some-weird-element>
|
||||
<even-weirder fake-attr src="https://example.org/even-weirder-src" href="https://example.org/even-weirder-href"></even-weirder>
|
||||
<citations cite="https://example.org/citations"></citations>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -3,6 +3,6 @@
|
|||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<a href="https://example.com/body/a">Link in body</a>
|
||||
<a href="https://example.org/body/a">Link in body</a>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@
|
|||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<a href="https;//example.com/malformed_one">Malformed link</a>
|
||||
<a href="https://example]com/malformed_two">Malformed link</a>
|
||||
<a href="https://example.com/valid">Valid link</a>
|
||||
<a href="https;//example.org/malformed_one">Malformed link</a>
|
||||
<a href="https://example]org/malformed_two">Malformed link</a>
|
||||
<a href="https://example.org/valid">Valid link</a>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
<!DOCTYPE html><html class=no-js lang=en><head><link href=https://example.com/ rel=canonical><link href=https://example.com/favicon.ico rel="shortcut icon"><link crossorigin="" href=https://fonts.externalsite.com rel=preconnect><body><div></div><header><nav><a href=https://example.com/docs/ title=Docs></a><div><a href=https://example.com/ title=Home></a></div></nav></header><div><nav><div><ul><li><a href=https://example.com/forum>Forum</a></ul></div></nav></div>
|
||||
<!DOCTYPE html><html class=no-js lang=en><head><link href=https://example.org/ rel=canonical><link href=https://example.org/favicon.ico rel="shortcut icon"><link crossorigin="" href=https://fonts.externalsite.com rel=preconnect><body><div></div><header><nav><a href=https://example.org/docs/ title=Docs></a><div><a href=https://example.org/ title=Home></a></div></nav></header><div><nav><div><ul><li><a href=https://example.org/forum>Forum</a></ul></div></nav></div>
|
||||
|
|
|
|||
|
|
@ -140,15 +140,16 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
|
|||
)
|
||||
.await?;
|
||||
|
||||
let pb = if cfg.progress {
|
||||
let bar =
|
||||
ProgressBar::new(links.len() as u64).with_style(ProgressStyle::default_bar().template(
|
||||
let pb = match cfg.no_progress {
|
||||
true => None,
|
||||
false => {
|
||||
let bar = ProgressBar::new(links.len() as u64)
|
||||
.with_style(ProgressStyle::default_bar().template(
|
||||
"{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25} {wide_msg}",
|
||||
));
|
||||
bar.enable_steady_tick(100);
|
||||
Some(bar)
|
||||
} else {
|
||||
None
|
||||
bar.enable_steady_tick(100);
|
||||
Some(bar)
|
||||
}
|
||||
};
|
||||
|
||||
let (send_req, recv_req) = mpsc::channel(max_concurrency);
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ macro_rules! fold_in {
|
|||
pub(crate) struct LycheeOptions {
|
||||
/// The inputs (where to get links to check from).
|
||||
/// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`),
|
||||
/// remote URLs (e.g. `https://example.com/README.md`) or standard input (`-`).
|
||||
/// remote URLs (e.g. `https://example.org/README.md`) or standard input (`-`).
|
||||
/// Prefix with `--` to separate inputs from options that allow multiple arguments.
|
||||
#[structopt(name = "inputs", default_value = "README.md")]
|
||||
raw_inputs: Vec<String>,
|
||||
|
|
@ -116,10 +116,12 @@ pub struct Config {
|
|||
#[serde(default)]
|
||||
pub verbose: bool,
|
||||
|
||||
/// Show progress
|
||||
/// Do not show progress bar.
|
||||
/// This is recommended for non-interactive shells (e.g. for continuous
|
||||
/// integration)
|
||||
#[structopt(short, long)]
|
||||
#[serde(default)]
|
||||
pub progress: bool,
|
||||
pub no_progress: bool,
|
||||
|
||||
/// Maximum number of allowed redirects
|
||||
#[structopt(short, long, default_value = &MAX_REDIRECTS_STR)]
|
||||
|
|
@ -273,7 +275,7 @@ impl Config {
|
|||
|
||||
// Keys with defaults to assign
|
||||
verbose: false;
|
||||
progress: false;
|
||||
no_progress: false;
|
||||
max_redirects: MAX_REDIRECTS;
|
||||
max_concurrency: MAX_CONCURRENCY;
|
||||
threads: None;
|
||||
|
|
|
|||
|
|
@ -116,17 +116,17 @@ mod test_super {
|
|||
fn test_stats() {
|
||||
let mut stats = ResponseStats::new();
|
||||
stats.add(Response {
|
||||
uri: website("http://example.com/ok"),
|
||||
uri: website("http://example.org/ok"),
|
||||
status: Status::Ok(http::StatusCode::OK),
|
||||
source: Input::Stdin,
|
||||
});
|
||||
stats.add(Response {
|
||||
uri: website("http://example.com/failed"),
|
||||
uri: website("http://example.org/failed"),
|
||||
status: Status::Failed(http::StatusCode::BAD_GATEWAY),
|
||||
source: Input::Stdin,
|
||||
});
|
||||
stats.add(Response {
|
||||
uri: website("http://example.com/redirect"),
|
||||
uri: website("http://example.org/redirect"),
|
||||
status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT),
|
||||
source: Input::Stdin,
|
||||
});
|
||||
|
|
@ -135,12 +135,12 @@ mod test_super {
|
|||
Input::Stdin,
|
||||
vec![
|
||||
Response {
|
||||
uri: website("http://example.com/failed"),
|
||||
uri: website("http://example.org/failed"),
|
||||
status: Status::Failed(http::StatusCode::BAD_GATEWAY),
|
||||
source: Input::Stdin,
|
||||
},
|
||||
Response {
|
||||
uri: website("http://example.com/redirect"),
|
||||
uri: website("http://example.org/redirect"),
|
||||
status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT),
|
||||
source: Input::Stdin,
|
||||
},
|
||||
|
|
|
|||
291
src/client.rs
291
src/client.rs
|
|
@ -10,9 +10,12 @@ use std::{collections::HashSet, time::Duration};
|
|||
use tokio::time::sleep;
|
||||
use url::Url;
|
||||
|
||||
use crate::filter::Excludes;
|
||||
use crate::filter::Filter;
|
||||
use crate::filter::Includes;
|
||||
use crate::types::{Response, Status};
|
||||
use crate::uri::Uri;
|
||||
use crate::{excludes::Excludes, Request};
|
||||
use crate::Request;
|
||||
|
||||
const VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
const DEFAULT_MAX_REDIRECTS: usize = 5;
|
||||
|
|
@ -21,9 +24,7 @@ const DEFAULT_MAX_REDIRECTS: usize = 5;
|
|||
pub struct Client {
|
||||
reqwest_client: reqwest::Client,
|
||||
github: Option<Github>,
|
||||
includes: Option<RegexSet>,
|
||||
excludes: Excludes,
|
||||
scheme: Option<String>,
|
||||
filter: Filter,
|
||||
method: reqwest::Method,
|
||||
accepted: Option<HashSet<reqwest::StatusCode>>,
|
||||
}
|
||||
|
|
@ -90,6 +91,12 @@ impl ClientBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
fn build_includes(&mut self) -> Includes {
|
||||
Includes {
|
||||
regex: self.includes.clone().unwrap_or_default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// The build method instantiates the client.
|
||||
pub fn build(&mut self) -> Result<Client> {
|
||||
let mut headers = HeaderMap::new();
|
||||
|
|
@ -140,12 +147,15 @@ impl ClientBuilder {
|
|||
let scheme = self.scheme.clone().unwrap_or(None);
|
||||
let scheme = scheme.map(|s| s.to_lowercase());
|
||||
|
||||
let includes = self.build_includes();
|
||||
let excludes = self.build_excludes();
|
||||
|
||||
let filter = Filter::new(Some(includes), Some(excludes), scheme);
|
||||
|
||||
Ok(Client {
|
||||
reqwest_client,
|
||||
github,
|
||||
includes: self.includes.clone().unwrap_or(None),
|
||||
excludes: self.build_excludes(),
|
||||
scheme,
|
||||
filter,
|
||||
method: self.method.clone().unwrap_or(reqwest::Method::GET),
|
||||
accepted: self.accepted.clone().unwrap_or(None),
|
||||
})
|
||||
|
|
@ -156,9 +166,9 @@ impl Client {
|
|||
pub async fn check<T: TryInto<Request>>(&self, request: T) -> Result<Response> {
|
||||
let request: Request = match request.try_into() {
|
||||
Ok(request) => request,
|
||||
Err(_e) => bail!("Invalid URI:"),
|
||||
Err(_e) => bail!("Invalid URI"),
|
||||
};
|
||||
if self.excluded(&request) {
|
||||
if self.filter.excluded(&request) {
|
||||
return Ok(Response::new(request.uri, Status::Excluded, request.source));
|
||||
}
|
||||
let status = match request.uri {
|
||||
|
|
@ -252,37 +262,6 @@ impl Client {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn excluded(&self, request: &Request) -> bool {
|
||||
if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() {
|
||||
return true;
|
||||
}
|
||||
if self.excludes.ip(&request.uri) {
|
||||
return true;
|
||||
}
|
||||
if let Some(includes) = &self.includes {
|
||||
if includes.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if includes.is_match(request.uri.as_str()) {
|
||||
// Includes take precedence over excludes
|
||||
return false;
|
||||
} else {
|
||||
// In case we have includes and no excludes,
|
||||
// skip everything that was not included
|
||||
if self.excludes.is_empty() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if self.excludes.regex(request.uri.as_str()) {
|
||||
return true;
|
||||
}
|
||||
if self.scheme.is_none() {
|
||||
return false;
|
||||
}
|
||||
request.uri.scheme() != self.scheme
|
||||
}
|
||||
}
|
||||
|
||||
/// A convenience function to check a single URI
|
||||
|
|
@ -295,41 +274,12 @@ pub async fn check<T: TryInto<Request>>(request: T) -> Result<Response> {
|
|||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use crate::collector::Input;
|
||||
|
||||
use super::*;
|
||||
use http::StatusCode;
|
||||
use std::time::{Duration, Instant};
|
||||
use url::Url;
|
||||
use wiremock::matchers::method;
|
||||
use wiremock::{Mock, MockServer, ResponseTemplate};
|
||||
|
||||
// Note: the standard library as of Rust stable 1.47.0 does not expose
|
||||
// "link-local" or "private" IPv6 checks. However, one might argue
|
||||
// that these concepts do exist in IPv6, albeit the naming is different.
|
||||
// See: https://en.wikipedia.org/wiki/Link-local_address#IPv6
|
||||
// See: https://en.wikipedia.org/wiki/Private_network#IPv6
|
||||
// See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local
|
||||
const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
|
||||
const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
|
||||
const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";
|
||||
|
||||
const V4_LOOPBACK: &str = "http://127.0.0.1";
|
||||
const V6_LOOPBACK: &str = "http://[::1]";
|
||||
|
||||
const V4_LINK_LOCAL: &str = "http://169.254.0.1";
|
||||
|
||||
// IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6)
|
||||
const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
|
||||
const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";
|
||||
|
||||
fn website_url(s: &str) -> Request {
|
||||
Request::new(
|
||||
Uri::Website(Url::parse(s).expect("Expected valid Website URI")),
|
||||
Input::Stdin,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_nonexistent() {
|
||||
let template = ResponseTemplate::new(404);
|
||||
|
|
@ -361,7 +311,7 @@ mod test {
|
|||
let res = ClientBuilder::default()
|
||||
.build()
|
||||
.unwrap()
|
||||
.check(website_url(&mock_server.uri()))
|
||||
.check(mock_server.uri())
|
||||
.await
|
||||
.unwrap();
|
||||
let end = start.elapsed();
|
||||
|
|
@ -390,7 +340,7 @@ mod test {
|
|||
ClientBuilder::default()
|
||||
.build()
|
||||
.unwrap()
|
||||
.check(website_url("https://github.com/lycheeverse/lychee"))
|
||||
.check("https://github.com/lycheeverse/lychee")
|
||||
.await
|
||||
.unwrap()
|
||||
.status,
|
||||
|
|
@ -422,7 +372,7 @@ mod test {
|
|||
let res = ClientBuilder::default()
|
||||
.build()
|
||||
.unwrap()
|
||||
.check(website_url(&mock_server.uri()))
|
||||
.check(mock_server.uri())
|
||||
.await
|
||||
.unwrap()
|
||||
.status;
|
||||
|
|
@ -455,7 +405,7 @@ mod test {
|
|||
let res = ClientBuilder::default()
|
||||
.build()
|
||||
.unwrap()
|
||||
.check(website_url("https://crates.io/crates/lychee"))
|
||||
.check("https://crates.io/crates/lychee")
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(matches!(res.status, Status::Failed(StatusCode::NOT_FOUND)));
|
||||
|
|
@ -469,7 +419,7 @@ mod test {
|
|||
.custom_headers(custom)
|
||||
.build()
|
||||
.unwrap()
|
||||
.check(website_url("https://crates.io/crates/lychee"))
|
||||
.check("https://crates.io/crates/lychee")
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(matches!(res.status, Status::Ok(_)));
|
||||
|
|
@ -496,198 +446,7 @@ mod test {
|
|||
.build()
|
||||
.unwrap();
|
||||
|
||||
let resp = client.check(website_url(&mock_server.uri())).await.unwrap();
|
||||
let resp = client.check(mock_server.uri()).await.unwrap();
|
||||
assert!(matches!(resp.status, Status::Timeout(_)));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_include_regex() {
|
||||
let includes = RegexSet::new(&[r"foo.github.com"]).unwrap();
|
||||
|
||||
let client = ClientBuilder::default().includes(includes).build().unwrap();
|
||||
|
||||
assert_eq!(
|
||||
client.excluded(&website_url("https://foo.github.com")),
|
||||
false
|
||||
);
|
||||
assert_eq!(
|
||||
client.excluded(&website_url("https://bar.github.com")),
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_includes_and_excludes_empty() {
|
||||
// This is the pre-configured, empty set of excludes for a client
|
||||
// In this case, only the requests matching the include set will be checked
|
||||
let exclude = Some(RegexSet::empty());
|
||||
let includes = RegexSet::empty();
|
||||
|
||||
let client = ClientBuilder::default()
|
||||
.includes(includes)
|
||||
.excludes(exclude)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
client.excluded(&website_url("https://foo.github.com")),
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_include_with_empty_exclude() {
|
||||
// This is the pre-configured, empty set of excludes for a client
|
||||
// In this case, only the requests matching the include set will be checked
|
||||
let exclude = Some(RegexSet::empty());
|
||||
let includes = RegexSet::new(&[r"foo.github.com"]).unwrap();
|
||||
|
||||
let client = ClientBuilder::default()
|
||||
.includes(includes)
|
||||
.excludes(exclude)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
client.excluded(&website_url("https://foo.github.com")),
|
||||
false
|
||||
);
|
||||
assert_eq!(client.excluded(&website_url("https://github.com")), true);
|
||||
assert_eq!(
|
||||
client.excluded(&website_url("https://bar.github.com")),
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_exclude_include_regex() {
|
||||
let exclude = Some(RegexSet::new(&[r"github.com"]).unwrap());
|
||||
let includes = RegexSet::new(&[r"foo.github.com"]).unwrap();
|
||||
|
||||
let client = ClientBuilder::default()
|
||||
.includes(includes)
|
||||
.excludes(exclude)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
client.excluded(&website_url("https://foo.github.com")),
|
||||
false
|
||||
);
|
||||
assert_eq!(client.excluded(&website_url("https://github.com")), true);
|
||||
assert_eq!(
|
||||
client.excluded(&website_url("https://bar.github.com")),
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_exclude_regex() {
|
||||
let exclude =
|
||||
Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap());
|
||||
|
||||
let client = ClientBuilder::default().excludes(exclude).build().unwrap();
|
||||
|
||||
assert_eq!(client.excluded(&website_url("http://github.com")), true);
|
||||
assert_eq!(client.excluded(&website_url("http://exclude.org")), true);
|
||||
assert_eq!(
|
||||
client.excluded(&Request::new(
|
||||
Uri::Mail("mail@example.com".to_string()),
|
||||
Input::Stdin,
|
||||
)),
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
client.excluded(&Request::new(
|
||||
Uri::Mail("foo@bar.dev".to_string()),
|
||||
Input::Stdin,
|
||||
)),
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_const_sanity() {
|
||||
let get_host = |s| {
|
||||
Url::parse(s)
|
||||
.expect("Expected valid URL")
|
||||
.host()
|
||||
.expect("Expected host address")
|
||||
.to_owned()
|
||||
};
|
||||
let into_v4 = |host| match host {
|
||||
url::Host::Ipv4(ipv4) => ipv4,
|
||||
_ => panic!("Not IPv4"),
|
||||
};
|
||||
let into_v6 = |host| match host {
|
||||
url::Host::Ipv6(ipv6) => ipv6,
|
||||
_ => panic!("Not IPv6"),
|
||||
};
|
||||
|
||||
assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private());
|
||||
assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private());
|
||||
assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private());
|
||||
|
||||
assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback());
|
||||
assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback());
|
||||
|
||||
assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_excludes_no_private_ips_by_default() {
|
||||
let client = ClientBuilder::default().build().unwrap();
|
||||
|
||||
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), false);
|
||||
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), false);
|
||||
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), false);
|
||||
assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), false);
|
||||
assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), false);
|
||||
|
||||
assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_private() {
|
||||
let mut client = ClientBuilder::default().build().unwrap();
|
||||
client.excludes.private_ips = true;
|
||||
|
||||
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), true);
|
||||
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), true);
|
||||
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_link_local() {
|
||||
let mut client = ClientBuilder::default().build().unwrap();
|
||||
client.excludes.link_local_ips = true;
|
||||
|
||||
assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_loopback() {
|
||||
let mut client = ClientBuilder::default().build().unwrap();
|
||||
client.excludes.loopback_ips = true;
|
||||
|
||||
assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), true);
|
||||
assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
|
||||
let mut client = ClientBuilder::default().build().unwrap();
|
||||
client.excludes.private_ips = true;
|
||||
client.excludes.link_local_ips = true;
|
||||
|
||||
// if these were pure IPv4, we would exclude
|
||||
assert_eq!(
|
||||
client.excluded(&website_url(V6_MAPPED_V4_PRIVATE_CLASS_A)),
|
||||
false
|
||||
);
|
||||
assert_eq!(
|
||||
client.excluded(&website_url(V6_MAPPED_V4_LINK_LOCAL)),
|
||||
false
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -289,7 +289,7 @@ mod test {
|
|||
#[test]
|
||||
fn test_non_markdown_links() {
|
||||
let input =
|
||||
"https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
|
||||
"https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.org";
|
||||
let links: HashSet<Uri> =
|
||||
extract_links(&InputContent::from_string(input, FileType::Plaintext), None)
|
||||
.into_iter()
|
||||
|
|
@ -299,7 +299,7 @@ mod test {
|
|||
let expected = [
|
||||
website("https://endler.dev"),
|
||||
website("https://hello-rust.show/foo/bar?lol=1"),
|
||||
Uri::Mail("test@example.com".to_string()),
|
||||
Uri::Mail("test@example.org".to_string()),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
|
|
@ -330,11 +330,11 @@ mod test {
|
|||
.collect();
|
||||
|
||||
let expected_links = [
|
||||
website("https://example.com/head/home"),
|
||||
website("https://example.com/css/style_full_url.css"),
|
||||
website("https://example.org/head/home"),
|
||||
website("https://example.org/css/style_full_url.css"),
|
||||
// the body links wouldn't be present if the file was parsed strictly as XML
|
||||
website("https://example.com/body/a"),
|
||||
website("https://example.com/body/div_empty_a"),
|
||||
website("https://example.org/body/a"),
|
||||
website("https://example.org/body/div_empty_a"),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
|
|
@ -348,21 +348,21 @@ mod test {
|
|||
let input = load_fixture("TEST_HTML5.html");
|
||||
let links: HashSet<Uri> = extract_links(
|
||||
&InputContent::from_string(&input, FileType::Html),
|
||||
Some(Url::parse("https://example.com").unwrap()),
|
||||
Some(Url::parse("https://example.org").unwrap()),
|
||||
)
|
||||
.into_iter()
|
||||
.map(|r| r.uri)
|
||||
.collect();
|
||||
|
||||
let expected_links = [
|
||||
website("https://example.com/head/home"),
|
||||
website("https://example.com/images/icon.png"),
|
||||
website("https://example.com/css/style_relative_url.css"),
|
||||
website("https://example.com/css/style_full_url.css"),
|
||||
website("https://example.com/js/script.js"),
|
||||
website("https://example.org/head/home"),
|
||||
website("https://example.org/images/icon.png"),
|
||||
website("https://example.org/css/style_relative_url.css"),
|
||||
website("https://example.org/css/style_full_url.css"),
|
||||
website("https://example.org/js/script.js"),
|
||||
// the body links wouldn't be present if the file was parsed strictly as XML
|
||||
website("https://example.com/body/a"),
|
||||
website("https://example.com/body/div_empty_a"),
|
||||
website("https://example.org/body/a"),
|
||||
website("https://example.org/body/div_empty_a"),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
|
|
@ -381,7 +381,7 @@ mod test {
|
|||
.map(|r| r.uri)
|
||||
.collect();
|
||||
|
||||
let expected_links = [website("https://example.com/body/a")]
|
||||
let expected_links = [website("https://example.org/body/a")]
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
|
|
@ -400,11 +400,11 @@ mod test {
|
|||
.collect();
|
||||
|
||||
let expected_links = [
|
||||
website("https://example.com/"),
|
||||
website("https://example.com/favicon.ico"),
|
||||
website("https://example.org/"),
|
||||
website("https://example.org/favicon.ico"),
|
||||
website("https://fonts.externalsite.com"),
|
||||
website("https://example.com/docs/"),
|
||||
website("https://example.com/forum"),
|
||||
website("https://example.org/docs/"),
|
||||
website("https://example.org/forum"),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
|
|
@ -424,7 +424,7 @@ mod test {
|
|||
.collect();
|
||||
|
||||
let expected_links = [Uri::Website(
|
||||
Url::parse("https://example.com/valid").unwrap(),
|
||||
Url::parse("https://example.org/valid").unwrap(),
|
||||
)]
|
||||
.iter()
|
||||
.cloned()
|
||||
|
|
@ -444,10 +444,10 @@ mod test {
|
|||
.collect();
|
||||
|
||||
let expected_links = [
|
||||
website("https://example.com/some-weird-element"),
|
||||
website("https://example.com/even-weirder-src"),
|
||||
website("https://example.com/even-weirder-href"),
|
||||
website("https://example.com/citations"),
|
||||
website("https://example.org/some-weird-element"),
|
||||
website("https://example.org/even-weirder-src"),
|
||||
website("https://example.org/even-weirder-href"),
|
||||
website("https://example.org/citations"),
|
||||
]
|
||||
.iter()
|
||||
.cloned()
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
use std::net::IpAddr;
|
||||
|
||||
use regex::RegexSet;
|
||||
use std::net::IpAddr;
|
||||
|
||||
use crate::Uri;
|
||||
|
||||
32
src/filter/includes.rs
Normal file
32
src/filter/includes.rs
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
use regex::RegexSet;
|
||||
|
||||
/// Include configuration for the link checker.
|
||||
/// You can include links based on regex patterns
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Includes {
|
||||
pub regex: Option<RegexSet>,
|
||||
}
|
||||
|
||||
impl Default for Includes {
|
||||
fn default() -> Self {
|
||||
Self { regex: None }
|
||||
}
|
||||
}
|
||||
|
||||
impl Includes {
|
||||
pub fn regex(&self, input: &str) -> bool {
|
||||
if let Some(includes) = &self.regex {
|
||||
if includes.is_match(input) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
match &self.regex {
|
||||
None => true,
|
||||
Some(regex_set) => regex_set.is_empty(),
|
||||
}
|
||||
}
|
||||
}
|
||||
259
src/filter/mod.rs
Normal file
259
src/filter/mod.rs
Normal file
|
|
@ -0,0 +1,259 @@
|
|||
mod excludes;
|
||||
mod includes;
|
||||
|
||||
pub use excludes::Excludes;
|
||||
pub use includes::Includes;
|
||||
|
||||
use crate::uri::Uri;
|
||||
use crate::Request;
|
||||
|
||||
/// A generic URI filter
|
||||
/// Used to decide if a given URI should be checked or skipped
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Filter {
|
||||
includes: Includes,
|
||||
excludes: Excludes,
|
||||
scheme: Option<String>,
|
||||
}
|
||||
|
||||
impl Filter {
|
||||
pub fn new(
|
||||
includes: Option<Includes>,
|
||||
excludes: Option<Excludes>,
|
||||
scheme: Option<String>,
|
||||
) -> Self {
|
||||
let includes = match includes {
|
||||
Some(includes) => includes,
|
||||
None => Includes::default(),
|
||||
};
|
||||
let excludes = match excludes {
|
||||
Some(excludes) => excludes,
|
||||
None => Excludes::default(),
|
||||
};
|
||||
Filter {
|
||||
includes,
|
||||
excludes,
|
||||
scheme,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn excluded(&self, request: &Request) -> bool {
|
||||
// Skip mail?
|
||||
if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() {
|
||||
return true;
|
||||
}
|
||||
// Skip specific IP address?
|
||||
if self.excludes.ip(&request.uri) {
|
||||
return true;
|
||||
}
|
||||
// No regex includes/excludes at all?
|
||||
if self.includes.is_empty() && self.excludes.is_empty() {
|
||||
return false;
|
||||
}
|
||||
if self.includes.regex(request.uri.as_str()) {
|
||||
// Includes take precedence over excludes
|
||||
return false;
|
||||
}
|
||||
// In case we have includes and no excludes,
|
||||
// skip everything that was not included
|
||||
if !self.includes.is_empty() && self.excludes.is_empty() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// We have no includes. Check regex excludes
|
||||
if self.excludes.regex(request.uri.as_str()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if self.scheme.is_none() {
|
||||
return false;
|
||||
}
|
||||
request.uri.scheme() != self.scheme
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
// Note: the standard library as of Rust stable 1.47.0 does not expose
|
||||
// "link-local" or "private" IPv6 checks. However, one might argue
|
||||
// that these concepts do exist in IPv6, albeit the naming is different.
|
||||
// See: https://en.wikipedia.org/wiki/Link-local_address#IPv6
|
||||
// See: https://en.wikipedia.org/wiki/Private_network#IPv6
|
||||
// See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local
|
||||
const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
|
||||
const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
|
||||
const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";
|
||||
|
||||
const V4_LOOPBACK: &str = "http://127.0.0.1";
|
||||
const V6_LOOPBACK: &str = "http://[::1]";
|
||||
|
||||
const V4_LINK_LOCAL: &str = "http://169.254.0.1";
|
||||
|
||||
// IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6)
|
||||
const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
|
||||
const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";
|
||||
|
||||
use regex::RegexSet;
|
||||
use reqwest::Url;
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::{test_utils::website, Input};
|
||||
|
||||
/// Helper method to convert a string into a Request
|
||||
/// Note: This panics on error, so it should only be used for testing
|
||||
pub fn request(url: &str) -> Request {
|
||||
Request::new(website(url), Input::Stdin)
|
||||
}
|
||||
|
||||
#[test]
fn test_const_sanity() {
    // Parse a URL string and return an owned copy of its host.
    let get_host = |s| {
        let parsed = Url::parse(s).expect("Expected valid URL");
        let host = parsed.host().expect("Expected host address");
        host.to_owned()
    };
    // Unwrap the IPv4 address of a host, panicking for any other host kind.
    let into_v4 = |host| match host {
        url::Host::Ipv4(addr) => addr,
        _ => panic!("Not IPv4"),
    };
    // Unwrap the IPv6 address of a host, panicking for any other host kind.
    let into_v6 = |host| match host {
        url::Host::Ipv6(addr) => addr,
        _ => panic!("Not IPv6"),
    };

    // Private ranges
    assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private());
    assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private());
    assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private());

    // Loopback addresses
    assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback());
    assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback());

    // Link-local address
    assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local());
}
|
||||
|
||||
#[test]
|
||||
fn test_includes_and_excludes_empty() {
|
||||
// This is the pre-configured, empty set of excludes for a client
|
||||
// In this case, only the requests matching the include set will be checked
|
||||
let includes = Some(Includes::default());
|
||||
let excludes = Some(Excludes::default());
|
||||
let filter = Filter::new(includes, excludes, None);
|
||||
assert_eq!(filter.excluded(&request("https://example.org")), false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_include_regex() {
|
||||
let includes = Some(Includes {
|
||||
regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()),
|
||||
});
|
||||
let filter = Filter::new(includes, None, None);
|
||||
|
||||
// Only the requests matching the include set will be checked
|
||||
assert_eq!(filter.excluded(&request("https://foo.example.org")), false);
|
||||
assert_eq!(filter.excluded(&request("https://bar.example.org")), true);
|
||||
assert_eq!(filter.excluded(&request("https://example.org")), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_regex() {
|
||||
let excludes = Excludes {
|
||||
regex: Some(
|
||||
RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(),
|
||||
),
|
||||
..Default::default()
|
||||
};
|
||||
let filter = Filter::new(None, Some(excludes), None);
|
||||
|
||||
assert_eq!(filter.excluded(&request("http://github.com")), true);
|
||||
assert_eq!(filter.excluded(&request("http://exclude.org")), true);
|
||||
assert_eq!(
|
||||
filter.excluded(&Request::new(
|
||||
Uri::Mail("mail@example.org".to_string()),
|
||||
Input::Stdin,
|
||||
)),
|
||||
true
|
||||
);
|
||||
|
||||
assert_eq!(filter.excluded(&request("http://bar.dev")), false);
|
||||
assert_eq!(
|
||||
filter.excluded(&Request::new(
|
||||
Uri::Mail("foo@bar.dev".to_string()),
|
||||
Input::Stdin,
|
||||
)),
|
||||
false
|
||||
);
|
||||
}
|
||||
#[test]
|
||||
fn test_exclude_include_regex() {
|
||||
let includes = Some(Includes {
|
||||
regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()),
|
||||
});
|
||||
let excludes = Excludes {
|
||||
regex: Some(RegexSet::new(&[r"example.org"]).unwrap()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let filter = Filter::new(includes, Some(excludes), None);
|
||||
|
||||
// Includes take preference over excludes
|
||||
assert_eq!(filter.excluded(&request("https://foo.example.org")), false);
|
||||
|
||||
assert_eq!(filter.excluded(&request("https://example.org")), true);
|
||||
assert_eq!(filter.excluded(&request("https://bar.example.org")), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_excludes_no_private_ips_by_default() {
|
||||
let filter = Filter::new(None, None, None);
|
||||
|
||||
assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), false);
|
||||
assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), false);
|
||||
assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), false);
|
||||
assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), false);
|
||||
assert_eq!(filter.excluded(&request(V4_LOOPBACK)), false);
|
||||
assert_eq!(filter.excluded(&request(V6_LOOPBACK)), false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_private_ips() {
|
||||
let mut filter = Filter::new(None, None, None);
|
||||
filter.excludes.private_ips = true;
|
||||
|
||||
assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), true);
|
||||
assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), true);
|
||||
assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_link_local() {
|
||||
let mut filter = Filter::new(None, None, None);
|
||||
filter.excludes.link_local_ips = true;
|
||||
assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_loopback() {
|
||||
let mut filter = Filter::new(None, None, None);
|
||||
filter.excludes.loopback_ips = true;
|
||||
|
||||
assert_eq!(filter.excluded(&request(V4_LOOPBACK)), true);
|
||||
assert_eq!(filter.excluded(&request(V6_LOOPBACK)), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
|
||||
let mut filter = Filter::new(None, None, None);
|
||||
filter.excludes.private_ips = true;
|
||||
filter.excludes.link_local_ips = true;
|
||||
|
||||
// if these were pure IPv4, we would exclude
|
||||
assert_eq!(
|
||||
filter.excluded(&request(V6_MAPPED_V4_PRIVATE_CLASS_A)),
|
||||
false
|
||||
);
|
||||
assert_eq!(filter.excluded(&request(V6_MAPPED_V4_LINK_LOCAL)), false);
|
||||
}
|
||||
}
|
||||
|
|
@ -41,7 +41,7 @@ doctest!("../README.md");
|
|||
|
||||
mod client;
|
||||
mod client_pool;
|
||||
mod excludes;
|
||||
mod filter;
|
||||
mod types;
|
||||
mod uri;
|
||||
|
||||
|
|
@ -53,6 +53,5 @@ pub use client::check;
|
|||
pub use client::ClientBuilder;
|
||||
pub use client_pool::ClientPool;
|
||||
pub use collector::Input;
|
||||
pub use excludes::Excludes;
|
||||
pub use types::*;
|
||||
pub use uri::Uri;
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ where
|
|||
mock_server
|
||||
}
|
||||
|
||||
/// Helper method to convert a string into a URI
|
||||
/// Note: This panics on error, so it should only be used for testing
|
||||
pub fn website(url: &str) -> Uri {
|
||||
Uri::Website(Url::parse(url).unwrap())
|
||||
Uri::Website(Url::parse(url).expect("Expected valid Website URI"))
|
||||
}
|
||||
|
|
|
|||
12
src/uri.rs
12
src/uri.rs
|
|
@ -74,16 +74,16 @@ mod test {
|
|||
fn test_uri_from_str() {
|
||||
assert!(matches!(Uri::try_from(""), Err(_)));
|
||||
assert_eq!(
|
||||
Uri::try_from("http://example.com").unwrap(),
|
||||
website("http://example.com")
|
||||
Uri::try_from("http://example.org").unwrap(),
|
||||
website("http://example.org")
|
||||
);
|
||||
assert_eq!(
|
||||
Uri::try_from("mail@example.com").unwrap(),
|
||||
Uri::Mail("mail@example.com".to_string())
|
||||
Uri::try_from("mail@example.org").unwrap(),
|
||||
Uri::Mail("mail@example.org".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
Uri::try_from("mailto:mail@example.com").unwrap(),
|
||||
Uri::Mail("mail@example.com".to_string())
|
||||
Uri::try_from("mailto:mail@example.org").unwrap(),
|
||||
Uri::Mail("mail@example.org".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -75,6 +75,7 @@ mod cli {
|
|||
let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md");
|
||||
|
||||
cmd.arg(test_github_404_path)
|
||||
.arg("--no-progress")
|
||||
.env_clear()
|
||||
.assert()
|
||||
.failure()
|
||||
|
|
|
|||
Loading…
Reference in a new issue