diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml index a1820d3..9f0a166 100644 --- a/.github/workflows/links.yml +++ b/.github/workflows/links.yml @@ -11,12 +11,12 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - + - name: Link Checker uses: lycheeverse/lychee-action@master with: - args: --exclude https://example.com/README.md - + args: --verbose --no-progress --exclude 'https://example.org/README.md' + - name: Create Issue From File uses: peter-evans/create-issue-from-file@v2 with: diff --git a/README.md b/README.md index f6c53d6..98d4b15 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,8 @@ FLAGS: --glob-ignore-case Ignore case when expanding filesystem path glob inputs --help Prints help information -i, --insecure Proceed for server connections considered insecure (invalid TLS) - -p, --progress Show progress + -n, --no-progress Do not show progress bar. This is recommended for non-interactive shells (e.g. for + continuous integration) --skip-missing Skip missing input files (default is to error if they don't exist) -V, --version Prints version information -v, --verbose Verbose program output @@ -191,7 +192,7 @@ OPTIONS: ARGS: ... The inputs (where to get links to check from). These can be: files (e.g. `README.md`), file globs - (e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.com/README.md`) or standard + (e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.org/README.md`) or standard input (`-`). Prefix with `--` to separate inputs from options that allow multiple arguments [default: README.md] ``` diff --git a/fixtures/TEST.md b/fixtures/TEST.md index 81eff74..90c7f5c 100644 --- a/fixtures/TEST.md +++ b/fixtures/TEST.md @@ -16,10 +16,10 @@ Some more complex formatting to test that Markdown parsing works. [![CC0](https://i.creativecommons.org/p/zero/1.0/88x31.png)](https://creativecommons.org/publicdomain/zero/1.0/) Test HTTP and HTTPS for the same site. 
-http://example.com -https://example.com +http://example.org +https://example.org https://www.peerlyst.com/posts/a-list-of-static-analysis-tools-for-c-c-peerlyst -test@example.com -mailto:test2@example.com +test@example.org +mailto:test2@example.org diff --git a/fixtures/TEST_HTML5.html b/fixtures/TEST_HTML5.html index f9ff015..b6ae28d 100644 --- a/fixtures/TEST_HTML5.html +++ b/fixtures/TEST_HTML5.html @@ -2,13 +2,13 @@ - + Test - + @@ -16,8 +16,8 @@ Hello world. - Link in body + Link in body -
+
diff --git a/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html b/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html index c9464c2..31ede82 100644 --- a/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html +++ b/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html @@ -3,8 +3,8 @@ - - - + + + diff --git a/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html b/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html index 1418934..58ae7b3 100644 --- a/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html +++ b/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html @@ -3,6 +3,6 @@ - Link in body + Link in body diff --git a/fixtures/TEST_HTML5_MALFORMED_LINKS.html b/fixtures/TEST_HTML5_MALFORMED_LINKS.html index 76c2061..ee40f66 100644 --- a/fixtures/TEST_HTML5_MALFORMED_LINKS.html +++ b/fixtures/TEST_HTML5_MALFORMED_LINKS.html @@ -3,8 +3,8 @@ - Malformed link - Malformed link - Valid link + Malformed link + Malformed link + Valid link diff --git a/fixtures/TEST_HTML5_MINIFIED.html b/fixtures/TEST_HTML5_MINIFIED.html index 84c8673..4651593 100644 --- a/fixtures/TEST_HTML5_MINIFIED.html +++ b/fixtures/TEST_HTML5_MINIFIED.html @@ -1 +1 @@ -
+
diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 870890d..732c25a 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -140,15 +140,16 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { ) .await?; - let pb = if cfg.progress { - let bar = - ProgressBar::new(links.len() as u64).with_style(ProgressStyle::default_bar().template( + let pb = match cfg.no_progress { + true => None, + false => { + let bar = ProgressBar::new(links.len() as u64) + .with_style(ProgressStyle::default_bar().template( "{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25} {wide_msg}", )); - bar.enable_steady_tick(100); - Some(bar) - } else { - None + bar.enable_steady_tick(100); + Some(bar) + } }; let (send_req, recv_req) = mpsc::channel(max_concurrency); diff --git a/src/bin/lychee/options.rs b/src/bin/lychee/options.rs index 5227d0e..912cd96 100644 --- a/src/bin/lychee/options.rs +++ b/src/bin/lychee/options.rs @@ -83,7 +83,7 @@ macro_rules! fold_in { pub(crate) struct LycheeOptions { /// The inputs (where to get links to check from). /// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`), - /// remote URLs (e.g. `https://example.com/README.md`) or standard input (`-`). + /// remote URLs (e.g. `https://example.org/README.md`) or standard input (`-`). /// Prefix with `--` to separate inputs from options that allow multiple arguments. #[structopt(name = "inputs", default_value = "README.md")] raw_inputs: Vec, @@ -116,10 +116,12 @@ pub struct Config { #[serde(default)] pub verbose: bool, - /// Show progress + /// Do not show progress bar. + /// This is recommended for non-interactive shells (e.g. 
for continuous + /// integration) #[structopt(short, long)] #[serde(default)] - pub progress: bool, + pub no_progress: bool, /// Maximum number of allowed redirects #[structopt(short, long, default_value = &MAX_REDIRECTS_STR)] @@ -273,7 +275,7 @@ impl Config { // Keys with defaults to assign verbose: false; - progress: false; + no_progress: false; max_redirects: MAX_REDIRECTS; max_concurrency: MAX_CONCURRENCY; threads: None; diff --git a/src/bin/lychee/stats.rs b/src/bin/lychee/stats.rs index de53eaa..43ffb23 100644 --- a/src/bin/lychee/stats.rs +++ b/src/bin/lychee/stats.rs @@ -116,17 +116,17 @@ mod test_super { fn test_stats() { let mut stats = ResponseStats::new(); stats.add(Response { - uri: website("http://example.com/ok"), + uri: website("http://example.org/ok"), status: Status::Ok(http::StatusCode::OK), source: Input::Stdin, }); stats.add(Response { - uri: website("http://example.com/failed"), + uri: website("http://example.org/failed"), status: Status::Failed(http::StatusCode::BAD_GATEWAY), source: Input::Stdin, }); stats.add(Response { - uri: website("http://example.com/redirect"), + uri: website("http://example.org/redirect"), status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), source: Input::Stdin, }); @@ -135,12 +135,12 @@ mod test_super { Input::Stdin, vec![ Response { - uri: website("http://example.com/failed"), + uri: website("http://example.org/failed"), status: Status::Failed(http::StatusCode::BAD_GATEWAY), source: Input::Stdin, }, Response { - uri: website("http://example.com/redirect"), + uri: website("http://example.org/redirect"), status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), source: Input::Stdin, }, diff --git a/src/client.rs b/src/client.rs index 8ef305c..eb493bd 100644 --- a/src/client.rs +++ b/src/client.rs @@ -10,9 +10,12 @@ use std::{collections::HashSet, time::Duration}; use tokio::time::sleep; use url::Url; +use crate::filter::Excludes; +use crate::filter::Filter; +use crate::filter::Includes; use 
crate::types::{Response, Status}; use crate::uri::Uri; -use crate::{excludes::Excludes, Request}; +use crate::Request; const VERSION: &str = env!("CARGO_PKG_VERSION"); const DEFAULT_MAX_REDIRECTS: usize = 5; @@ -21,9 +24,7 @@ const DEFAULT_MAX_REDIRECTS: usize = 5; pub struct Client { reqwest_client: reqwest::Client, github: Option, - includes: Option, - excludes: Excludes, - scheme: Option, + filter: Filter, method: reqwest::Method, accepted: Option>, } @@ -90,6 +91,12 @@ impl ClientBuilder { } } + fn build_includes(&mut self) -> Includes { + Includes { + regex: self.includes.clone().unwrap_or_default(), + } + } + /// The build method instantiates the client. pub fn build(&mut self) -> Result { let mut headers = HeaderMap::new(); @@ -140,12 +147,15 @@ impl ClientBuilder { let scheme = self.scheme.clone().unwrap_or(None); let scheme = scheme.map(|s| s.to_lowercase()); + let includes = self.build_includes(); + let excludes = self.build_excludes(); + + let filter = Filter::new(Some(includes), Some(excludes), scheme); + Ok(Client { reqwest_client, github, - includes: self.includes.clone().unwrap_or(None), - excludes: self.build_excludes(), - scheme, + filter, method: self.method.clone().unwrap_or(reqwest::Method::GET), accepted: self.accepted.clone().unwrap_or(None), }) @@ -156,9 +166,9 @@ impl Client { pub async fn check>(&self, request: T) -> Result { let request: Request = match request.try_into() { Ok(request) => request, - Err(_e) => bail!("Invalid URI:"), + Err(_e) => bail!("Invalid URI"), }; - if self.excluded(&request) { + if self.filter.excluded(&request) { return Ok(Response::new(request.uri, Status::Excluded, request.source)); } let status = match request.uri { @@ -252,37 +262,6 @@ impl Client { } } } - - pub fn excluded(&self, request: &Request) -> bool { - if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() { - return true; - } - if self.excludes.ip(&request.uri) { - return true; - } - if let Some(includes) = &self.includes { - if 
includes.is_empty() { - return false; - } - if includes.is_match(request.uri.as_str()) { - // Includes take precedence over excludes - return false; - } else { - // In case we have includes and no excludes, - // skip everything that was not included - if self.excludes.is_empty() { - return true; - } - } - } - if self.excludes.regex(request.uri.as_str()) { - return true; - } - if self.scheme.is_none() { - return false; - } - request.uri.scheme() != self.scheme - } } /// A convenience function to check a single URI @@ -295,41 +274,12 @@ pub async fn check>(request: T) -> Result { #[cfg(test)] mod test { - use crate::collector::Input; - use super::*; use http::StatusCode; use std::time::{Duration, Instant}; - use url::Url; use wiremock::matchers::method; use wiremock::{Mock, MockServer, ResponseTemplate}; - // Note: the standard library as of Rust stable 1.47.0 does not expose - // "link-local" or "private" IPv6 checks. However, one might argue - // that these concepts do exist in IPv6, albeit the naming is different. 
- // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6 - // See: https://en.wikipedia.org/wiki/Private_network#IPv6 - // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local - const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1"; - const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1"; - const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1"; - - const V4_LOOPBACK: &str = "http://127.0.0.1"; - const V6_LOOPBACK: &str = "http://[::1]"; - - const V4_LINK_LOCAL: &str = "http://169.254.0.1"; - - // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) - const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; - const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; - - fn website_url(s: &str) -> Request { - Request::new( - Uri::Website(Url::parse(s).expect("Expected valid Website URI")), - Input::Stdin, - ) - } - #[tokio::test] async fn test_nonexistent() { let template = ResponseTemplate::new(404); @@ -361,7 +311,7 @@ mod test { let res = ClientBuilder::default() .build() .unwrap() - .check(website_url(&mock_server.uri())) + .check(mock_server.uri()) .await .unwrap(); let end = start.elapsed(); @@ -390,7 +340,7 @@ mod test { ClientBuilder::default() .build() .unwrap() - .check(website_url("https://github.com/lycheeverse/lychee")) + .check("https://github.com/lycheeverse/lychee") .await .unwrap() .status, @@ -422,7 +372,7 @@ mod test { let res = ClientBuilder::default() .build() .unwrap() - .check(website_url(&mock_server.uri())) + .check(mock_server.uri()) .await .unwrap() .status; @@ -455,7 +405,7 @@ mod test { let res = ClientBuilder::default() .build() .unwrap() - .check(website_url("https://crates.io/crates/lychee")) + .check("https://crates.io/crates/lychee") .await .unwrap(); assert!(matches!(res.status, Status::Failed(StatusCode::NOT_FOUND))); @@ -469,7 +419,7 @@ mod test { .custom_headers(custom) .build() .unwrap() - .check(website_url("https://crates.io/crates/lychee")) + 
.check("https://crates.io/crates/lychee") .await .unwrap(); assert!(matches!(res.status, Status::Ok(_))); @@ -496,198 +446,7 @@ mod test { .build() .unwrap(); - let resp = client.check(website_url(&mock_server.uri())).await.unwrap(); + let resp = client.check(mock_server.uri()).await.unwrap(); assert!(matches!(resp.status, Status::Timeout(_))); } - - #[tokio::test] - async fn test_include_regex() { - let includes = RegexSet::new(&[r"foo.github.com"]).unwrap(); - - let client = ClientBuilder::default().includes(includes).build().unwrap(); - - assert_eq!( - client.excluded(&website_url("https://foo.github.com")), - false - ); - assert_eq!( - client.excluded(&website_url("https://bar.github.com")), - true - ); - } - - #[tokio::test] - async fn test_includes_and_excludes_empty() { - // This is the pre-configured, empty set of excludes for a client - // In this case, only the requests matching the include set will be checked - let exclude = Some(RegexSet::empty()); - let includes = RegexSet::empty(); - - let client = ClientBuilder::default() - .includes(includes) - .excludes(exclude) - .build() - .unwrap(); - - assert_eq!( - client.excluded(&website_url("https://foo.github.com")), - false - ); - } - - #[tokio::test] - async fn test_include_with_empty_exclude() { - // This is the pre-configured, empty set of excludes for a client - // In this case, only the requests matching the include set will be checked - let exclude = Some(RegexSet::empty()); - let includes = RegexSet::new(&[r"foo.github.com"]).unwrap(); - - let client = ClientBuilder::default() - .includes(includes) - .excludes(exclude) - .build() - .unwrap(); - - assert_eq!( - client.excluded(&website_url("https://foo.github.com")), - false - ); - assert_eq!(client.excluded(&website_url("https://github.com")), true); - assert_eq!( - client.excluded(&website_url("https://bar.github.com")), - true - ); - } - - #[tokio::test] - async fn test_exclude_include_regex() { - let exclude = 
Some(RegexSet::new(&[r"github.com"]).unwrap()); - let includes = RegexSet::new(&[r"foo.github.com"]).unwrap(); - - let client = ClientBuilder::default() - .includes(includes) - .excludes(exclude) - .build() - .unwrap(); - - assert_eq!( - client.excluded(&website_url("https://foo.github.com")), - false - ); - assert_eq!(client.excluded(&website_url("https://github.com")), true); - assert_eq!( - client.excluded(&website_url("https://bar.github.com")), - true - ); - } - - #[tokio::test] - async fn test_exclude_regex() { - let exclude = - Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap()); - - let client = ClientBuilder::default().excludes(exclude).build().unwrap(); - - assert_eq!(client.excluded(&website_url("http://github.com")), true); - assert_eq!(client.excluded(&website_url("http://exclude.org")), true); - assert_eq!( - client.excluded(&Request::new( - Uri::Mail("mail@example.com".to_string()), - Input::Stdin, - )), - true - ); - assert_eq!( - client.excluded(&Request::new( - Uri::Mail("foo@bar.dev".to_string()), - Input::Stdin, - )), - false - ); - } - - #[test] - fn test_const_sanity() { - let get_host = |s| { - Url::parse(s) - .expect("Expected valid URL") - .host() - .expect("Expected host address") - .to_owned() - }; - let into_v4 = |host| match host { - url::Host::Ipv4(ipv4) => ipv4, - _ => panic!("Not IPv4"), - }; - let into_v6 = |host| match host { - url::Host::Ipv6(ipv6) => ipv6, - _ => panic!("Not IPv6"), - }; - - assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private()); - assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private()); - assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private()); - - assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback()); - assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback()); - - assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local()); - } - - #[test] - fn test_excludes_no_private_ips_by_default() { - let client = ClientBuilder::default().build().unwrap(); - - 
assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), false); - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), false); - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), false); - assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), false); - assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), false); - - assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), false); - } - - #[test] - fn test_exclude_private() { - let mut client = ClientBuilder::default().build().unwrap(); - client.excludes.private_ips = true; - - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), true); - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), true); - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), true); - } - - #[test] - fn test_exclude_link_local() { - let mut client = ClientBuilder::default().build().unwrap(); - client.excludes.link_local_ips = true; - - assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), true); - } - - #[test] - fn test_exclude_loopback() { - let mut client = ClientBuilder::default().build().unwrap(); - client.excludes.loopback_ips = true; - - assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), true); - assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), true); - } - - #[test] - fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { - let mut client = ClientBuilder::default().build().unwrap(); - client.excludes.private_ips = true; - client.excludes.link_local_ips = true; - - // if these were pure IPv4, we would exclude - assert_eq!( - client.excluded(&website_url(V6_MAPPED_V4_PRIVATE_CLASS_A)), - false - ); - assert_eq!( - client.excluded(&website_url(V6_MAPPED_V4_LINK_LOCAL)), - false - ); - } } diff --git a/src/extract.rs b/src/extract.rs index a5be89e..b6f7202 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -289,7 +289,7 @@ mod test { #[test] fn test_non_markdown_links() { let input = - "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com"; 
+ "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.org"; let links: HashSet = extract_links(&InputContent::from_string(input, FileType::Plaintext), None) .into_iter() @@ -299,7 +299,7 @@ mod test { let expected = [ website("https://endler.dev"), website("https://hello-rust.show/foo/bar?lol=1"), - Uri::Mail("test@example.com".to_string()), + Uri::Mail("test@example.org".to_string()), ] .iter() .cloned() @@ -330,11 +330,11 @@ mod test { .collect(); let expected_links = [ - website("https://example.com/head/home"), - website("https://example.com/css/style_full_url.css"), + website("https://example.org/head/home"), + website("https://example.org/css/style_full_url.css"), // the body links wouldn't be present if the file was parsed strictly as XML - website("https://example.com/body/a"), - website("https://example.com/body/div_empty_a"), + website("https://example.org/body/a"), + website("https://example.org/body/div_empty_a"), ] .iter() .cloned() @@ -348,21 +348,21 @@ mod test { let input = load_fixture("TEST_HTML5.html"); let links: HashSet = extract_links( &InputContent::from_string(&input, FileType::Html), - Some(Url::parse("https://example.com").unwrap()), + Some(Url::parse("https://example.org").unwrap()), ) .into_iter() .map(|r| r.uri) .collect(); let expected_links = [ - website("https://example.com/head/home"), - website("https://example.com/images/icon.png"), - website("https://example.com/css/style_relative_url.css"), - website("https://example.com/css/style_full_url.css"), - website("https://example.com/js/script.js"), + website("https://example.org/head/home"), + website("https://example.org/images/icon.png"), + website("https://example.org/css/style_relative_url.css"), + website("https://example.org/css/style_full_url.css"), + website("https://example.org/js/script.js"), // the body links wouldn't be present if the file was parsed strictly as XML - website("https://example.com/body/a"), - 
website("https://example.com/body/div_empty_a"), + website("https://example.org/body/a"), + website("https://example.org/body/div_empty_a"), ] .iter() .cloned() @@ -381,7 +381,7 @@ mod test { .map(|r| r.uri) .collect(); - let expected_links = [website("https://example.com/body/a")] + let expected_links = [website("https://example.org/body/a")] .iter() .cloned() .collect(); @@ -400,11 +400,11 @@ mod test { .collect(); let expected_links = [ - website("https://example.com/"), - website("https://example.com/favicon.ico"), + website("https://example.org/"), + website("https://example.org/favicon.ico"), website("https://fonts.externalsite.com"), - website("https://example.com/docs/"), - website("https://example.com/forum"), + website("https://example.org/docs/"), + website("https://example.org/forum"), ] .iter() .cloned() @@ -424,7 +424,7 @@ mod test { .collect(); let expected_links = [Uri::Website( - Url::parse("https://example.com/valid").unwrap(), + Url::parse("https://example.org/valid").unwrap(), )] .iter() .cloned() @@ -444,10 +444,10 @@ mod test { .collect(); let expected_links = [ - website("https://example.com/some-weird-element"), - website("https://example.com/even-weirder-src"), - website("https://example.com/even-weirder-href"), - website("https://example.com/citations"), + website("https://example.org/some-weird-element"), + website("https://example.org/even-weirder-src"), + website("https://example.org/even-weirder-href"), + website("https://example.org/citations"), ] .iter() .cloned() diff --git a/src/excludes.rs b/src/filter/excludes.rs similarity index 99% rename from src/excludes.rs rename to src/filter/excludes.rs index 2d981cb..118a541 100644 --- a/src/excludes.rs +++ b/src/filter/excludes.rs @@ -1,6 +1,5 @@ -use std::net::IpAddr; - use regex::RegexSet; +use std::net::IpAddr; use crate::Uri; diff --git a/src/filter/includes.rs b/src/filter/includes.rs new file mode 100644 index 0000000..5ff0751 --- /dev/null +++ b/src/filter/includes.rs @@ -0,0 
+1,32 @@ +use regex::RegexSet; + +/// Include configuration for the link checker. +/// You can include links based on regex patterns +#[derive(Clone, Debug)] +pub struct Includes { + pub regex: Option, +} + +impl Default for Includes { + fn default() -> Self { + Self { regex: None } + } +} + +impl Includes { + pub fn regex(&self, input: &str) -> bool { + if let Some(includes) = &self.regex { + if includes.is_match(input) { + return true; + } + } + false + } + + pub fn is_empty(&self) -> bool { + match &self.regex { + None => true, + Some(regex_set) => regex_set.is_empty(), + } + } +} diff --git a/src/filter/mod.rs b/src/filter/mod.rs new file mode 100644 index 0000000..adc0f3a --- /dev/null +++ b/src/filter/mod.rs @@ -0,0 +1,259 @@ +mod excludes; +mod includes; + +pub use excludes::Excludes; +pub use includes::Includes; + +use crate::uri::Uri; +use crate::Request; + +/// A generic URI filter +/// Used to decide if a given URI should be checked or skipped +#[derive(Clone, Debug)] +pub struct Filter { + includes: Includes, + excludes: Excludes, + scheme: Option, +} + +impl Filter { + pub fn new( + includes: Option, + excludes: Option, + scheme: Option, + ) -> Self { + let includes = match includes { + Some(includes) => includes, + None => Includes::default(), + }; + let excludes = match excludes { + Some(excludes) => excludes, + None => Excludes::default(), + }; + Filter { + includes, + excludes, + scheme, + } + } + + pub fn excluded(&self, request: &Request) -> bool { + // Skip mail? + if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() { + return true; + } + // Skip specific IP address? + if self.excludes.ip(&request.uri) { + return true; + } + // No regex includes/excludes at all? 
+ if self.includes.is_empty() && self.excludes.is_empty() { + return false; + } + if self.includes.regex(request.uri.as_str()) { + // Includes take precedence over excludes + return false; + } + // In case we have includes and no excludes, + // skip everything that was not included + if !self.includes.is_empty() && self.excludes.is_empty() { + return true; + } + + // We have no includes. Check regex excludes + if self.excludes.regex(request.uri.as_str()) { + return true; + } + + if self.scheme.is_none() { + return false; + } + request.uri.scheme() != self.scheme + } +} + +#[cfg(test)] +mod test { + // Note: the standard library as of Rust stable 1.47.0 does not expose + // "link-local" or "private" IPv6 checks. However, one might argue + // that these concepts do exist in IPv6, albeit the naming is different. + // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6 + // See: https://en.wikipedia.org/wiki/Private_network#IPv6 + // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local + const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1"; + const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1"; + const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1"; + + const V4_LOOPBACK: &str = "http://127.0.0.1"; + const V6_LOOPBACK: &str = "http://[::1]"; + + const V4_LINK_LOCAL: &str = "http://169.254.0.1"; + + // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) + const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; + const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; + + use regex::RegexSet; + use reqwest::Url; + + use super::*; + + use crate::{test_utils::website, Input}; + + /// Helper method to convert a string into a Request + /// Note: This panics on error, so it should only be used for testing + pub fn request(url: &str) -> Request { + Request::new(website(url), Input::Stdin) + } + + #[test] + fn test_const_sanity() { + let get_host = |s| { + Url::parse(s) + .expect("Expected valid URL") 
+ .host() + .expect("Expected host address") + .to_owned() + }; + let into_v4 = |host| match host { + url::Host::Ipv4(ipv4) => ipv4, + _ => panic!("Not IPv4"), + }; + let into_v6 = |host| match host { + url::Host::Ipv6(ipv6) => ipv6, + _ => panic!("Not IPv6"), + }; + + assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private()); + assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private()); + assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private()); + + assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback()); + assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback()); + + assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local()); + } + + #[test] + fn test_includes_and_excludes_empty() { + // This is the pre-configured, empty set of excludes for a client + // In this case, only the requests matching the include set will be checked + let includes = Some(Includes::default()); + let excludes = Some(Excludes::default()); + let filter = Filter::new(includes, excludes, None); + assert_eq!(filter.excluded(&request("https://example.org")), false); + } + + #[test] + fn test_include_regex() { + let includes = Some(Includes { + regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + }); + let filter = Filter::new(includes, None, None); + + // Only the requests matching the include set will be checked + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + assert_eq!(filter.excluded(&request("https://example.org")), true); + } + + #[test] + fn test_exclude_regex() { + let excludes = Excludes { + regex: Some( + RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(), + ), + ..Default::default() + }; + let filter = Filter::new(None, Some(excludes), None); + + assert_eq!(filter.excluded(&request("http://github.com")), true); + assert_eq!(filter.excluded(&request("http://exclude.org")), true); + assert_eq!( + filter.excluded(&Request::new( + 
Uri::Mail("mail@example.org".to_string()), + Input::Stdin, + )), + true + ); + + assert_eq!(filter.excluded(&request("http://bar.dev")), false); + assert_eq!( + filter.excluded(&Request::new( + Uri::Mail("foo@bar.dev".to_string()), + Input::Stdin, + )), + false + ); + } + #[test] + fn test_exclude_include_regex() { + let includes = Some(Includes { + regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + }); + let excludes = Excludes { + regex: Some(RegexSet::new(&[r"example.org"]).unwrap()), + ..Default::default() + }; + + let filter = Filter::new(includes, Some(excludes), None); + + // Includes take preference over excludes + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + + assert_eq!(filter.excluded(&request("https://example.org")), true); + assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + } + + #[test] + fn test_excludes_no_private_ips_by_default() { + let filter = Filter::new(None, None, None); + + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), false); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), false); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), false); + assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), false); + assert_eq!(filter.excluded(&request(V4_LOOPBACK)), false); + assert_eq!(filter.excluded(&request(V6_LOOPBACK)), false); + } + + #[test] + fn test_exclude_private_ips() { + let mut filter = Filter::new(None, None, None); + filter.excludes.private_ips = true; + + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), true); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), true); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), true); + } + + #[test] + fn test_exclude_link_local() { + let mut filter = Filter::new(None, None, None); + filter.excludes.link_local_ips = true; + assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), true); + } + + #[test] + fn test_exclude_loopback() { + let mut filter = Filter::new(None, None, 
None); + filter.excludes.loopback_ips = true; + + assert_eq!(filter.excluded(&request(V4_LOOPBACK)), true); + assert_eq!(filter.excluded(&request(V6_LOOPBACK)), true); + } + + #[test] + fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { + let mut filter = Filter::new(None, None, None); + filter.excludes.private_ips = true; + filter.excludes.link_local_ips = true; + + // if these were pure IPv4, we would exclude + assert_eq!( + filter.excluded(&request(V6_MAPPED_V4_PRIVATE_CLASS_A)), + false + ); + assert_eq!(filter.excluded(&request(V6_MAPPED_V4_LINK_LOCAL)), false); + } +} diff --git a/src/lib.rs b/src/lib.rs index 6cd698e..4afc504 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,7 +41,7 @@ doctest!("../README.md"); mod client; mod client_pool; -mod excludes; +mod filter; mod types; mod uri; @@ -53,6 +53,5 @@ pub use client::check; pub use client::ClientBuilder; pub use client_pool::ClientPool; pub use collector::Input; -pub use excludes::Excludes; pub use types::*; pub use uri::Uri; diff --git a/src/test_utils.rs b/src/test_utils.rs index 811c4c3..9d6b429 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -34,6 +34,8 @@ where mock_server } +/// Helper method to convert a string into a URI +/// Note: This panics on error, so it should only be used for testing pub fn website(url: &str) -> Uri { - Uri::Website(Url::parse(url).unwrap()) + Uri::Website(Url::parse(url).expect("Expected valid Website URI")) } diff --git a/src/uri.rs b/src/uri.rs index 5834a5e..9e2b271 100644 --- a/src/uri.rs +++ b/src/uri.rs @@ -74,16 +74,16 @@ mod test { fn test_uri_from_str() { assert!(matches!(Uri::try_from(""), Err(_))); assert_eq!( - Uri::try_from("http://example.com").unwrap(), - website("http://example.com") + Uri::try_from("http://example.org").unwrap(), + website("http://example.org") ); assert_eq!( - Uri::try_from("mail@example.com").unwrap(), - Uri::Mail("mail@example.com".to_string()) + Uri::try_from("mail@example.org").unwrap(), + 
Uri::Mail("mail@example.org".to_string()) ); assert_eq!( - Uri::try_from("mailto:mail@example.com").unwrap(), - Uri::Mail("mail@example.com".to_string()) + Uri::try_from("mailto:mail@example.org").unwrap(), + Uri::Mail("mail@example.org".to_string()) ); } diff --git a/tests/cli.rs b/tests/cli.rs index 6b98c5d..096779a 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -75,6 +75,7 @@ mod cli { let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md"); cmd.arg(test_github_404_path) + .arg("--no-progress") .env_clear() .assert() .failure()