From 8d165a3cdac319d9210324a87030089747fab205 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 09:37:49 +0100 Subject: [PATCH 01/10] Add support and tests for `.markdown` files --- src/extract.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/extract.rs b/src/extract.rs index 07fe944..a5be89e 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -9,7 +9,7 @@ use std::path::Path; use std::{collections::HashSet, convert::TryFrom}; use url::Url; -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum FileType { Html, Markdown, @@ -28,7 +28,7 @@ impl> From

for FileType { let path = p.as_ref(); match path.extension() { Some(ext) => match ext { - _ if ext == "md" => FileType::Markdown, + _ if (ext == "md" || ext == "markdown") => FileType::Markdown, _ if (ext == "htm" || ext == "html") => FileType::Html, _ => FileType::Plaintext, }, @@ -202,6 +202,25 @@ mod test { content } + #[test] + fn test_file_type() { + assert_eq!(FileType::from(Path::new("test.md")), FileType::Markdown); + assert_eq!( + FileType::from(Path::new("test.markdown")), + FileType::Markdown + ); + assert_eq!(FileType::from(Path::new("test.html")), FileType::Html); + assert_eq!(FileType::from(Path::new("test.txt")), FileType::Plaintext); + assert_eq!( + FileType::from(Path::new("test.something")), + FileType::Plaintext + ); + assert_eq!( + FileType::from(Path::new("/absolute/path/to/test.something")), + FileType::Plaintext + ); + } + #[test] fn test_extract_markdown_links() { let input = "This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)"; From e00cdbf1aefca49a307186ca8b72dd7f88c993d2 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 16:33:33 +0100 Subject: [PATCH 02/10] example.com -> example.org --- .github/workflows/links.yml | 6 +-- README.md | 2 +- fixtures/TEST.md | 8 ++-- fixtures/TEST_HTML5.html | 8 ++-- fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html | 6 +-- fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html | 2 +- fixtures/TEST_HTML5_MALFORMED_LINKS.html | 6 +-- fixtures/TEST_HTML5_MINIFIED.html | 2 +- src/bin/lychee/options.rs | 2 +- src/bin/lychee/stats.rs | 10 ++--- src/extract.rs | 48 +++++++++++----------- src/uri.rs | 12 +++--- 12 files changed, 56 insertions(+), 56 deletions(-) diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml index a1820d3..bd088d5 100644 --- a/.github/workflows/links.yml +++ b/.github/workflows/links.yml @@ -11,12 +11,12 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - + - name: Link Checker uses: 
lycheeverse/lychee-action@master with: - args: --exclude https://example.com/README.md - + args: --exclude 'https://example.org/README.md' + - name: Create Issue From File uses: peter-evans/create-issue-from-file@v2 with: diff --git a/README.md b/README.md index f6c53d6..80676e1 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ OPTIONS: ARGS: ... The inputs (where to get links to check from). These can be: files (e.g. `README.md`), file globs - (e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.com/README.md`) or standard + (e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.org/README.md`) or standard input (`-`). Prefix with `--` to separate inputs from options that allow multiple arguments [default: README.md] ``` diff --git a/fixtures/TEST.md b/fixtures/TEST.md index 81eff74..90c7f5c 100644 --- a/fixtures/TEST.md +++ b/fixtures/TEST.md @@ -16,10 +16,10 @@ Some more complex formatting to test that Markdown parsing works. [![CC0](https://i.creativecommons.org/p/zero/1.0/88x31.png)](https://creativecommons.org/publicdomain/zero/1.0/) Test HTTP and HTTPS for the same site. -http://example.com -https://example.com +http://example.org +https://example.org https://www.peerlyst.com/posts/a-list-of-static-analysis-tools-for-c-c-peerlyst -test@example.com -mailto:test2@example.com +test@example.org +mailto:test2@example.org diff --git a/fixtures/TEST_HTML5.html b/fixtures/TEST_HTML5.html index f9ff015..b6ae28d 100644 --- a/fixtures/TEST_HTML5.html +++ b/fixtures/TEST_HTML5.html @@ -2,13 +2,13 @@ - + Test - + @@ -16,8 +16,8 @@ Hello world. - Link in body + Link in body -

+
diff --git a/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html b/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html index c9464c2..31ede82 100644 --- a/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html +++ b/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html @@ -3,8 +3,8 @@ - - - + + + diff --git a/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html b/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html index 1418934..58ae7b3 100644 --- a/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html +++ b/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html @@ -3,6 +3,6 @@ - Link in body + Link in body diff --git a/fixtures/TEST_HTML5_MALFORMED_LINKS.html b/fixtures/TEST_HTML5_MALFORMED_LINKS.html index 76c2061..ee40f66 100644 --- a/fixtures/TEST_HTML5_MALFORMED_LINKS.html +++ b/fixtures/TEST_HTML5_MALFORMED_LINKS.html @@ -3,8 +3,8 @@ - Malformed link - Malformed link - Valid link + Malformed link + Malformed link + Valid link diff --git a/fixtures/TEST_HTML5_MINIFIED.html b/fixtures/TEST_HTML5_MINIFIED.html index 84c8673..4651593 100644 --- a/fixtures/TEST_HTML5_MINIFIED.html +++ b/fixtures/TEST_HTML5_MINIFIED.html @@ -1 +1 @@ -
+
diff --git a/src/bin/lychee/options.rs b/src/bin/lychee/options.rs index 5227d0e..f72b941 100644 --- a/src/bin/lychee/options.rs +++ b/src/bin/lychee/options.rs @@ -83,7 +83,7 @@ macro_rules! fold_in { pub(crate) struct LycheeOptions { /// The inputs (where to get links to check from). /// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`), - /// remote URLs (e.g. `https://example.com/README.md`) or standard input (`-`). + /// remote URLs (e.g. `https://example.org/README.md`) or standard input (`-`). /// Prefix with `--` to separate inputs from options that allow multiple arguments. #[structopt(name = "inputs", default_value = "README.md")] raw_inputs: Vec, diff --git a/src/bin/lychee/stats.rs b/src/bin/lychee/stats.rs index de53eaa..43ffb23 100644 --- a/src/bin/lychee/stats.rs +++ b/src/bin/lychee/stats.rs @@ -116,17 +116,17 @@ mod test_super { fn test_stats() { let mut stats = ResponseStats::new(); stats.add(Response { - uri: website("http://example.com/ok"), + uri: website("http://example.org/ok"), status: Status::Ok(http::StatusCode::OK), source: Input::Stdin, }); stats.add(Response { - uri: website("http://example.com/failed"), + uri: website("http://example.org/failed"), status: Status::Failed(http::StatusCode::BAD_GATEWAY), source: Input::Stdin, }); stats.add(Response { - uri: website("http://example.com/redirect"), + uri: website("http://example.org/redirect"), status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), source: Input::Stdin, }); @@ -135,12 +135,12 @@ mod test_super { Input::Stdin, vec![ Response { - uri: website("http://example.com/failed"), + uri: website("http://example.org/failed"), status: Status::Failed(http::StatusCode::BAD_GATEWAY), source: Input::Stdin, }, Response { - uri: website("http://example.com/redirect"), + uri: website("http://example.org/redirect"), status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT), source: Input::Stdin, }, diff --git a/src/extract.rs b/src/extract.rs 
index a5be89e..b6f7202 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -289,7 +289,7 @@ mod test { #[test] fn test_non_markdown_links() { let input = - "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com"; + "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.org"; let links: HashSet = extract_links(&InputContent::from_string(input, FileType::Plaintext), None) .into_iter() @@ -299,7 +299,7 @@ mod test { let expected = [ website("https://endler.dev"), website("https://hello-rust.show/foo/bar?lol=1"), - Uri::Mail("test@example.com".to_string()), + Uri::Mail("test@example.org".to_string()), ] .iter() .cloned() @@ -330,11 +330,11 @@ mod test { .collect(); let expected_links = [ - website("https://example.com/head/home"), - website("https://example.com/css/style_full_url.css"), + website("https://example.org/head/home"), + website("https://example.org/css/style_full_url.css"), // the body links wouldn't be present if the file was parsed strictly as XML - website("https://example.com/body/a"), - website("https://example.com/body/div_empty_a"), + website("https://example.org/body/a"), + website("https://example.org/body/div_empty_a"), ] .iter() .cloned() @@ -348,21 +348,21 @@ mod test { let input = load_fixture("TEST_HTML5.html"); let links: HashSet = extract_links( &InputContent::from_string(&input, FileType::Html), - Some(Url::parse("https://example.com").unwrap()), + Some(Url::parse("https://example.org").unwrap()), ) .into_iter() .map(|r| r.uri) .collect(); let expected_links = [ - website("https://example.com/head/home"), - website("https://example.com/images/icon.png"), - website("https://example.com/css/style_relative_url.css"), - website("https://example.com/css/style_full_url.css"), - website("https://example.com/js/script.js"), + website("https://example.org/head/home"), + website("https://example.org/images/icon.png"), + website("https://example.org/css/style_relative_url.css"), + 
website("https://example.org/css/style_full_url.css"), + website("https://example.org/js/script.js"), // the body links wouldn't be present if the file was parsed strictly as XML - website("https://example.com/body/a"), - website("https://example.com/body/div_empty_a"), + website("https://example.org/body/a"), + website("https://example.org/body/div_empty_a"), ] .iter() .cloned() @@ -381,7 +381,7 @@ mod test { .map(|r| r.uri) .collect(); - let expected_links = [website("https://example.com/body/a")] + let expected_links = [website("https://example.org/body/a")] .iter() .cloned() .collect(); @@ -400,11 +400,11 @@ mod test { .collect(); let expected_links = [ - website("https://example.com/"), - website("https://example.com/favicon.ico"), + website("https://example.org/"), + website("https://example.org/favicon.ico"), website("https://fonts.externalsite.com"), - website("https://example.com/docs/"), - website("https://example.com/forum"), + website("https://example.org/docs/"), + website("https://example.org/forum"), ] .iter() .cloned() @@ -424,7 +424,7 @@ mod test { .collect(); let expected_links = [Uri::Website( - Url::parse("https://example.com/valid").unwrap(), + Url::parse("https://example.org/valid").unwrap(), )] .iter() .cloned() @@ -444,10 +444,10 @@ mod test { .collect(); let expected_links = [ - website("https://example.com/some-weird-element"), - website("https://example.com/even-weirder-src"), - website("https://example.com/even-weirder-href"), - website("https://example.com/citations"), + website("https://example.org/some-weird-element"), + website("https://example.org/even-weirder-src"), + website("https://example.org/even-weirder-href"), + website("https://example.org/citations"), ] .iter() .cloned() diff --git a/src/uri.rs b/src/uri.rs index 5834a5e..9e2b271 100644 --- a/src/uri.rs +++ b/src/uri.rs @@ -74,16 +74,16 @@ mod test { fn test_uri_from_str() { assert!(matches!(Uri::try_from(""), Err(_))); assert_eq!( - 
Uri::try_from("http://example.com").unwrap(), - website("http://example.com") + Uri::try_from("http://example.org").unwrap(), + website("http://example.org") ); assert_eq!( - Uri::try_from("mail@example.com").unwrap(), - Uri::Mail("mail@example.com".to_string()) + Uri::try_from("mail@example.org").unwrap(), + Uri::Mail("mail@example.org".to_string()) ); assert_eq!( - Uri::try_from("mailto:mail@example.com").unwrap(), - Uri::Mail("mail@example.com".to_string()) + Uri::try_from("mailto:mail@example.org").unwrap(), + Uri::Mail("mail@example.org".to_string()) ); } From ca71a5df2d31048071eb078348b3697fbadfc37d Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 16:34:22 +0100 Subject: [PATCH 03/10] Add filter module that encapsulates includes/excludes --- src/client.rs | 291 ++++------------------------------------- src/excludes.rs | 79 ----------- src/filter/filter.rs | 260 ++++++++++++++++++++++++++++++++++++ src/filter/includes.rs | 36 +++++ src/filter/mod.rs | 7 + src/lib.rs | 3 +- 6 files changed, 329 insertions(+), 347 deletions(-) delete mode 100644 src/excludes.rs create mode 100644 src/filter/filter.rs create mode 100644 src/filter/includes.rs create mode 100644 src/filter/mod.rs diff --git a/src/client.rs b/src/client.rs index 8ef305c..eb493bd 100644 --- a/src/client.rs +++ b/src/client.rs @@ -10,9 +10,12 @@ use std::{collections::HashSet, time::Duration}; use tokio::time::sleep; use url::Url; +use crate::filter::Excludes; +use crate::filter::Filter; +use crate::filter::Includes; use crate::types::{Response, Status}; use crate::uri::Uri; -use crate::{excludes::Excludes, Request}; +use crate::Request; const VERSION: &str = env!("CARGO_PKG_VERSION"); const DEFAULT_MAX_REDIRECTS: usize = 5; @@ -21,9 +24,7 @@ const DEFAULT_MAX_REDIRECTS: usize = 5; pub struct Client { reqwest_client: reqwest::Client, github: Option, - includes: Option, - excludes: Excludes, - scheme: Option, + filter: Filter, method: reqwest::Method, accepted: Option>, } @@ -90,6 
+91,12 @@ impl ClientBuilder { } } + fn build_includes(&mut self) -> Includes { + Includes { + regex: self.includes.clone().unwrap_or_default(), + } + } + /// The build method instantiates the client. pub fn build(&mut self) -> Result { let mut headers = HeaderMap::new(); @@ -140,12 +147,15 @@ impl ClientBuilder { let scheme = self.scheme.clone().unwrap_or(None); let scheme = scheme.map(|s| s.to_lowercase()); + let includes = self.build_includes(); + let excludes = self.build_excludes(); + + let filter = Filter::new(Some(includes), Some(excludes), scheme); + Ok(Client { reqwest_client, github, - includes: self.includes.clone().unwrap_or(None), - excludes: self.build_excludes(), - scheme, + filter, method: self.method.clone().unwrap_or(reqwest::Method::GET), accepted: self.accepted.clone().unwrap_or(None), }) @@ -156,9 +166,9 @@ impl Client { pub async fn check>(&self, request: T) -> Result { let request: Request = match request.try_into() { Ok(request) => request, - Err(_e) => bail!("Invalid URI:"), + Err(_e) => bail!("Invalid URI"), }; - if self.excluded(&request) { + if self.filter.excluded(&request) { return Ok(Response::new(request.uri, Status::Excluded, request.source)); } let status = match request.uri { @@ -252,37 +262,6 @@ impl Client { } } } - - pub fn excluded(&self, request: &Request) -> bool { - if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() { - return true; - } - if self.excludes.ip(&request.uri) { - return true; - } - if let Some(includes) = &self.includes { - if includes.is_empty() { - return false; - } - if includes.is_match(request.uri.as_str()) { - // Includes take precedence over excludes - return false; - } else { - // In case we have includes and no excludes, - // skip everything that was not included - if self.excludes.is_empty() { - return true; - } - } - } - if self.excludes.regex(request.uri.as_str()) { - return true; - } - if self.scheme.is_none() { - return false; - } - request.uri.scheme() != self.scheme - } } 
/// A convenience function to check a single URI @@ -295,41 +274,12 @@ pub async fn check>(request: T) -> Result { #[cfg(test)] mod test { - use crate::collector::Input; - use super::*; use http::StatusCode; use std::time::{Duration, Instant}; - use url::Url; use wiremock::matchers::method; use wiremock::{Mock, MockServer, ResponseTemplate}; - // Note: the standard library as of Rust stable 1.47.0 does not expose - // "link-local" or "private" IPv6 checks. However, one might argue - // that these concepts do exist in IPv6, albeit the naming is different. - // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6 - // See: https://en.wikipedia.org/wiki/Private_network#IPv6 - // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local - const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1"; - const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1"; - const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1"; - - const V4_LOOPBACK: &str = "http://127.0.0.1"; - const V6_LOOPBACK: &str = "http://[::1]"; - - const V4_LINK_LOCAL: &str = "http://169.254.0.1"; - - // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) - const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; - const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; - - fn website_url(s: &str) -> Request { - Request::new( - Uri::Website(Url::parse(s).expect("Expected valid Website URI")), - Input::Stdin, - ) - } - #[tokio::test] async fn test_nonexistent() { let template = ResponseTemplate::new(404); @@ -361,7 +311,7 @@ mod test { let res = ClientBuilder::default() .build() .unwrap() - .check(website_url(&mock_server.uri())) + .check(mock_server.uri()) .await .unwrap(); let end = start.elapsed(); @@ -390,7 +340,7 @@ mod test { ClientBuilder::default() .build() .unwrap() - .check(website_url("https://github.com/lycheeverse/lychee")) + .check("https://github.com/lycheeverse/lychee") .await .unwrap() .status, @@ -422,7 +372,7 @@ mod test { let res = 
ClientBuilder::default() .build() .unwrap() - .check(website_url(&mock_server.uri())) + .check(mock_server.uri()) .await .unwrap() .status; @@ -455,7 +405,7 @@ mod test { let res = ClientBuilder::default() .build() .unwrap() - .check(website_url("https://crates.io/crates/lychee")) + .check("https://crates.io/crates/lychee") .await .unwrap(); assert!(matches!(res.status, Status::Failed(StatusCode::NOT_FOUND))); @@ -469,7 +419,7 @@ mod test { .custom_headers(custom) .build() .unwrap() - .check(website_url("https://crates.io/crates/lychee")) + .check("https://crates.io/crates/lychee") .await .unwrap(); assert!(matches!(res.status, Status::Ok(_))); @@ -496,198 +446,7 @@ mod test { .build() .unwrap(); - let resp = client.check(website_url(&mock_server.uri())).await.unwrap(); + let resp = client.check(mock_server.uri()).await.unwrap(); assert!(matches!(resp.status, Status::Timeout(_))); } - - #[tokio::test] - async fn test_include_regex() { - let includes = RegexSet::new(&[r"foo.github.com"]).unwrap(); - - let client = ClientBuilder::default().includes(includes).build().unwrap(); - - assert_eq!( - client.excluded(&website_url("https://foo.github.com")), - false - ); - assert_eq!( - client.excluded(&website_url("https://bar.github.com")), - true - ); - } - - #[tokio::test] - async fn test_includes_and_excludes_empty() { - // This is the pre-configured, empty set of excludes for a client - // In this case, only the requests matching the include set will be checked - let exclude = Some(RegexSet::empty()); - let includes = RegexSet::empty(); - - let client = ClientBuilder::default() - .includes(includes) - .excludes(exclude) - .build() - .unwrap(); - - assert_eq!( - client.excluded(&website_url("https://foo.github.com")), - false - ); - } - - #[tokio::test] - async fn test_include_with_empty_exclude() { - // This is the pre-configured, empty set of excludes for a client - // In this case, only the requests matching the include set will be checked - let exclude = 
Some(RegexSet::empty()); - let includes = RegexSet::new(&[r"foo.github.com"]).unwrap(); - - let client = ClientBuilder::default() - .includes(includes) - .excludes(exclude) - .build() - .unwrap(); - - assert_eq!( - client.excluded(&website_url("https://foo.github.com")), - false - ); - assert_eq!(client.excluded(&website_url("https://github.com")), true); - assert_eq!( - client.excluded(&website_url("https://bar.github.com")), - true - ); - } - - #[tokio::test] - async fn test_exclude_include_regex() { - let exclude = Some(RegexSet::new(&[r"github.com"]).unwrap()); - let includes = RegexSet::new(&[r"foo.github.com"]).unwrap(); - - let client = ClientBuilder::default() - .includes(includes) - .excludes(exclude) - .build() - .unwrap(); - - assert_eq!( - client.excluded(&website_url("https://foo.github.com")), - false - ); - assert_eq!(client.excluded(&website_url("https://github.com")), true); - assert_eq!( - client.excluded(&website_url("https://bar.github.com")), - true - ); - } - - #[tokio::test] - async fn test_exclude_regex() { - let exclude = - Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap()); - - let client = ClientBuilder::default().excludes(exclude).build().unwrap(); - - assert_eq!(client.excluded(&website_url("http://github.com")), true); - assert_eq!(client.excluded(&website_url("http://exclude.org")), true); - assert_eq!( - client.excluded(&Request::new( - Uri::Mail("mail@example.com".to_string()), - Input::Stdin, - )), - true - ); - assert_eq!( - client.excluded(&Request::new( - Uri::Mail("foo@bar.dev".to_string()), - Input::Stdin, - )), - false - ); - } - - #[test] - fn test_const_sanity() { - let get_host = |s| { - Url::parse(s) - .expect("Expected valid URL") - .host() - .expect("Expected host address") - .to_owned() - }; - let into_v4 = |host| match host { - url::Host::Ipv4(ipv4) => ipv4, - _ => panic!("Not IPv4"), - }; - let into_v6 = |host| match host { - url::Host::Ipv6(ipv6) => ipv6, - _ => panic!("Not IPv6"), 
- }; - - assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private()); - assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private()); - assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private()); - - assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback()); - assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback()); - - assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local()); - } - - #[test] - fn test_excludes_no_private_ips_by_default() { - let client = ClientBuilder::default().build().unwrap(); - - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), false); - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), false); - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), false); - assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), false); - assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), false); - - assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), false); - } - - #[test] - fn test_exclude_private() { - let mut client = ClientBuilder::default().build().unwrap(); - client.excludes.private_ips = true; - - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), true); - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), true); - assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), true); - } - - #[test] - fn test_exclude_link_local() { - let mut client = ClientBuilder::default().build().unwrap(); - client.excludes.link_local_ips = true; - - assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), true); - } - - #[test] - fn test_exclude_loopback() { - let mut client = ClientBuilder::default().build().unwrap(); - client.excludes.loopback_ips = true; - - assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), true); - assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), true); - } - - #[test] - fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { - let mut client = ClientBuilder::default().build().unwrap(); - client.excludes.private_ips = true; - client.excludes.link_local_ips = true; - - // if 
these were pure IPv4, we would exclude - assert_eq!( - client.excluded(&website_url(V6_MAPPED_V4_PRIVATE_CLASS_A)), - false - ); - assert_eq!( - client.excluded(&website_url(V6_MAPPED_V4_LINK_LOCAL)), - false - ); - } } diff --git a/src/excludes.rs b/src/excludes.rs deleted file mode 100644 index 2d981cb..0000000 --- a/src/excludes.rs +++ /dev/null @@ -1,79 +0,0 @@ -use std::net::IpAddr; - -use regex::RegexSet; - -use crate::Uri; - -/// Exclude configuration for the link checker. -/// You can ignore links based on regex patterns or pre-defined IP ranges. -#[derive(Clone, Debug)] -pub struct Excludes { - pub regex: Option, - /// Example: 192.168.0.1 - pub private_ips: bool, - /// Example: 169.254.0.0 - pub link_local_ips: bool, - /// For IPv4: 127.0. 0.1/8 - /// For IPv6: ::1/128 - pub loopback_ips: bool, - /// Example: octocat@github.com - pub mail: bool, -} - -impl Default for Excludes { - fn default() -> Self { - Self { - regex: None, - private_ips: false, - link_local_ips: false, - loopback_ips: false, - mail: false, - } - } -} - -impl Excludes { - pub fn regex(&self, input: &str) -> bool { - if let Some(excludes) = &self.regex { - if excludes.is_match(input) { - return true; - } - } - false - } - - pub fn ip(&self, uri: &Uri) -> bool { - if let Some(ipaddr) = uri.host_ip() { - if self.loopback_ips && ipaddr.is_loopback() { - return true; - } - - // Note: in a pathological case, an IPv6 address can be IPv4-mapped - // (IPv4 address embedded in a IPv6). We purposefully - // don't deal with it here, and assume if an address is IPv6, - // we shouldn't attempt to map it to IPv4. 
- // See: https://tools.ietf.org/html/rfc4291#section-2.5.5.2 - if let IpAddr::V4(v4addr) = ipaddr { - if self.private_ips && v4addr.is_private() { - return true; - } - if self.link_local_ips && v4addr.is_link_local() { - return true; - } - } - } - - false - } - - pub fn is_mail_excluded(&self) -> bool { - self.mail - } - - pub fn is_empty(&self) -> bool { - match &self.regex { - None => true, - Some(regex_set) => regex_set.is_empty(), - } - } -} diff --git a/src/filter/filter.rs b/src/filter/filter.rs new file mode 100644 index 0000000..34350ef --- /dev/null +++ b/src/filter/filter.rs @@ -0,0 +1,260 @@ +use crate::uri::Uri; +use crate::Request; + +use super::{excludes::Excludes, includes::Includes}; + +/// A generic URI filter +/// Used to decide if a given URI should be checked or skipped +#[derive(Clone, Debug)] +pub struct Filter { + includes: Includes, + excludes: Excludes, + scheme: Option, +} + +impl Filter { + pub fn new( + includes: Option, + excludes: Option, + scheme: Option, + ) -> Self { + let includes = match includes { + Some(includes) => includes, + None => Includes::default(), + }; + let excludes = match excludes { + Some(excludes) => excludes, + None => Excludes::default(), + }; + Filter { + includes, + excludes, + scheme, + } + } + + pub fn excluded(&self, request: &Request) -> bool { + // Skip mail? + if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() { + return true; + } + // Skip specific IP address? + if self.excludes.ip(&request.uri) { + return true; + } + // No regex includes/excludes at all? + if self.includes.is_empty() && self.excludes.is_empty() { + return false; + } + if self.includes.regex(request.uri.as_str()) { + // Includes take precedence over excludes + return false; + } + // In case we have includes and no excludes, + // skip everything that was not included + if !self.includes.is_empty() && self.excludes.is_empty() { + return true; + } + + // We have no includes. 
Check regex excludes + if self.excludes.regex(request.uri.as_str()) { + return true; + } + + if self.scheme.is_none() { + return false; + } + request.uri.scheme() != self.scheme + } +} + +#[cfg(test)] +mod test { + // Note: the standard library as of Rust stable 1.47.0 does not expose + // "link-local" or "private" IPv6 checks. However, one might argue + // that these concepts do exist in IPv6, albeit the naming is different. + // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6 + // See: https://en.wikipedia.org/wiki/Private_network#IPv6 + // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local + const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1"; + const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1"; + const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1"; + + const V4_LOOPBACK: &str = "http://127.0.0.1"; + const V6_LOOPBACK: &str = "http://[::1]"; + + const V4_LINK_LOCAL: &str = "http://169.254.0.1"; + + // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) + const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; + const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; + + use regex::RegexSet; + use reqwest::Url; + + use super::*; + + use crate::{test_utils::website, Input}; + + /// Helper method to convert a string into a Request + /// Note: This panics on error, so it should only be used for testing + pub fn request(url: &str) -> Request { + Request::new(website(url), Input::Stdin) + } + + #[test] + fn test_const_sanity() { + let get_host = |s| { + Url::parse(s) + .expect("Expected valid URL") + .host() + .expect("Expected host address") + .to_owned() + }; + let into_v4 = |host| match host { + url::Host::Ipv4(ipv4) => ipv4, + _ => panic!("Not IPv4"), + }; + let into_v6 = |host| match host { + url::Host::Ipv6(ipv6) => ipv6, + _ => panic!("Not IPv6"), + }; + + assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private()); + 
assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private()); + assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private()); + + assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback()); + assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback()); + + assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local()); + } + + #[test] + fn test_include_regex() { + let includes = Some(Includes::new(Some( + RegexSet::new(&[r"foo.example.org"]).unwrap(), + ))); + let filter = Filter::new(includes, None, None); + + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + } + + #[test] + fn test_includes_and_excludes_empty() { + // This is the pre-configured, empty set of excludes for a client + // In this case, only the requests matching the include set will be checked + let includes = Some(Includes::default()); + let excludes = Some(Excludes::default()); + let filter = Filter::new(includes, excludes, None); + assert_eq!(filter.excluded(&request("https://example.org")), false); + } + + #[test] + fn test_include_with_empty_exclude() { + let includes = Some(Includes::new(Some( + RegexSet::new(&[r"foo.example.org"]).unwrap(), + ))); + // This is the pre-configured, empty set of excludes for a client + let excludes = Some(Excludes::default()); + let filter = Filter::new(includes, excludes, None); + + // In this case, only the requests matching the include set will be checked + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + assert_eq!(filter.excluded(&request("https://example.org")), true); + assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + } + + #[test] + fn test_exclude_include_regex() { + let includes = Some(Includes::new(Some( + RegexSet::new(&[r"foo.example.org"]).unwrap(), + ))); + let mut excludes = Excludes::default(); + excludes.regex = Some(RegexSet::new(&[r"example.org"]).unwrap()); + let filter = Filter::new(includes, Some(excludes), 
None); + + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + assert_eq!(filter.excluded(&request("https://example.org")), true); + assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + } + + #[test] + fn test_exclude_regex() { + let mut excludes = Excludes::default(); + excludes.regex = + Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap()); + let filter = Filter::new(None, Some(excludes), None); + + assert_eq!(filter.excluded(&request("http://github.com")), true); + assert_eq!(filter.excluded(&request("http://exclude.org")), true); + assert_eq!( + filter.excluded(&Request::new( + Uri::Mail("mail@example.org".to_string()), + Input::Stdin, + )), + true + ); + assert_eq!( + filter.excluded(&Request::new( + Uri::Mail("foo@bar.dev".to_string()), + Input::Stdin, + )), + false + ); + } + + #[test] + fn test_excludes_no_private_ips_by_default() { + let filter = Filter::new(None, None, None); + + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), false); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), false); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), false); + assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), false); + assert_eq!(filter.excluded(&request(V4_LOOPBACK)), false); + + assert_eq!(filter.excluded(&request(V6_LOOPBACK)), false); + } + + #[test] + fn test_exclude_private() { + let mut filter = Filter::new(None, None, None); + filter.excludes.private_ips = true; + + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), true); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), true); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), true); + } + + #[test] + fn test_exclude_link_local() { + let mut filter = Filter::new(None, None, None); + filter.excludes.link_local_ips = true; + assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), true); + } + + #[test] + fn test_exclude_loopback() { + let mut filter = Filter::new(None, None, 
None); + filter.excludes.loopback_ips = true; + + assert_eq!(filter.excluded(&request(V4_LOOPBACK)), true); + assert_eq!(filter.excluded(&request(V6_LOOPBACK)), true); + } + + #[test] + fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { + let mut filter = Filter::new(None, None, None); + filter.excludes.private_ips = true; + filter.excludes.link_local_ips = true; + + // if these were pure IPv4, we would exclude + assert_eq!( + filter.excluded(&request(V6_MAPPED_V4_PRIVATE_CLASS_A)), + false + ); + assert_eq!(filter.excluded(&request(V6_MAPPED_V4_LINK_LOCAL)), false); + } +} diff --git a/src/filter/includes.rs b/src/filter/includes.rs new file mode 100644 index 0000000..d9e0c6c --- /dev/null +++ b/src/filter/includes.rs @@ -0,0 +1,36 @@ +use regex::RegexSet; + +/// Include configuration for the link checker. +/// You can include links based on regex patterns +#[derive(Clone, Debug)] +pub struct Includes { + pub regex: Option, +} + +impl Default for Includes { + fn default() -> Self { + Self { regex: None } + } +} + +impl Includes { + pub fn new(regex: Option) -> Self { + Self { regex } + } + + pub fn regex(&self, input: &str) -> bool { + if let Some(includes) = &self.regex { + if includes.is_match(input) { + return true; + } + } + false + } + + pub fn is_empty(&self) -> bool { + match &self.regex { + None => true, + Some(regex_set) => regex_set.is_empty(), + } + } +} diff --git a/src/filter/mod.rs b/src/filter/mod.rs new file mode 100644 index 0000000..158541e --- /dev/null +++ b/src/filter/mod.rs @@ -0,0 +1,7 @@ +mod excludes; +mod filter; +mod includes; + +pub use excludes::Excludes; +pub use filter::Filter; +pub use includes::Includes; diff --git a/src/lib.rs b/src/lib.rs index 6cd698e..4afc504 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,7 +41,7 @@ doctest!("../README.md"); mod client; mod client_pool; -mod excludes; +mod filter; mod types; mod uri; @@ -53,6 +53,5 @@ pub use client::check; pub use client::ClientBuilder; pub use client_pool::ClientPool; pub 
use collector::Input; -pub use excludes::Excludes; pub use types::*; pub use uri::Uri; From 9fe1244f8ccc346dbb087699ba0d1211449a2314 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 16:34:46 +0100 Subject: [PATCH 04/10] Add expect for website function in test_utils --- src/test_utils.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/test_utils.rs b/src/test_utils.rs index 811c4c3..9d6b429 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -34,6 +34,8 @@ where mock_server } +/// Helper method to convert a string into a URI +/// Note: This panics on error, so it should only be used for testing pub fn website(url: &str) -> Uri { - Uri::Website(Url::parse(url).unwrap()) + Uri::Website(Url::parse(url).expect("Expected valid Website URI")) } From 1f6cbd1aa48ba373a7b5b0fe8a58b3494486cbc2 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 16:35:13 +0100 Subject: [PATCH 05/10] Add excludes to filter module --- src/filter/excludes.rs | 78 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/filter/excludes.rs diff --git a/src/filter/excludes.rs b/src/filter/excludes.rs new file mode 100644 index 0000000..118a541 --- /dev/null +++ b/src/filter/excludes.rs @@ -0,0 +1,78 @@ +use regex::RegexSet; +use std::net::IpAddr; + +use crate::Uri; + +/// Exclude configuration for the link checker. +/// You can ignore links based on regex patterns or pre-defined IP ranges. +#[derive(Clone, Debug)] +pub struct Excludes { + pub regex: Option, + /// Example: 192.168.0.1 + pub private_ips: bool, + /// Example: 169.254.0.0 + pub link_local_ips: bool, + /// For IPv4: 127.0. 
0.1/8 + /// For IPv6: ::1/128 + pub loopback_ips: bool, + /// Example: octocat@github.com + pub mail: bool, +} + +impl Default for Excludes { + fn default() -> Self { + Self { + regex: None, + private_ips: false, + link_local_ips: false, + loopback_ips: false, + mail: false, + } + } +} + +impl Excludes { + pub fn regex(&self, input: &str) -> bool { + if let Some(excludes) = &self.regex { + if excludes.is_match(input) { + return true; + } + } + false + } + + pub fn ip(&self, uri: &Uri) -> bool { + if let Some(ipaddr) = uri.host_ip() { + if self.loopback_ips && ipaddr.is_loopback() { + return true; + } + + // Note: in a pathological case, an IPv6 address can be IPv4-mapped + // (IPv4 address embedded in a IPv6). We purposefully + // don't deal with it here, and assume if an address is IPv6, + // we shouldn't attempt to map it to IPv4. + // See: https://tools.ietf.org/html/rfc4291#section-2.5.5.2 + if let IpAddr::V4(v4addr) = ipaddr { + if self.private_ips && v4addr.is_private() { + return true; + } + if self.link_local_ips && v4addr.is_link_local() { + return true; + } + } + } + + false + } + + pub fn is_mail_excluded(&self) -> bool { + self.mail + } + + pub fn is_empty(&self) -> bool { + match &self.regex { + None => true, + Some(regex_set) => regex_set.is_empty(), + } + } +} From fa1952dd98c1215c05aa705d0305b8c3dab586c0 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 16:42:09 +0100 Subject: [PATCH 06/10] cleanup --- src/filter/filter.rs | 260 ----------------------------------------- src/filter/includes.rs | 4 - src/filter/mod.rs | 260 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 258 insertions(+), 266 deletions(-) delete mode 100644 src/filter/filter.rs diff --git a/src/filter/filter.rs b/src/filter/filter.rs deleted file mode 100644 index 34350ef..0000000 --- a/src/filter/filter.rs +++ /dev/null @@ -1,260 +0,0 @@ -use crate::uri::Uri; -use crate::Request; - -use super::{excludes::Excludes, includes::Includes}; - -/// A generic 
URI filter -/// Used to decide if a given URI should be checked or skipped -#[derive(Clone, Debug)] -pub struct Filter { - includes: Includes, - excludes: Excludes, - scheme: Option, -} - -impl Filter { - pub fn new( - includes: Option, - excludes: Option, - scheme: Option, - ) -> Self { - let includes = match includes { - Some(includes) => includes, - None => Includes::default(), - }; - let excludes = match excludes { - Some(excludes) => excludes, - None => Excludes::default(), - }; - Filter { - includes, - excludes, - scheme, - } - } - - pub fn excluded(&self, request: &Request) -> bool { - // Skip mail? - if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() { - return true; - } - // Skip specific IP address? - if self.excludes.ip(&request.uri) { - return true; - } - // No regex includes/excludes at all? - if self.includes.is_empty() && self.excludes.is_empty() { - return false; - } - if self.includes.regex(request.uri.as_str()) { - // Includes take precedence over excludes - return false; - } - // In case we have includes and no excludes, - // skip everything that was not included - if !self.includes.is_empty() && self.excludes.is_empty() { - return true; - } - - // We have no includes. Check regex excludes - if self.excludes.regex(request.uri.as_str()) { - return true; - } - - if self.scheme.is_none() { - return false; - } - request.uri.scheme() != self.scheme - } -} - -#[cfg(test)] -mod test { - // Note: the standard library as of Rust stable 1.47.0 does not expose - // "link-local" or "private" IPv6 checks. However, one might argue - // that these concepts do exist in IPv6, albeit the naming is different. 
- // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6 - // See: https://en.wikipedia.org/wiki/Private_network#IPv6 - // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local - const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1"; - const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1"; - const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1"; - - const V4_LOOPBACK: &str = "http://127.0.0.1"; - const V6_LOOPBACK: &str = "http://[::1]"; - - const V4_LINK_LOCAL: &str = "http://169.254.0.1"; - - // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) - const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; - const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; - - use regex::RegexSet; - use reqwest::Url; - - use super::*; - - use crate::{test_utils::website, Input}; - - /// Helper method to convert a string into a Request - /// Note: This panics on error, so it should only be used for testing - pub fn request(url: &str) -> Request { - Request::new(website(url), Input::Stdin) - } - - #[test] - fn test_const_sanity() { - let get_host = |s| { - Url::parse(s) - .expect("Expected valid URL") - .host() - .expect("Expected host address") - .to_owned() - }; - let into_v4 = |host| match host { - url::Host::Ipv4(ipv4) => ipv4, - _ => panic!("Not IPv4"), - }; - let into_v6 = |host| match host { - url::Host::Ipv6(ipv6) => ipv6, - _ => panic!("Not IPv6"), - }; - - assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private()); - assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private()); - assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private()); - - assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback()); - assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback()); - - assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local()); - } - - #[test] - fn test_include_regex() { - let includes = Some(Includes::new(Some( - RegexSet::new(&[r"foo.example.org"]).unwrap(), - ))); - let filter = Filter::new(includes, None, 
None); - - assert_eq!(filter.excluded(&request("https://foo.example.org")), false); - assert_eq!(filter.excluded(&request("https://bar.example.org")), true); - } - - #[test] - fn test_includes_and_excludes_empty() { - // This is the pre-configured, empty set of excludes for a client - // In this case, only the requests matching the include set will be checked - let includes = Some(Includes::default()); - let excludes = Some(Excludes::default()); - let filter = Filter::new(includes, excludes, None); - assert_eq!(filter.excluded(&request("https://example.org")), false); - } - - #[test] - fn test_include_with_empty_exclude() { - let includes = Some(Includes::new(Some( - RegexSet::new(&[r"foo.example.org"]).unwrap(), - ))); - // This is the pre-configured, empty set of excludes for a client - let excludes = Some(Excludes::default()); - let filter = Filter::new(includes, excludes, None); - - // In this case, only the requests matching the include set will be checked - assert_eq!(filter.excluded(&request("https://foo.example.org")), false); - assert_eq!(filter.excluded(&request("https://example.org")), true); - assert_eq!(filter.excluded(&request("https://bar.example.org")), true); - } - - #[test] - fn test_exclude_include_regex() { - let includes = Some(Includes::new(Some( - RegexSet::new(&[r"foo.example.org"]).unwrap(), - ))); - let mut excludes = Excludes::default(); - excludes.regex = Some(RegexSet::new(&[r"example.org"]).unwrap()); - let filter = Filter::new(includes, Some(excludes), None); - - assert_eq!(filter.excluded(&request("https://foo.example.org")), false); - assert_eq!(filter.excluded(&request("https://example.org")), true); - assert_eq!(filter.excluded(&request("https://bar.example.org")), true); - } - - #[test] - fn test_exclude_regex() { - let mut excludes = Excludes::default(); - excludes.regex = - Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap()); - let filter = Filter::new(None, Some(excludes), None); - - 
assert_eq!(filter.excluded(&request("http://github.com")), true); - assert_eq!(filter.excluded(&request("http://exclude.org")), true); - assert_eq!( - filter.excluded(&Request::new( - Uri::Mail("mail@example.org".to_string()), - Input::Stdin, - )), - true - ); - assert_eq!( - filter.excluded(&Request::new( - Uri::Mail("foo@bar.dev".to_string()), - Input::Stdin, - )), - false - ); - } - - #[test] - fn test_excludes_no_private_ips_by_default() { - let filter = Filter::new(None, None, None); - - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), false); - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), false); - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), false); - assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), false); - assert_eq!(filter.excluded(&request(V4_LOOPBACK)), false); - - assert_eq!(filter.excluded(&request(V6_LOOPBACK)), false); - } - - #[test] - fn test_exclude_private() { - let mut filter = Filter::new(None, None, None); - filter.excludes.private_ips = true; - - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), true); - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), true); - assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), true); - } - - #[test] - fn test_exclude_link_local() { - let mut filter = Filter::new(None, None, None); - filter.excludes.link_local_ips = true; - assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), true); - } - - #[test] - fn test_exclude_loopback() { - let mut filter = Filter::new(None, None, None); - filter.excludes.loopback_ips = true; - - assert_eq!(filter.excluded(&request(V4_LOOPBACK)), true); - assert_eq!(filter.excluded(&request(V6_LOOPBACK)), true); - } - - #[test] - fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { - let mut filter = Filter::new(None, None, None); - filter.excludes.private_ips = true; - filter.excludes.link_local_ips = true; - - // if these were pure IPv4, we would exclude - assert_eq!( - 
filter.excluded(&request(V6_MAPPED_V4_PRIVATE_CLASS_A)), - false - ); - assert_eq!(filter.excluded(&request(V6_MAPPED_V4_LINK_LOCAL)), false); - } -} diff --git a/src/filter/includes.rs b/src/filter/includes.rs index d9e0c6c..5ff0751 100644 --- a/src/filter/includes.rs +++ b/src/filter/includes.rs @@ -14,10 +14,6 @@ impl Default for Includes { } impl Includes { - pub fn new(regex: Option) -> Self { - Self { regex } - } - pub fn regex(&self, input: &str) -> bool { if let Some(includes) = &self.regex { if includes.is_match(input) { diff --git a/src/filter/mod.rs b/src/filter/mod.rs index 158541e..83d46f2 100644 --- a/src/filter/mod.rs +++ b/src/filter/mod.rs @@ -1,7 +1,263 @@ mod excludes; -mod filter; mod includes; pub use excludes::Excludes; -pub use filter::Filter; pub use includes::Includes; + +use crate::uri::Uri; +use crate::Request; + +/// A generic URI filter +/// Used to decide if a given URI should be checked or skipped +#[derive(Clone, Debug)] +pub struct Filter { + includes: Includes, + excludes: Excludes, + scheme: Option, +} + +impl Filter { + pub fn new( + includes: Option, + excludes: Option, + scheme: Option, + ) -> Self { + let includes = match includes { + Some(includes) => includes, + None => Includes::default(), + }; + let excludes = match excludes { + Some(excludes) => excludes, + None => Excludes::default(), + }; + Filter { + includes, + excludes, + scheme, + } + } + + pub fn excluded(&self, request: &Request) -> bool { + // Skip mail? + if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() { + return true; + } + // Skip specific IP address? + if self.excludes.ip(&request.uri) { + return true; + } + // No regex includes/excludes at all? 
+ if self.includes.is_empty() && self.excludes.is_empty() { + return false; + } + if self.includes.regex(request.uri.as_str()) { + // Includes take precedence over excludes + return false; + } + // In case we have includes and no excludes, + // skip everything that was not included + if !self.includes.is_empty() && self.excludes.is_empty() { + return true; + } + + // We have no includes. Check regex excludes + if self.excludes.regex(request.uri.as_str()) { + return true; + } + + if self.scheme.is_none() { + return false; + } + request.uri.scheme() != self.scheme + } +} + +#[cfg(test)] +mod test { + // Note: the standard library as of Rust stable 1.47.0 does not expose + // "link-local" or "private" IPv6 checks. However, one might argue + // that these concepts do exist in IPv6, albeit the naming is different. + // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6 + // See: https://en.wikipedia.org/wiki/Private_network#IPv6 + // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local + const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1"; + const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1"; + const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1"; + + const V4_LOOPBACK: &str = "http://127.0.0.1"; + const V6_LOOPBACK: &str = "http://[::1]"; + + const V4_LINK_LOCAL: &str = "http://169.254.0.1"; + + // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6) + const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]"; + const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]"; + + use regex::RegexSet; + use reqwest::Url; + + use super::*; + + use crate::{test_utils::website, Input}; + + /// Helper method to convert a string into a Request + /// Note: This panics on error, so it should only be used for testing + pub fn request(url: &str) -> Request { + Request::new(website(url), Input::Stdin) + } + + #[test] + fn test_const_sanity() { + let get_host = |s| { + Url::parse(s) + .expect("Expected valid URL") 
+ .host() + .expect("Expected host address") + .to_owned() + }; + let into_v4 = |host| match host { + url::Host::Ipv4(ipv4) => ipv4, + _ => panic!("Not IPv4"), + }; + let into_v6 = |host| match host { + url::Host::Ipv6(ipv6) => ipv6, + _ => panic!("Not IPv6"), + }; + + assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private()); + assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private()); + assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private()); + + assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback()); + assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback()); + + assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local()); + } + + #[test] + fn test_include_regex() { + let includes = Some(Includes { + regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + }); + let filter = Filter::new(includes, None, None); + + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + } + + #[test] + fn test_includes_and_excludes_empty() { + // This is the pre-configured, empty set of excludes for a client + // In this case, only the requests matching the include set will be checked + let includes = Some(Includes::default()); + let excludes = Some(Excludes::default()); + let filter = Filter::new(includes, excludes, None); + assert_eq!(filter.excluded(&request("https://example.org")), false); + } + + #[test] + fn test_include_with_empty_exclude() { + let includes = Some(Includes { + regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + }); + // This is the pre-configured, empty set of excludes for a client + let excludes = Some(Excludes::default()); + let filter = Filter::new(includes, excludes, None); + + // In this case, only the requests matching the include set will be checked + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + assert_eq!(filter.excluded(&request("https://example.org")), true); + 
assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + } + + #[test] + fn test_exclude_include_regex() { + let includes = Some(Includes { + regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + }); + let mut excludes = Excludes::default(); + excludes.regex = Some(RegexSet::new(&[r"example.org"]).unwrap()); + let filter = Filter::new(includes, Some(excludes), None); + + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + assert_eq!(filter.excluded(&request("https://example.org")), true); + assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + } + + #[test] + fn test_exclude_regex() { + let mut excludes = Excludes::default(); + excludes.regex = + Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap()); + let filter = Filter::new(None, Some(excludes), None); + + assert_eq!(filter.excluded(&request("http://github.com")), true); + assert_eq!(filter.excluded(&request("http://exclude.org")), true); + assert_eq!( + filter.excluded(&Request::new( + Uri::Mail("mail@example.org".to_string()), + Input::Stdin, + )), + true + ); + assert_eq!( + filter.excluded(&Request::new( + Uri::Mail("foo@bar.dev".to_string()), + Input::Stdin, + )), + false + ); + } + + #[test] + fn test_excludes_no_private_ips_by_default() { + let filter = Filter::new(None, None, None); + + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), false); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), false); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), false); + assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), false); + assert_eq!(filter.excluded(&request(V4_LOOPBACK)), false); + assert_eq!(filter.excluded(&request(V6_LOOPBACK)), false); + } + + #[test] + fn test_exclude_private() { + let mut filter = Filter::new(None, None, None); + filter.excludes.private_ips = true; + + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), true); + 
assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), true); + assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), true); + } + + #[test] + fn test_exclude_link_local() { + let mut filter = Filter::new(None, None, None); + filter.excludes.link_local_ips = true; + assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), true); + } + + #[test] + fn test_exclude_loopback() { + let mut filter = Filter::new(None, None, None); + filter.excludes.loopback_ips = true; + + assert_eq!(filter.excluded(&request(V4_LOOPBACK)), true); + assert_eq!(filter.excluded(&request(V6_LOOPBACK)), true); + } + + #[test] + fn test_exclude_ip_v4_mapped_ip_v6_not_supported() { + let mut filter = Filter::new(None, None, None); + filter.excludes.private_ips = true; + filter.excludes.link_local_ips = true; + + // if these were pure IPv4, we would exclude + assert_eq!( + filter.excluded(&request(V6_MAPPED_V4_PRIVATE_CLASS_A)), + false + ); + assert_eq!(filter.excluded(&request(V6_MAPPED_V4_LINK_LOCAL)), false); + } +} From 14d47d9108981fcf532601654860f51da404a12c Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 16:55:26 +0100 Subject: [PATCH 07/10] Initialize exclude using Default and field overwriting --- src/filter/mod.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/filter/mod.rs b/src/filter/mod.rs index 83d46f2..a4da6ee 100644 --- a/src/filter/mod.rs +++ b/src/filter/mod.rs @@ -175,8 +175,11 @@ mod test { let includes = Some(Includes { regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), }); - let mut excludes = Excludes::default(); - excludes.regex = Some(RegexSet::new(&[r"example.org"]).unwrap()); + let excludes = Excludes { + regex: Some(RegexSet::new(&[r"example.org"]).unwrap()), + ..Default::default() + }; + let filter = Filter::new(includes, Some(excludes), None); assert_eq!(filter.excluded(&request("https://foo.example.org")), false); @@ -186,9 +189,12 @@ mod test { #[test] fn test_exclude_regex() { - let mut 
excludes = Excludes::default(); - excludes.regex = - Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap()); + let excludes = Excludes { + regex: Some( + RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(), + ), + ..Default::default() + }; let filter = Filter::new(None, Some(excludes), None); assert_eq!(filter.excluded(&request("http://github.com")), true); From 09ceda5931ba1d5fc0f7da21fe1ff541d996d297 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 17:00:44 +0100 Subject: [PATCH 08/10] Rearrange and extend tests --- src/filter/mod.rs | 58 ++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/src/filter/mod.rs b/src/filter/mod.rs index a4da6ee..adc0f3a 100644 --- a/src/filter/mod.rs +++ b/src/filter/mod.rs @@ -134,17 +134,6 @@ mod test { assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local()); } - #[test] - fn test_include_regex() { - let includes = Some(Includes { - regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), - }); - let filter = Filter::new(includes, None, None); - - assert_eq!(filter.excluded(&request("https://foo.example.org")), false); - assert_eq!(filter.excluded(&request("https://bar.example.org")), true); - } - #[test] fn test_includes_and_excludes_empty() { // This is the pre-configured, empty set of excludes for a client @@ -156,35 +145,16 @@ mod test { } #[test] - fn test_include_with_empty_exclude() { + fn test_include_regex() { let includes = Some(Includes { regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), }); - // This is the pre-configured, empty set of excludes for a client - let excludes = Some(Excludes::default()); - let filter = Filter::new(includes, excludes, None); + let filter = Filter::new(includes, None, None); - // In this case, only the requests matching the include set will be checked + // Only the requests matching the include set will be checked 
assert_eq!(filter.excluded(&request("https://foo.example.org")), false); - assert_eq!(filter.excluded(&request("https://example.org")), true); assert_eq!(filter.excluded(&request("https://bar.example.org")), true); - } - - #[test] - fn test_exclude_include_regex() { - let includes = Some(Includes { - regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), - }); - let excludes = Excludes { - regex: Some(RegexSet::new(&[r"example.org"]).unwrap()), - ..Default::default() - }; - - let filter = Filter::new(includes, Some(excludes), None); - - assert_eq!(filter.excluded(&request("https://foo.example.org")), false); assert_eq!(filter.excluded(&request("https://example.org")), true); - assert_eq!(filter.excluded(&request("https://bar.example.org")), true); } #[test] @@ -206,6 +176,8 @@ mod test { )), true ); + + assert_eq!(filter.excluded(&request("http://bar.dev")), false); assert_eq!( filter.excluded(&Request::new( Uri::Mail("foo@bar.dev".to_string()), @@ -214,6 +186,24 @@ mod test { false ); } + #[test] + fn test_exclude_include_regex() { + let includes = Some(Includes { + regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()), + }); + let excludes = Excludes { + regex: Some(RegexSet::new(&[r"example.org"]).unwrap()), + ..Default::default() + }; + + let filter = Filter::new(includes, Some(excludes), None); + + // Includes take preference over excludes + assert_eq!(filter.excluded(&request("https://foo.example.org")), false); + + assert_eq!(filter.excluded(&request("https://example.org")), true); + assert_eq!(filter.excluded(&request("https://bar.example.org")), true); + } #[test] fn test_excludes_no_private_ips_by_default() { @@ -228,7 +218,7 @@ mod test { } #[test] - fn test_exclude_private() { + fn test_exclude_private_ips() { let mut filter = Filter::new(None, None, None); filter.excludes.private_ips = true; From 2272ad1a48731bc50359d5d9111d30d3327cb1a4 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sun, 21 Feb 2021 17:19:32 +0100 Subject: [PATCH 09/10] 
Show progress bar by default (can be disabled with --no-progress) --- README.md | 3 ++- src/bin/lychee/main.rs | 15 ++++++++------- src/bin/lychee/options.rs | 8 +++++--- tests/cli.rs | 1 + 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 80676e1..98d4b15 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,8 @@ FLAGS: --glob-ignore-case Ignore case when expanding filesystem path glob inputs --help Prints help information -i, --insecure Proceed for server connections considered insecure (invalid TLS) - -p, --progress Show progress + -n, --no-progress Do not show progress bar. This is recommended for non-interactive shells (e.g. for + continuous integration) --skip-missing Skip missing input files (default is to error if they don't exist) -V, --version Prints version information -v, --verbose Verbose program output diff --git a/src/bin/lychee/main.rs b/src/bin/lychee/main.rs index 870890d..732c25a 100644 --- a/src/bin/lychee/main.rs +++ b/src/bin/lychee/main.rs @@ -140,15 +140,16 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { ) .await?; - let pb = if cfg.progress { - let bar = - ProgressBar::new(links.len() as u64).with_style(ProgressStyle::default_bar().template( + let pb = match cfg.no_progress { + true => None, + false => { + let bar = ProgressBar::new(links.len() as u64) + .with_style(ProgressStyle::default_bar().template( "{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25} {wide_msg}", )); - bar.enable_steady_tick(100); - Some(bar) - } else { - None + bar.enable_steady_tick(100); + Some(bar) + } }; let (send_req, recv_req) = mpsc::channel(max_concurrency); diff --git a/src/bin/lychee/options.rs b/src/bin/lychee/options.rs index f72b941..912cd96 100644 --- a/src/bin/lychee/options.rs +++ b/src/bin/lychee/options.rs @@ -116,10 +116,12 @@ pub struct Config { #[serde(default)] pub verbose: bool, - /// Show progress + /// Do not show progress bar. 
+ /// This is recommended for non-interactive shells (e.g. for continuous + /// integration) #[structopt(short, long)] #[serde(default)] - pub progress: bool, + pub no_progress: bool, /// Maximum number of allowed redirects #[structopt(short, long, default_value = &MAX_REDIRECTS_STR)] @@ -273,7 +275,7 @@ impl Config { // Keys with defaults to assign verbose: false; - progress: false; + no_progress: false; max_redirects: MAX_REDIRECTS; max_concurrency: MAX_CONCURRENCY; threads: None; diff --git a/tests/cli.rs b/tests/cli.rs index 6b98c5d..096779a 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -75,6 +75,7 @@ mod cli { let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md"); cmd.arg(test_github_404_path) + .arg("--no-progress") .env_clear() .assert() .failure() From 551c988708e9c84bea202c0e401453a1d615e271 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 21 Feb 2021 17:27:32 +0100 Subject: [PATCH 10/10] Update links.yml --- .github/workflows/links.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml index bd088d5..9f0a166 100644 --- a/.github/workflows/links.yml +++ b/.github/workflows/links.yml @@ -15,7 +15,7 @@ jobs: - name: Link Checker uses: lycheeverse/lychee-action@master with: - args: --exclude 'https://example.org/README.md' + args: --verbose --no-progress --exclude 'https://example.org/README.md' - name: Create Issue From File uses: peter-evans/create-issue-from-file@v2