Merge pull request #153 from lycheeverse/filetype

Add "Filter" module that combines includes and excludes
2026-05-22 20:35:49 +00:00 · 2021-02-21 17:42:22 +01:00 · 2021-02-21 17:42:22 +01:00 · 27709d25e3
commit 27709d25e3
parent 41b82cb459 551c988708
20 changed files with 393 additions and 338 deletions
--- a/.github/workflows/links.yml
+++ b/.github/workflows/links.yml
@ -11,12 +11,12 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
-        
+
      - name: Link Checker
        uses: lycheeverse/lychee-action@master
        with:
-          args: --exclude https://example.com/README.md
-      
+          args: --verbose --no-progress --exclude 'https://example.org/README.md'
+
      - name: Create Issue From File
        uses: peter-evans/create-issue-from-file@v2
        with:
--- a/README.md
+++ b/README.md
@ -163,7 +163,8 @@ FLAGS:
        --glob-ignore-case       Ignore case when expanding filesystem path glob inputs
        --help                   Prints help information
    -i, --insecure               Proceed for server connections considered insecure (invalid TLS)
-    -p, --progress               Show progress
+    -n, --no-progress            Do not show progress bar. This is recommended for non-interactive shells (e.g. for
+                                 continuous integration)
        --skip-missing           Skip missing input files (default is to error if they don't exist)
    -V, --version                Prints version information
    -v, --verbose                Verbose program output
@ -191,7 +192,7 @@ OPTIONS:

 ARGS:
    <inputs>...    The inputs (where to get links to check from). These can be: files (e.g. `README.md`), file globs
-                   (e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.com/README.md`) or standard
+                   (e.g. `"~/git/*/README.md"`), remote URLs (e.g. `https://example.org/README.md`) or standard
                   input (`-`). Prefix with `--` to separate inputs from options that allow multiple arguments
                   [default: README.md]
 ```
--- a/fixtures/TEST.md
+++ b/fixtures/TEST.md
@ -16,10 +16,10 @@ Some more complex formatting to test that Markdown parsing works.
 [![CC0](https://i.creativecommons.org/p/zero/1.0/88x31.png)](https://creativecommons.org/publicdomain/zero/1.0/)

 Test HTTP and HTTPS for the same site.
-http://example.com
-https://example.com
+http://example.org
+https://example.org

 https://www.peerlyst.com/posts/a-list-of-static-analysis-tools-for-c-c-peerlyst

-test@example.com
-mailto:test2@example.com
+test@example.org
+mailto:test2@example.org
--- a/fixtures/TEST_HTML5.html
+++ b/fixtures/TEST_HTML5.html
@ -2,13 +2,13 @@
 <html lang="en">
 <head>
  <meta charset="utf-8">
-  <link rel="home" href="https://example.com/head/home">
+  <link rel="home" href="https://example.org/head/home">
  <title>Test</title>
  <meta name="description" content="Test HTML5 parsing (not valid XML)">

  <!-- The links below have no closing tags (not valid XML) -->
  <link rel="icon" type="image/png" sizes="32x32" href="images/icon.png">
-  <link rel="stylesheet" type="text/css" href="https://example.com/css/style_full_url.css">
+  <link rel="stylesheet" type="text/css" href="https://example.org/css/style_full_url.css">
  <link rel="stylesheet" type="text/css" href="css/style_relative_url.css">

  <!-- The defer attribute has no value (not valid XML) -->
@ -16,8 +16,8 @@
 </head>
 <body>
  Hello world.
-  <a href="https://example.com/body/a">Link in body</a>
+  <a href="https://example.org/body/a">Link in body</a>
  <!-- Empty a tag might be problematic (in terms of browser support), but should still be parsed -->
-  <div><a href="https://example.com/body/div_empty_a"/></div>
+  <div><a href="https://example.org/body/div_empty_a"/></div>
 </body>
 </html>
--- a/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html
+++ b/fixtures/TEST_HTML5_CUSTOM_ELEMENTS.html
@ -3,8 +3,8 @@
 <head>
 </head>
 <body>
-  <some-weird-element href="https://example.com/some-weird-element"></some-weird-element>
-  <even-weirder fake-attr src="https://example.com/even-weirder-src" href="https://example.com/even-weirder-href"></even-weirder>
-  <citations cite="https://example.com/citations"></citations>
+  <some-weird-element href="https://example.org/some-weird-element"></some-weird-element>
+  <even-weirder fake-attr src="https://example.org/even-weirder-src" href="https://example.org/even-weirder-href"></even-weirder>
+  <citations cite="https://example.org/citations"></citations>
 </body>
 </html>
--- a/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html
+++ b/fixtures/TEST_HTML5_LOWERCASE_DOCTYPE.html
@ -3,6 +3,6 @@
 <head>
 </head>
 <body>
-  <a href="https://example.com/body/a">Link in body</a>
+  <a href="https://example.org/body/a">Link in body</a>
 </body>
 </html>
--- a/fixtures/TEST_HTML5_MALFORMED_LINKS.html
+++ b/fixtures/TEST_HTML5_MALFORMED_LINKS.html
@ -3,8 +3,8 @@
 <head>
 </head>
 <body>
-  <a href="https;//example.com/malformed_one">Malformed link</a>
-  <a href="https://example]com/malformed_two">Malformed link</a>
-  <a href="https://example.com/valid">Valid link</a>
+  <a href="https;//example.org/malformed_one">Malformed link</a>
+  <a href="https://example]org/malformed_two">Malformed link</a>
+  <a href="https://example.org/valid">Valid link</a>
 </body>
 </html>
--- a/fixtures/TEST_HTML5_MINIFIED.html
+++ b/fixtures/TEST_HTML5_MINIFIED.html
@ -1 +1 @@
-<!DOCTYPE html><html class=no-js lang=en><head><link href=https://example.com/ rel=canonical><link href=https://example.com/favicon.ico rel="shortcut icon"><link crossorigin="" href=https://fonts.externalsite.com rel=preconnect><body><div></div><header><nav><a href=https://example.com/docs/ title=Docs></a><div><a href=https://example.com/ title=Home></a></div></nav></header><div><nav><div><ul><li><a href=https://example.com/forum>Forum</a></ul></div></nav></div>
+<!DOCTYPE html><html class=no-js lang=en><head><link href=https://example.org/ rel=canonical><link href=https://example.org/favicon.ico rel="shortcut icon"><link crossorigin="" href=https://fonts.externalsite.com rel=preconnect><body><div></div><header><nav><a href=https://example.org/docs/ title=Docs></a><div><a href=https://example.org/ title=Home></a></div></nav></header><div><nav><div><ul><li><a href=https://example.org/forum>Forum</a></ul></div></nav></div>
--- a/src/bin/lychee/main.rs
+++ b/src/bin/lychee/main.rs
@ -140,15 +140,16 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
    )
    .await?;

-    let pb = if cfg.progress {
-        let bar =
-            ProgressBar::new(links.len() as u64).with_style(ProgressStyle::default_bar().template(
+    let pb = match cfg.no_progress {
+        true => None,
+        false => {
+            let bar = ProgressBar::new(links.len() as u64)
+                .with_style(ProgressStyle::default_bar().template(
                "{spinner:.red.bright} {pos}/{len:.dim} [{elapsed_precise}] {bar:25} {wide_msg}",
            ));
-        bar.enable_steady_tick(100);
-        Some(bar)
-    } else {
-        None
+            bar.enable_steady_tick(100);
+            Some(bar)
+        }
    };

    let (send_req, recv_req) = mpsc::channel(max_concurrency);
--- a/src/bin/lychee/options.rs
+++ b/src/bin/lychee/options.rs
@ -83,7 +83,7 @@ macro_rules! fold_in {
 pub(crate) struct LycheeOptions {
    /// The inputs (where to get links to check from).
    /// These can be: files (e.g. `README.md`), file globs (e.g. `"~/git/*/README.md"`),
-    /// remote URLs (e.g. `https://example.com/README.md`) or standard input (`-`).
+    /// remote URLs (e.g. `https://example.org/README.md`) or standard input (`-`).
    /// Prefix with `--` to separate inputs from options that allow multiple arguments.
    #[structopt(name = "inputs", default_value = "README.md")]
    raw_inputs: Vec<String>,
@ -116,10 +116,12 @@ pub struct Config {
    #[serde(default)]
    pub verbose: bool,

-    /// Show progress
+    /// Do not show progress bar.
+    /// This is recommended for non-interactive shells (e.g. for continuous
+    /// integration)
    #[structopt(short, long)]
    #[serde(default)]
-    pub progress: bool,
+    pub no_progress: bool,

    /// Maximum number of allowed redirects
    #[structopt(short, long, default_value = &MAX_REDIRECTS_STR)]
@ -273,7 +275,7 @@ impl Config {

            // Keys with defaults to assign
            verbose: false;
-            progress: false;
+            no_progress: false;
            max_redirects: MAX_REDIRECTS;
            max_concurrency: MAX_CONCURRENCY;
            threads: None;
--- a/src/bin/lychee/stats.rs
+++ b/src/bin/lychee/stats.rs
@ -116,17 +116,17 @@ mod test_super {
    fn test_stats() {
        let mut stats = ResponseStats::new();
        stats.add(Response {
-            uri: website("http://example.com/ok"),
+            uri: website("http://example.org/ok"),
            status: Status::Ok(http::StatusCode::OK),
            source: Input::Stdin,
        });
        stats.add(Response {
-            uri: website("http://example.com/failed"),
+            uri: website("http://example.org/failed"),
            status: Status::Failed(http::StatusCode::BAD_GATEWAY),
            source: Input::Stdin,
        });
        stats.add(Response {
-            uri: website("http://example.com/redirect"),
+            uri: website("http://example.org/redirect"),
            status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT),
            source: Input::Stdin,
        });
@ -135,12 +135,12 @@ mod test_super {
            Input::Stdin,
            vec![
                Response {
-                    uri: website("http://example.com/failed"),
+                    uri: website("http://example.org/failed"),
                    status: Status::Failed(http::StatusCode::BAD_GATEWAY),
                    source: Input::Stdin,
                },
                Response {
-                    uri: website("http://example.com/redirect"),
+                    uri: website("http://example.org/redirect"),
                    status: Status::Redirected(http::StatusCode::PERMANENT_REDIRECT),
                    source: Input::Stdin,
                },
--- a/src/client.rs
+++ b/src/client.rs
@ -10,9 +10,12 @@ use std::{collections::HashSet, time::Duration};
 use tokio::time::sleep;
 use url::Url;

+use crate::filter::Excludes;
+use crate::filter::Filter;
+use crate::filter::Includes;
 use crate::types::{Response, Status};
 use crate::uri::Uri;
-use crate::{excludes::Excludes, Request};
+use crate::Request;

 const VERSION: &str = env!("CARGO_PKG_VERSION");
 const DEFAULT_MAX_REDIRECTS: usize = 5;
@ -21,9 +24,7 @@ const DEFAULT_MAX_REDIRECTS: usize = 5;
 pub struct Client {
    reqwest_client: reqwest::Client,
    github: Option<Github>,
-    includes: Option<RegexSet>,
-    excludes: Excludes,
-    scheme: Option<String>,
+    filter: Filter,
    method: reqwest::Method,
    accepted: Option<HashSet<reqwest::StatusCode>>,
 }
@ -90,6 +91,12 @@ impl ClientBuilder {
        }
    }

+    fn build_includes(&mut self) -> Includes {
+        Includes {
+            regex: self.includes.clone().unwrap_or_default(),
+        }
+    }
+
    /// The build method instantiates the client.
    pub fn build(&mut self) -> Result<Client> {
        let mut headers = HeaderMap::new();
@ -140,12 +147,15 @@ impl ClientBuilder {
        let scheme = self.scheme.clone().unwrap_or(None);
        let scheme = scheme.map(|s| s.to_lowercase());

+        let includes = self.build_includes();
+        let excludes = self.build_excludes();
+
+        let filter = Filter::new(Some(includes), Some(excludes), scheme);
+
        Ok(Client {
            reqwest_client,
            github,
-            includes: self.includes.clone().unwrap_or(None),
-            excludes: self.build_excludes(),
-            scheme,
+            filter,
            method: self.method.clone().unwrap_or(reqwest::Method::GET),
            accepted: self.accepted.clone().unwrap_or(None),
        })
@ -156,9 +166,9 @@ impl Client {
    pub async fn check<T: TryInto<Request>>(&self, request: T) -> Result<Response> {
        let request: Request = match request.try_into() {
            Ok(request) => request,
-            Err(_e) => bail!("Invalid URI:"),
+            Err(_e) => bail!("Invalid URI"),
        };
-        if self.excluded(&request) {
+        if self.filter.excluded(&request) {
            return Ok(Response::new(request.uri, Status::Excluded, request.source));
        }
        let status = match request.uri {
@ -252,37 +262,6 @@ impl Client {
            }
        }
    }
-
-    pub fn excluded(&self, request: &Request) -> bool {
-        if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() {
-            return true;
-        }
-        if self.excludes.ip(&request.uri) {
-            return true;
-        }
-        if let Some(includes) = &self.includes {
-            if includes.is_empty() {
-                return false;
-            }
-            if includes.is_match(request.uri.as_str()) {
-                // Includes take precedence over excludes
-                return false;
-            } else {
-                // In case we have includes and no excludes,
-                // skip everything that was not included
-                if self.excludes.is_empty() {
-                    return true;
-                }
-            }
-        }
-        if self.excludes.regex(request.uri.as_str()) {
-            return true;
-        }
-        if self.scheme.is_none() {
-            return false;
-        }
-        request.uri.scheme() != self.scheme
-    }
 }

 /// A convenience function to check a single URI
@ -295,41 +274,12 @@ pub async fn check<T: TryInto<Request>>(request: T) -> Result<Response> {

 #[cfg(test)]
 mod test {
-    use crate::collector::Input;
-
    use super::*;
    use http::StatusCode;
    use std::time::{Duration, Instant};
-    use url::Url;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

-    // Note: the standard library as of Rust stable 1.47.0 does not expose
-    // "link-local" or "private" IPv6 checks.  However, one might argue
-    // that these concepts do exist in IPv6, albeit the naming is different.
-    // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6
-    // See: https://en.wikipedia.org/wiki/Private_network#IPv6
-    // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local
-    const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
-    const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
-    const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";
-
-    const V4_LOOPBACK: &str = "http://127.0.0.1";
-    const V6_LOOPBACK: &str = "http://[::1]";
-
-    const V4_LINK_LOCAL: &str = "http://169.254.0.1";
-
-    // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6)
-    const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
-    const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";
-
-    fn website_url(s: &str) -> Request {
-        Request::new(
-            Uri::Website(Url::parse(s).expect("Expected valid Website URI")),
-            Input::Stdin,
-        )
-    }
-
    #[tokio::test]
    async fn test_nonexistent() {
        let template = ResponseTemplate::new(404);
@ -361,7 +311,7 @@ mod test {
        let res = ClientBuilder::default()
            .build()
            .unwrap()
-            .check(website_url(&mock_server.uri()))
+            .check(mock_server.uri())
            .await
            .unwrap();
        let end = start.elapsed();
@ -390,7 +340,7 @@ mod test {
            ClientBuilder::default()
                .build()
                .unwrap()
-                .check(website_url("https://github.com/lycheeverse/lychee"))
+                .check("https://github.com/lycheeverse/lychee")
                .await
                .unwrap()
                .status,
@ -422,7 +372,7 @@ mod test {
        let res = ClientBuilder::default()
            .build()
            .unwrap()
-            .check(website_url(&mock_server.uri()))
+            .check(mock_server.uri())
            .await
            .unwrap()
            .status;
@ -455,7 +405,7 @@ mod test {
        let res = ClientBuilder::default()
            .build()
            .unwrap()
-            .check(website_url("https://crates.io/crates/lychee"))
+            .check("https://crates.io/crates/lychee")
            .await
            .unwrap();
        assert!(matches!(res.status, Status::Failed(StatusCode::NOT_FOUND)));
@ -469,7 +419,7 @@ mod test {
            .custom_headers(custom)
            .build()
            .unwrap()
-            .check(website_url("https://crates.io/crates/lychee"))
+            .check("https://crates.io/crates/lychee")
            .await
            .unwrap();
        assert!(matches!(res.status, Status::Ok(_)));
@ -496,198 +446,7 @@ mod test {
            .build()
            .unwrap();

-        let resp = client.check(website_url(&mock_server.uri())).await.unwrap();
+        let resp = client.check(mock_server.uri()).await.unwrap();
        assert!(matches!(resp.status, Status::Timeout(_)));
    }
-
-    #[tokio::test]
-    async fn test_include_regex() {
-        let includes = RegexSet::new(&[r"foo.github.com"]).unwrap();
-
-        let client = ClientBuilder::default().includes(includes).build().unwrap();
-
-        assert_eq!(
-            client.excluded(&website_url("https://foo.github.com")),
-            false
-        );
-        assert_eq!(
-            client.excluded(&website_url("https://bar.github.com")),
-            true
-        );
-    }
-
-    #[tokio::test]
-    async fn test_includes_and_excludes_empty() {
-        // This is the pre-configured, empty set of excludes for a client
-        // In this case, only the requests matching the include set will be checked
-        let exclude = Some(RegexSet::empty());
-        let includes = RegexSet::empty();
-
-        let client = ClientBuilder::default()
-            .includes(includes)
-            .excludes(exclude)
-            .build()
-            .unwrap();
-
-        assert_eq!(
-            client.excluded(&website_url("https://foo.github.com")),
-            false
-        );
-    }
-
-    #[tokio::test]
-    async fn test_include_with_empty_exclude() {
-        // This is the pre-configured, empty set of excludes for a client
-        // In this case, only the requests matching the include set will be checked
-        let exclude = Some(RegexSet::empty());
-        let includes = RegexSet::new(&[r"foo.github.com"]).unwrap();
-
-        let client = ClientBuilder::default()
-            .includes(includes)
-            .excludes(exclude)
-            .build()
-            .unwrap();
-
-        assert_eq!(
-            client.excluded(&website_url("https://foo.github.com")),
-            false
-        );
-        assert_eq!(client.excluded(&website_url("https://github.com")), true);
-        assert_eq!(
-            client.excluded(&website_url("https://bar.github.com")),
-            true
-        );
-    }
-
-    #[tokio::test]
-    async fn test_exclude_include_regex() {
-        let exclude = Some(RegexSet::new(&[r"github.com"]).unwrap());
-        let includes = RegexSet::new(&[r"foo.github.com"]).unwrap();
-
-        let client = ClientBuilder::default()
-            .includes(includes)
-            .excludes(exclude)
-            .build()
-            .unwrap();
-
-        assert_eq!(
-            client.excluded(&website_url("https://foo.github.com")),
-            false
-        );
-        assert_eq!(client.excluded(&website_url("https://github.com")), true);
-        assert_eq!(
-            client.excluded(&website_url("https://bar.github.com")),
-            true
-        );
-    }
-
-    #[tokio::test]
-    async fn test_exclude_regex() {
-        let exclude =
-            Some(RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.com"]).unwrap());
-
-        let client = ClientBuilder::default().excludes(exclude).build().unwrap();
-
-        assert_eq!(client.excluded(&website_url("http://github.com")), true);
-        assert_eq!(client.excluded(&website_url("http://exclude.org")), true);
-        assert_eq!(
-            client.excluded(&Request::new(
-                Uri::Mail("mail@example.com".to_string()),
-                Input::Stdin,
-            )),
-            true
-        );
-        assert_eq!(
-            client.excluded(&Request::new(
-                Uri::Mail("foo@bar.dev".to_string()),
-                Input::Stdin,
-            )),
-            false
-        );
-    }
-
-    #[test]
-    fn test_const_sanity() {
-        let get_host = |s| {
-            Url::parse(s)
-                .expect("Expected valid URL")
-                .host()
-                .expect("Expected host address")
-                .to_owned()
-        };
-        let into_v4 = |host| match host {
-            url::Host::Ipv4(ipv4) => ipv4,
-            _ => panic!("Not IPv4"),
-        };
-        let into_v6 = |host| match host {
-            url::Host::Ipv6(ipv6) => ipv6,
-            _ => panic!("Not IPv6"),
-        };
-
-        assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private());
-        assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private());
-        assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private());
-
-        assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback());
-        assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback());
-
-        assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local());
-    }
-
-    #[test]
-    fn test_excludes_no_private_ips_by_default() {
-        let client = ClientBuilder::default().build().unwrap();
-
-        assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), false);
-        assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), false);
-        assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), false);
-        assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), false);
-        assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), false);
-
-        assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), false);
-    }
-
-    #[test]
-    fn test_exclude_private() {
-        let mut client = ClientBuilder::default().build().unwrap();
-        client.excludes.private_ips = true;
-
-        assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_A)), true);
-        assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_B)), true);
-        assert_eq!(client.excluded(&website_url(V4_PRIVATE_CLASS_C)), true);
-    }
-
-    #[test]
-    fn test_exclude_link_local() {
-        let mut client = ClientBuilder::default().build().unwrap();
-        client.excludes.link_local_ips = true;
-
-        assert_eq!(client.excluded(&website_url(V4_LINK_LOCAL)), true);
-    }
-
-    #[test]
-    fn test_exclude_loopback() {
-        let mut client = ClientBuilder::default().build().unwrap();
-        client.excludes.loopback_ips = true;
-
-        assert_eq!(client.excluded(&website_url(V4_LOOPBACK)), true);
-        assert_eq!(client.excluded(&website_url(V6_LOOPBACK)), true);
-    }
-
-    #[test]
-    fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
-        let mut client = ClientBuilder::default().build().unwrap();
-        client.excludes.private_ips = true;
-        client.excludes.link_local_ips = true;
-
-        // if these were pure IPv4, we would exclude
-        assert_eq!(
-            client.excluded(&website_url(V6_MAPPED_V4_PRIVATE_CLASS_A)),
-            false
-        );
-        assert_eq!(
-            client.excluded(&website_url(V6_MAPPED_V4_LINK_LOCAL)),
-            false
-        );
-    }
 }
--- a/src/extract.rs
+++ b/src/extract.rs
@ -289,7 +289,7 @@ mod test {
    #[test]
    fn test_non_markdown_links() {
        let input =
-            "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
+            "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.org";
        let links: HashSet<Uri> =
            extract_links(&InputContent::from_string(input, FileType::Plaintext), None)
                .into_iter()
@ -299,7 +299,7 @@ mod test {
        let expected = [
            website("https://endler.dev"),
            website("https://hello-rust.show/foo/bar?lol=1"),
-            Uri::Mail("test@example.com".to_string()),
+            Uri::Mail("test@example.org".to_string()),
        ]
        .iter()
        .cloned()
@ -330,11 +330,11 @@ mod test {
                .collect();

        let expected_links = [
-            website("https://example.com/head/home"),
-            website("https://example.com/css/style_full_url.css"),
+            website("https://example.org/head/home"),
+            website("https://example.org/css/style_full_url.css"),
            // the body links wouldn't be present if the file was parsed strictly as XML
-            website("https://example.com/body/a"),
-            website("https://example.com/body/div_empty_a"),
+            website("https://example.org/body/a"),
+            website("https://example.org/body/div_empty_a"),
        ]
        .iter()
        .cloned()
@ -348,21 +348,21 @@ mod test {
        let input = load_fixture("TEST_HTML5.html");
        let links: HashSet<Uri> = extract_links(
            &InputContent::from_string(&input, FileType::Html),
-            Some(Url::parse("https://example.com").unwrap()),
+            Some(Url::parse("https://example.org").unwrap()),
        )
        .into_iter()
        .map(|r| r.uri)
        .collect();

        let expected_links = [
-            website("https://example.com/head/home"),
-            website("https://example.com/images/icon.png"),
-            website("https://example.com/css/style_relative_url.css"),
-            website("https://example.com/css/style_full_url.css"),
-            website("https://example.com/js/script.js"),
+            website("https://example.org/head/home"),
+            website("https://example.org/images/icon.png"),
+            website("https://example.org/css/style_relative_url.css"),
+            website("https://example.org/css/style_full_url.css"),
+            website("https://example.org/js/script.js"),
            // the body links wouldn't be present if the file was parsed strictly as XML
-            website("https://example.com/body/a"),
-            website("https://example.com/body/div_empty_a"),
+            website("https://example.org/body/a"),
+            website("https://example.org/body/div_empty_a"),
        ]
        .iter()
        .cloned()
@ -381,7 +381,7 @@ mod test {
                .map(|r| r.uri)
                .collect();

-        let expected_links = [website("https://example.com/body/a")]
+        let expected_links = [website("https://example.org/body/a")]
            .iter()
            .cloned()
            .collect();
@ -400,11 +400,11 @@ mod test {
                .collect();

        let expected_links = [
-            website("https://example.com/"),
-            website("https://example.com/favicon.ico"),
+            website("https://example.org/"),
+            website("https://example.org/favicon.ico"),
            website("https://fonts.externalsite.com"),
-            website("https://example.com/docs/"),
-            website("https://example.com/forum"),
+            website("https://example.org/docs/"),
+            website("https://example.org/forum"),
        ]
        .iter()
        .cloned()
@ -424,7 +424,7 @@ mod test {
                .collect();

        let expected_links = [Uri::Website(
-            Url::parse("https://example.com/valid").unwrap(),
+            Url::parse("https://example.org/valid").unwrap(),
        )]
        .iter()
        .cloned()
@ -444,10 +444,10 @@ mod test {
                .collect();

        let expected_links = [
-            website("https://example.com/some-weird-element"),
-            website("https://example.com/even-weirder-src"),
-            website("https://example.com/even-weirder-href"),
-            website("https://example.com/citations"),
+            website("https://example.org/some-weird-element"),
+            website("https://example.org/even-weirder-src"),
+            website("https://example.org/even-weirder-href"),
+            website("https://example.org/citations"),
        ]
        .iter()
        .cloned()
--- a/src/filter/excludes.rs
+++ b/src/filter/excludes.rs
@ -1,6 +1,5 @@
-use std::net::IpAddr;
-
 use regex::RegexSet;
+use std::net::IpAddr;

 use crate::Uri;

--- a/src/filter/includes.rs
+++ b/src/filter/includes.rs
@ -0,0 +1,32 @@
+use regex::RegexSet;
+
+/// Include configuration for the link checker.
+/// You can include links based on regex patterns
+#[derive(Clone, Debug)]
+pub struct Includes {
+    pub regex: Option<RegexSet>,
+}
+
+impl Default for Includes {
+    fn default() -> Self {
+        Self { regex: None }
+    }
+}
+
+impl Includes {
+    pub fn regex(&self, input: &str) -> bool {
+        if let Some(includes) = &self.regex {
+            if includes.is_match(input) {
+                return true;
+            }
+        }
+        false
+    }
+
+    pub fn is_empty(&self) -> bool {
+        match &self.regex {
+            None => true,
+            Some(regex_set) => regex_set.is_empty(),
+        }
+    }
+}
--- a/src/filter/mod.rs
+++ b/src/filter/mod.rs
@ -0,0 +1,259 @@
+mod excludes;
+mod includes;
+
+pub use excludes::Excludes;
+pub use includes::Includes;
+
+use crate::uri::Uri;
+use crate::Request;
+
+/// A generic URI filter.
+///
+/// Used to decide if a given URI should be checked or skipped.
+/// It combines include rules, exclude rules and an optional scheme;
+/// the decision itself is made in `Filter::excluded`.
+#[derive(Clone, Debug)]
+pub struct Filter {
+    /// Regex-based include rules. A matching URI is not skipped by the
+    /// regex excludes (includes take precedence over them).
+    includes: Includes,
+    /// Exclude rules (mail addresses, IP addresses and regex patterns).
+    excludes: Excludes,
+    /// Optional scheme that `Filter::excluded` compares against the scheme
+    /// of the request URI.
+    scheme: Option<String>,
+}
+
+impl Filter {
+    /// Creates a filter from optional include rules, optional exclude rules
+    /// and an optional scheme restriction.
+    ///
+    /// Rules that are not given fall back to their `Default` value.
+    pub fn new(
+        includes: Option<Includes>,
+        excludes: Option<Excludes>,
+        scheme: Option<String>,
+    ) -> Self {
+        Filter {
+            includes: includes.unwrap_or_default(),
+            excludes: excludes.unwrap_or_default(),
+            scheme,
+        }
+    }
+
+    /// Returns `true` if a scheme restriction is configured and the scheme of
+    /// the request URI differs from it.
+    fn scheme_mismatch(&self, request: &Request) -> bool {
+        self.scheme.is_some() && request.uri.scheme() != self.scheme
+    }
+
+    /// Returns `true` if `request` should be skipped, `false` if it should be
+    /// checked.
+    ///
+    /// The rules are evaluated in this order:
+    /// 1. Mail addresses are skipped if mail is excluded.
+    /// 2. URIs hitting an excluded IP address are skipped.
+    /// 3. Without any regex includes/excludes, only the scheme restriction
+    ///    (if configured) decides.
+    /// 4. A URI matching a regex include is always checked.
+    /// 5. With includes but without excludes, everything else is skipped.
+    /// 6. A URI matching a regex exclude is skipped.
+    /// 7. Whatever is left is skipped only if it violates the scheme
+    ///    restriction.
+    pub fn excluded(&self, request: &Request) -> bool {
+        // Skip mail?
+        if matches!(request.uri, Uri::Mail(_)) && self.excludes.is_mail_excluded() {
+            return true;
+        }
+        // Skip specific IP address?
+        if self.excludes.ip(&request.uri) {
+            return true;
+        }
+        // No regex includes/excludes at all?
+        // The scheme restriction must still be honored here; returning
+        // `false` unconditionally would silently ignore a configured scheme.
+        if self.includes.is_empty() && self.excludes.is_empty() {
+            return self.scheme_mismatch(request);
+        }
+        if self.includes.regex(request.uri.as_str()) {
+            // Includes take precedence over excludes
+            return false;
+        }
+        // In case we have includes and no excludes,
+        // skip everything that was not included
+        if !self.includes.is_empty() && self.excludes.is_empty() {
+            return true;
+        }
+
+        // No include matched. Check regex excludes
+        if self.excludes.regex(request.uri.as_str()) {
+            return true;
+        }
+
+        self.scheme_mismatch(request)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    // Note: the standard library as of Rust stable 1.47.0 does not expose
+    // "link-local" or "private" IPv6 checks.  However, one might argue
+    // that these concepts do exist in IPv6, albeit the naming is different.
+    // See: https://en.wikipedia.org/wiki/Link-local_address#IPv6
+    // See: https://en.wikipedia.org/wiki/Private_network#IPv6
+    // See: https://doc.rust-lang.org/stable/std/net/struct.Ipv6Addr.html#method.is_unicast_link_local
+    const V4_PRIVATE_CLASS_A: &str = "http://10.0.0.1";
+    const V4_PRIVATE_CLASS_B: &str = "http://172.16.0.1";
+    const V4_PRIVATE_CLASS_C: &str = "http://192.168.0.1";
+
+    const V4_LOOPBACK: &str = "http://127.0.0.1";
+    const V6_LOOPBACK: &str = "http://[::1]";
+
+    const V4_LINK_LOCAL: &str = "http://169.254.0.1";
+
+    // IPv4-Mapped IPv6 addresses (IPv4 embedded in IPv6)
+    const V6_MAPPED_V4_PRIVATE_CLASS_A: &str = "http://[::ffff:10.0.0.1]";
+    const V6_MAPPED_V4_LINK_LOCAL: &str = "http://[::ffff:169.254.0.1]";
+
+    use regex::RegexSet;
+    use reqwest::Url;
+
+    use super::*;
+
+    use crate::{test_utils::website, Input};
+
+    /// Helper method to convert a string into a Request
+    /// (a website URI with stdin as its input source).
+    /// Note: This panics on error, so it should only be used for testing
+    pub fn request(url: &str) -> Request {
+        Request::new(website(url), Input::Stdin)
+    }
+
+    /// Verifies that the URL constants above really belong to the address
+    /// classes their names claim.
+    #[test]
+    fn test_const_sanity() {
+        // Parse the URL and extract its host part
+        let get_host = |s| {
+            Url::parse(s)
+                .expect("Expected valid URL")
+                .host()
+                .expect("Expected host address")
+                .to_owned()
+        };
+        // Unwrap the host as an IPv4 address, panic otherwise
+        let into_v4 = |host| match host {
+            url::Host::Ipv4(ipv4) => ipv4,
+            _ => panic!("Not IPv4"),
+        };
+        // Unwrap the host as an IPv6 address, panic otherwise
+        let into_v6 = |host| match host {
+            url::Host::Ipv6(ipv6) => ipv6,
+            _ => panic!("Not IPv6"),
+        };
+
+        assert!(into_v4(get_host(V4_PRIVATE_CLASS_A)).is_private());
+        assert!(into_v4(get_host(V4_PRIVATE_CLASS_B)).is_private());
+        assert!(into_v4(get_host(V4_PRIVATE_CLASS_C)).is_private());
+
+        assert!(into_v4(get_host(V4_LOOPBACK)).is_loopback());
+        assert!(into_v6(get_host(V6_LOOPBACK)).is_loopback());
+
+        assert!(into_v4(get_host(V4_LINK_LOCAL)).is_link_local());
+    }
+
+    /// With default includes and excludes nothing is filtered.
+    #[test]
+    fn test_includes_and_excludes_empty() {
+        // These are the pre-configured, empty sets of includes and excludes.
+        // In this case, no request is excluded, so every request is checked
+        let includes = Some(Includes::default());
+        let excludes = Some(Excludes::default());
+        let filter = Filter::new(includes, excludes, None);
+        assert_eq!(filter.excluded(&request("https://example.org")), false);
+    }
+
+    /// With includes but no excludes, everything not included is skipped.
+    #[test]
+    fn test_include_regex() {
+        let includes = Some(Includes {
+            regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()),
+        });
+        let filter = Filter::new(includes, None, None);
+
+        // Only the requests matching the include set will be checked
+        assert_eq!(filter.excluded(&request("https://foo.example.org")), false);
+        assert_eq!(filter.excluded(&request("https://bar.example.org")), true);
+        assert_eq!(filter.excluded(&request("https://example.org")), true);
+    }
+
+    /// Regex excludes apply to both website and mail URIs.
+    #[test]
+    fn test_exclude_regex() {
+        let excludes = Excludes {
+            regex: Some(
+                RegexSet::new(&[r"github.com", r"[a-z]+\.(org|net)", r"@example.org"]).unwrap(),
+            ),
+            ..Default::default()
+        };
+        let filter = Filter::new(None, Some(excludes), None);
+
+        // Requests matching one of the exclude patterns are skipped
+        assert_eq!(filter.excluded(&request("http://github.com")), true);
+        assert_eq!(filter.excluded(&request("http://exclude.org")), true);
+        assert_eq!(
+            filter.excluded(&Request::new(
+                Uri::Mail("mail@example.org".to_string()),
+                Input::Stdin,
+            )),
+            true
+        );
+
+        // Requests matching none of the exclude patterns are checked
+        assert_eq!(filter.excluded(&request("http://bar.dev")), false);
+        assert_eq!(
+            filter.excluded(&Request::new(
+                Uri::Mail("foo@bar.dev".to_string()),
+                Input::Stdin,
+            )),
+            false
+        );
+    }
+    /// A URI matching both an include and an exclude pattern is checked.
+    #[test]
+    fn test_exclude_include_regex() {
+        let includes = Some(Includes {
+            regex: Some(RegexSet::new(&[r"foo.example.org"]).unwrap()),
+        });
+        let excludes = Excludes {
+            regex: Some(RegexSet::new(&[r"example.org"]).unwrap()),
+            ..Default::default()
+        };
+
+        let filter = Filter::new(includes, Some(excludes), None);
+
+        // Includes take precedence over excludes
+        assert_eq!(filter.excluded(&request("https://foo.example.org")), false);
+
+        // Not included, but matched by the exclude pattern
+        assert_eq!(filter.excluded(&request("https://example.org")), true);
+        assert_eq!(filter.excluded(&request("https://bar.example.org")), true);
+    }
+
+    /// A filter without any configuration does not skip private, link-local
+    /// or loopback addresses.
+    #[test]
+    fn test_excludes_no_private_ips_by_default() {
+        let filter = Filter::new(None, None, None);
+
+        assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), false);
+        assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), false);
+        assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), false);
+        assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), false);
+        assert_eq!(filter.excluded(&request(V4_LOOPBACK)), false);
+        assert_eq!(filter.excluded(&request(V6_LOOPBACK)), false);
+    }
+
+    /// Enabling `private_ips` skips the private IPv4 address classes.
+    #[test]
+    fn test_exclude_private_ips() {
+        let mut filter = Filter::new(None, None, None);
+        filter.excludes.private_ips = true;
+
+        assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_A)), true);
+        assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_B)), true);
+        assert_eq!(filter.excluded(&request(V4_PRIVATE_CLASS_C)), true);
+    }
+
+    /// Enabling `link_local_ips` skips link-local IPv4 addresses.
+    #[test]
+    fn test_exclude_link_local() {
+        let mut filter = Filter::new(None, None, None);
+        filter.excludes.link_local_ips = true;
+        assert_eq!(filter.excluded(&request(V4_LINK_LOCAL)), true);
+    }
+
+    /// Enabling `loopback_ips` skips IPv4 and IPv6 loopback addresses.
+    #[test]
+    fn test_exclude_loopback() {
+        let mut filter = Filter::new(None, None, None);
+        filter.excludes.loopback_ips = true;
+
+        assert_eq!(filter.excluded(&request(V4_LOOPBACK)), true);
+        assert_eq!(filter.excluded(&request(V6_LOOPBACK)), true);
+    }
+
+    /// IPv4-mapped IPv6 addresses are not recognized by the IP excludes.
+    #[test]
+    fn test_exclude_ip_v4_mapped_ip_v6_not_supported() {
+        let mut filter = Filter::new(None, None, None);
+        filter.excludes.private_ips = true;
+        filter.excludes.link_local_ips = true;
+
+        // if these were pure IPv4, we would exclude
+        assert_eq!(
+            filter.excluded(&request(V6_MAPPED_V4_PRIVATE_CLASS_A)),
+            false
+        );
+        assert_eq!(filter.excluded(&request(V6_MAPPED_V4_LINK_LOCAL)), false);
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -41,7 +41,7 @@ doctest!("../README.md");

 mod client;
 mod client_pool;
-mod excludes;
+mod filter;
 mod types;
 mod uri;

@ -53,6 +53,5 @@ pub use client::check;
 pub use client::ClientBuilder;
 pub use client_pool::ClientPool;
 pub use collector::Input;
-pub use excludes::Excludes;
 pub use types::*;
 pub use uri::Uri;
--- a/src/test_utils.rs
+++ b/src/test_utils.rs
@ -34,6 +34,8 @@ where
    mock_server
 }

+/// Helper method to convert a string into a URI
+/// Note: This panics on error, so it should only be used for testing
 pub fn website(url: &str) -> Uri {
-    Uri::Website(Url::parse(url).unwrap())
+    Uri::Website(Url::parse(url).expect("Expected valid Website URI"))
 }
--- a/src/uri.rs
+++ b/src/uri.rs
@ -74,16 +74,16 @@ mod test {
    fn test_uri_from_str() {
        assert!(matches!(Uri::try_from(""), Err(_)));
        assert_eq!(
-            Uri::try_from("http://example.com").unwrap(),
-            website("http://example.com")
+            Uri::try_from("http://example.org").unwrap(),
+            website("http://example.org")
        );
        assert_eq!(
-            Uri::try_from("mail@example.com").unwrap(),
-            Uri::Mail("mail@example.com".to_string())
+            Uri::try_from("mail@example.org").unwrap(),
+            Uri::Mail("mail@example.org".to_string())
        );
        assert_eq!(
-            Uri::try_from("mailto:mail@example.com").unwrap(),
-            Uri::Mail("mail@example.com".to_string())
+            Uri::try_from("mailto:mail@example.org").unwrap(),
+            Uri::Mail("mail@example.org".to_string())
        );
    }

--- a/tests/cli.rs
+++ b/tests/cli.rs
@ -75,6 +75,7 @@ mod cli {
        let test_github_404_path = fixtures_path().join("TEST_GITHUB_404.md");

        cmd.arg(test_github_404_path)
+            .arg("--no-progress")
            .env_clear()
            .assert()
            .failure()