From 63ba63f7c9cebe8f535c7066cc083ca2eee8b0d7 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Fri, 5 Jan 2024 18:48:15 +0100 Subject: [PATCH] Exclude example TLDs from RFC 2606 (#1335) Fixes https://github.com/lycheeverse/lychee/issues/1283 --- fixtures/TEST_EXAMPLE_DOMAINS.md | 6 ++++++ fixtures/TEST_EXAMPLE_DOMAINS_FALSE_POSITIVES.md | 8 +++++++- lychee-bin/tests/example_domains.rs | 2 +- lychee-lib/src/filter/mod.rs | 13 ++++++++++++- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fixtures/TEST_EXAMPLE_DOMAINS.md b/fixtures/TEST_EXAMPLE_DOMAINS.md index 105825a..a86dc29 100644 --- a/fixtures/TEST_EXAMPLE_DOMAINS.md +++ b/fixtures/TEST_EXAMPLE_DOMAINS.md @@ -12,3 +12,9 @@ mailto:hello@example.com?subject=hello http://example.net/foo/bar mail@example.com mail@somedomain.com + +https://test.localhost +http://foo.bar.invalid +foo.bar.invalid/some/path +https://example.example +http://integration.test diff --git a/fixtures/TEST_EXAMPLE_DOMAINS_FALSE_POSITIVES.md b/fixtures/TEST_EXAMPLE_DOMAINS_FALSE_POSITIVES.md index 7621771..0e56a49 100644 --- a/fixtures/TEST_EXAMPLE_DOMAINS_FALSE_POSITIVES.md +++ b/fixtures/TEST_EXAMPLE_DOMAINS_FALSE_POSITIVES.md @@ -1,3 +1,9 @@ http://gobyexample.com/ https://examples.com/ -https://texample.net/ \ No newline at end of file +https://texample.net/ + +http://foo.isnotinvalid +http://foo.bar.invalid2 +http://integration.text +https://test.possiblylocalhost +https://example.examplenotexample \ No newline at end of file diff --git a/lychee-bin/tests/example_domains.rs b/lychee-bin/tests/example_domains.rs index 2846c1b..819b745 100644 --- a/lychee-bin/tests/example_domains.rs +++ b/lychee-bin/tests/example_domains.rs @@ -65,7 +65,7 @@ mod cli { let output = cmd.get_output(); let output = std::str::from_utf8(&output.stdout).unwrap(); - assert_eq!(output.lines().count(), 3); + assert_eq!(output.lines().count(), 8); Ok(()) } diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index 
a8cea45..f970cf2 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -17,10 +17,19 @@ use crate::Uri; static EXAMPLE_DOMAINS: Lazy<HashSet<&'static str>> = Lazy::new(|| HashSet::from_iter(["example.com", "example.org", "example.net", "example.edu"])); +#[cfg(all(not(test), not(feature = "check_example_domains")))] +/// We also exclude the example TLDs in section 2 of the same RFC. +/// This exclusion gets subsumed by the `check_example_domains` feature. +static EXAMPLE_TLDS: Lazy<HashSet<&'static str>> = + Lazy::new(|| HashSet::from_iter([".test", ".example", ".invalid", ".localhost"])); + // Allow usage of example domains in tests #[cfg(any(test, feature = "check_example_domains"))] static EXAMPLE_DOMAINS: Lazy<HashSet<&'static str>> = Lazy::new(HashSet::new); +#[cfg(any(test, feature = "check_example_domains"))] +static EXAMPLE_TLDS: Lazy<HashSet<&'static str>> = Lazy::new(HashSet::new); + static UNSUPPORTED_DOMAINS: Lazy<HashSet<&'static str>> = Lazy::new(|| { HashSet::from_iter([ // Twitter requires an account to view tweets @@ -66,7 +75,9 @@ pub fn is_example_domain(uri: &Uri) -> bool { || domain .split_once('.') .map_or(false, |(_subdomain, tld_part)| tld_part == example) - }) + }) || EXAMPLE_TLDS + .iter() + .any(|&example_tld| domain.ends_with(example_tld)) } None => { // Check if the URI is an email address.