From 97573123ef023ad533b380811abdc4cfd89a60d1 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Wed, 5 Jul 2023 15:05:19 +0200 Subject: [PATCH] Extend remap feature (#1133) * wip * Extend support for remapping This adds supports for partial remaps and capture groups to the remap feature. Fixes #1129 --- Makefile | 1 + fixtures/configs/smoketest.toml | 7 +- lychee-bin/src/commands/dump.rs | 2 +- lychee-bin/src/parse.rs | 4 +- lychee-bin/tests/cli.rs | 36 +++++++ lychee-lib/src/client.rs | 97 +++++++++++------- lychee-lib/src/remap.rs | 168 +++++++++++++++++++++++++------- lychee-lib/src/types/error.rs | 2 +- 8 files changed, 245 insertions(+), 72 deletions(-) diff --git a/Makefile b/Makefile index 8287378..d908f1b 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,7 @@ docs: ## Generate and show documentation .PHONY: lint lint: ## Run linter + cargo fmt --all -- --check cargo clippy --all-targets --all-features -- -D warnings .PHONY: test diff --git a/fixtures/configs/smoketest.toml b/fixtures/configs/smoketest.toml index 9164b67..fa91e9a 100644 --- a/fixtures/configs/smoketest.toml +++ b/fixtures/configs/smoketest.toml @@ -65,7 +65,12 @@ method = "get" headers = [] # Remap URI matching pattern to different URI. -remap = [ "https://example.com http://example.invalid" ] +# This also supports (named) capturing groups. +remap = [ + "https://example.com http://example.invalid", + "https://example.com/(.*) http://example.org/$1", + "https://github.com/(?P.*)/(?P.*) https://gitlab.com/$org/$repo", +] # Base URL or website root directory to check relative URLs. base = "https://example.com" diff --git a/lychee-bin/src/commands/dump.rs b/lychee-bin/src/commands/dump.rs index 818a9cf..d16f725 100644 --- a/lychee-bin/src/commands/dump.rs +++ b/lychee-bin/src/commands/dump.rs @@ -47,7 +47,7 @@ where let mut request = request?; // Apply URI remappings (if any) - params.client.remap(&mut request.uri); + params.client.remap(&mut request.uri)?; // Avoid panic on broken pipe. // See https://github.com/rust-lang/rust/issues/46016 diff --git a/lychee-bin/src/parse.rs b/lychee-bin/src/parse.rs index 18bdeba..fddb7c4 100644 --- a/lychee-bin/src/parse.rs +++ b/lychee-bin/src/parse.rs @@ -61,7 +61,7 @@ mod tests { use headers::HeaderMap; use regex::Regex; - use reqwest::{header, Url}; + use reqwest::header; use super::*; @@ -89,6 +89,6 @@ mod tests { pattern.to_string(), Regex::new("https://example.com").unwrap().to_string() ); - assert_eq!(url, Url::try_from("http://127.0.0.1:8080").unwrap()); + assert_eq!(url, "http://127.0.0.1:8080"); } } diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index a1f58ba..75c4044 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1116,6 +1116,42 @@ mod cli { Ok(()) } + #[test] + fn test_remap_capture() -> Result<()> { + let mut cmd = main_command(); + + cmd.arg("--dump") + .arg("--remap") + .arg("https://example.com/(.*) http://example.org/$1") + .arg("--") + .arg("-") + .write_stdin("https://example.com/foo\n") + .env_clear() + .assert() + .success() + .stdout(contains("http://example.org/foo")); + + Ok(()) + } + + #[test] + fn test_remap_named_capture() -> Result<()> { + let mut cmd = main_command(); + + cmd.arg("--dump") + .arg("--remap") + .arg("https://github.com/(?P.*)/(?P.*) https://gitlab.com/$org/$repo") + .arg("--") + .arg("-") + .write_stdin("https://github.com/lycheeverse/lychee\n") + .env_clear() + .assert() + .success() + .stdout(contains("https://gitlab.com/lycheeverse/lychee")); + + Ok(()) + } + #[test] fn test_excluded_paths() -> Result<()> { let test_path = fixtures_path().join("exclude-path"); diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 857f990..35232bb 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -392,7 +392,7 @@ pub struct Client { /// Optional remapping rules for URIs matching pattern. remaps: Option, - /// Rules to decided whether each link would be checked or ignored. + /// Rules to decided whether each link should be checked or ignored. filter: Filter, /// Maximum number of retries per request before returning an error. @@ -446,47 +446,41 @@ impl Client { .. } = request.try_into()?; - self.remap(uri); + // Allow filtering based on element and attribute + // if !self.filter.is_allowed(uri) { + // return Ok(Response::new( + // uri.clone(), + // Status::Excluded, + // source, + // )); + // } - // TODO: Allow filtering based on element and attribute - let status = if self.is_excluded(uri) { - Status::Excluded - } else if uri.is_file() { - self.check_file(uri) - } else if uri.is_mail() { - #[cfg(all(feature = "email-check", feature = "native-tls"))] - { - self.check_mail(uri).await - } + self.remap(uri)?; - #[cfg(not(all(feature = "email-check", feature = "native-tls")))] - Status::Excluded - } else { - match self.check_website(uri, credentials).await { - Status::Ok(code) if self.require_https && uri.scheme() == "http" => { - if self - .check_website(&uri.to_https()?, credentials) - .await - .is_success() - { - Status::Error(ErrorKind::InsecureURL(uri.clone())) - } else { - Status::Ok(code) - } - } - s => s, - } + if self.is_excluded(uri) { + return Ok(Response::new(uri.clone(), Status::Excluded, source)); + } + + let status = match uri.scheme() { + _ if uri.is_file() => self.check_file(uri), + _ if uri.is_mail() => self.check_mail(uri).await, + _ => self.check_website(uri, credentials).await?, }; Ok(Response::new(uri.clone(), status, source)) } /// Remap `uri` using the client-defined remapping rules. - pub fn remap(&self, uri: &mut Uri) { - // TODO: this should be logged (Lucius, Jan 2023) + /// + /// # Errors + /// + /// Returns an `Err` if the final, remapped `uri` is not a valid URI. + pub fn remap(&self, uri: &mut Uri) -> Result<()> { if let Some(ref remaps) = self.remaps { - remaps.remap(&mut uri.url); + debug!("Remapping URI: {}", uri.url); + uri.url = remaps.remap(&uri.url)?; } + Ok(()) } /// Returns whether the given `uri` should be ignored from checking. @@ -495,6 +489,38 @@ impl Client { self.filter.is_excluded(uri) } + /// Checks the given URI of a website. + /// + /// # Errors + /// + /// This returns an `Err` if + /// - The URI is invalid. + /// - The request failed. + /// - The response status code is not accepted. + /// - The URI cannot be converted to HTTPS. + pub async fn check_website( + &self, + uri: &Uri, + credentials: &Option, + ) -> Result { + match self.check_website_inner(uri, credentials).await { + Status::Ok(code) if self.require_https && uri.scheme() == "http" => { + if self + .check_website_inner(&uri.to_https()?, credentials) + .await + .is_success() + { + Ok(Status::Error(ErrorKind::InsecureURL(uri.clone()))) + } else { + // HTTPS is not available for this URI, + // so the original HTTP URL is fine. + Ok(Status::Ok(code)) + } + } + s => Ok(s), + } + } + /// Checks the given URI of a website. /// /// Unsupported schemes will be ignored @@ -505,7 +531,7 @@ impl Client { /// - The URI is invalid. /// - The request failed. /// - The response status code is not accepted. - pub async fn check_website( + pub async fn check_website_inner( &self, uri: &Uri, credentials: &Option, @@ -650,6 +676,11 @@ impl Client { Status::Ok(StatusCode::OK) } } + + #[cfg(not(all(feature = "email-check", feature = "native-tls")))] + pub async fn check_mail(&self, uri: &Uri) -> Status { + Status::Excluded + } } // Check if the given `Url` would cause `reqwest` to panic. diff --git a/lychee-lib/src/remap.rs b/lychee-lib/src/remap.rs index 03d0d36..56cbd37 100644 --- a/lychee-lib/src/remap.rs +++ b/lychee-lib/src/remap.rs @@ -22,9 +22,9 @@ use std::ops::Index; use regex::Regex; -use reqwest::Url; +use url::Url; -use crate::ErrorKind; +use crate::{ErrorKind, Result}; /// Rules that remap matching URL patterns. /// @@ -35,28 +35,42 @@ use crate::ErrorKind; /// # Notes /// See module level documentation of usage notes. #[derive(Debug, Clone)] -pub struct Remaps(Vec<(Regex, Url)>); +pub struct Remaps(Vec<(Regex, String)>); impl Remaps { /// Create a new remapper #[must_use] - pub fn new(patterns: Vec<(Regex, Url)>) -> Self { + pub fn new(patterns: Vec<(Regex, String)>) -> Self { Self(patterns) } /// Returns an iterator over the rules. // `iter_mut` is deliberately avoided. - pub fn iter(&self) -> std::slice::Iter<(Regex, Url)> { + pub fn iter(&self) -> std::slice::Iter<(Regex, String)> { self.0.iter() } /// Remap URL against remapping rules. - pub fn remap(&self, url: &mut Url) { - for (pattern, new_url) in self { - if pattern.is_match(url.as_str()) { - *url = new_url.clone(); + /// + /// If there is no matching rule, the original URL is returned. + /// + /// # Errors + /// + /// Returns an `Err` if the remapping rule produces an invalid URL. + #[must_use = "Remapped URLs must be used"] + pub fn remap(&self, original: &Url) -> Result { + for (pattern, replacement) in self { + if pattern.is_match(original.as_str()) { + let after = pattern.replace_all(original.as_str(), replacement); + let after_url = Url::parse(after.as_ref()).map_err(|_| { + ErrorKind::InvalidUrlRemap(format!( + "The remapping pattern must produce a valid URL, but it is not: {after}" + )) + })?; + return Ok(after_url); } } + Ok(original.clone()) } /// Returns `true` if there is no remapping rule defined. @@ -73,9 +87,9 @@ impl Remaps { } impl Index for Remaps { - type Output = (Regex, Url); + type Output = (Regex, String); - fn index(&self, index: usize) -> &(regex::Regex, url::Url) { + fn index(&self, index: usize) -> &(regex::Regex, String) { &self.0[index] } } @@ -100,13 +114,14 @@ impl TryFrom<&[String]> for Remaps { for remap in remaps { let params: Vec<_> = remap.split_whitespace().collect(); if params.len() != 2 { - return Err(ErrorKind::InvalidUrlRemap(remap.to_string())); + return Err(ErrorKind::InvalidUrlRemap( + format!("Cannot parse into URI remapping, must be a Regex pattern and a URL separated by whitespaces: {remap}" + ))); } let pattern = Regex::new(params[0])?; - let url = Url::try_from(params[1]) - .map_err(|e| ErrorKind::ParseUrl(e, params[1].to_string()))?; - parsed.push((pattern, url)); + let replacement = params[1].to_string(); + parsed.push((pattern, replacement)); } Ok(Remaps::new(parsed)) @@ -116,9 +131,9 @@ impl TryFrom<&[String]> for Remaps { // Implementation for mutable iterator and moving iterator are deliberately // avoided impl<'a> IntoIterator for &'a Remaps { - type Item = &'a (Regex, Url); + type Item = &'a (Regex, String); - type IntoIter = std::slice::Iter<'a, (Regex, Url)>; + type IntoIter = std::slice::Iter<'a, (Regex, String)>; fn into_iter(self) -> Self::IntoIter { self.0.iter() @@ -127,42 +142,127 @@ impl<'a> IntoIterator for &'a Remaps { #[cfg(test)] mod tests { + use url::Url; + use super::*; #[test] fn test_remap() { - let pattern = Regex::new("https://example.com").unwrap(); - let new_url = Url::try_from("http://127.0.0.1:8080").unwrap(); - let remaps = Remaps::new(vec![(pattern, new_url.clone())]); + let input = "https://example.com"; + let input_url = Url::try_from(input).unwrap(); + let input_pattern = Regex::new(input).unwrap(); + let replacement = "http://127.0.0.1:8080"; + let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]); - let mut input = Url::try_from("https://example.com").unwrap(); - remaps.remap(&mut input); + let output = remaps.remap(&input_url).unwrap(); - assert_eq!(input, new_url); + assert_eq!(output, Url::try_from(replacement).unwrap()); } #[test] fn test_remap_path() { - let pattern = Regex::new("../../issues").unwrap(); - let new_url = Url::try_from("https://example.com").unwrap(); - let remaps = Remaps::new(vec![(pattern, new_url.clone())]); + let input = Url::try_from("file://../../issues").unwrap(); + let input_pattern = Regex::new(".*?../../issues").unwrap(); + let replacement = Url::try_from("https://example.com").unwrap(); + let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]); - let mut input = Url::try_from("file://../../issues").unwrap(); - remaps.remap(&mut input); + let output = remaps.remap(&input).unwrap(); - assert_eq!(input, new_url); + assert_eq!(output, replacement); } #[test] fn test_remap_skip() { + let input = Url::try_from("https://unrelated.example.com").unwrap(); let pattern = Regex::new("https://example.com").unwrap(); - let new_url = Url::try_from("http://127.0.0.1:8080").unwrap(); - let remaps = Remaps::new(vec![(pattern, new_url)]); + let replacement = Url::try_from("http://127.0.0.1:8080").unwrap(); + let remaps = Remaps::new(vec![(pattern, replacement.to_string())]); - let mut input = Url::try_from("https://unrelated.example.com").unwrap(); - remaps.remap(&mut input); + let output = remaps.remap(&input).unwrap(); // URL was not modified - assert_eq!(input, input); + assert_eq!(input, output); + } + + #[test] + fn test_remap_url_to_file() { + let pattern = Regex::new("https://docs.example.org").unwrap(); + let replacement = "file:///Users/user/code/repo/docs/_site"; + let remaps = Remaps::new(vec![(pattern, replacement.to_string())]); + + let tests = [ + ( + "https://docs.example.org/integrations/distcp.html", + "file:///Users/user/code/repo/docs/_site/integrations/distcp.html", + ), + ( + "https://docs.example.org/howto/import.html#working-with-imported-data", + "file:///Users/user/code/repo/docs/_site/howto/import.html#working-with-imported-data", + ), + ( + "https://docs.example.org/howto/garbage-collection-committed.html", + "file:///Users/user/code/repo/docs/_site/howto/garbage-collection-committed.html", + ), + ]; + + for (input, expected) in tests { + let input = Url::parse(input).unwrap(); + let output = remaps.remap(&input).unwrap(); + assert_eq!(output, Url::parse(expected).unwrap()); + } + } + + /// This is a partial remap, i.e. the URL is not fully replaced but only + /// part of it. The parts to be replaced are defined by the regex pattern + /// using capture groups. + #[test] + fn test_remap_capture_group() { + let input = Url::try_from("https://example.com/1/2/3").unwrap(); + let input_pattern = Regex::new("https://example.com/.*?/(.*?)/.*").unwrap(); + let replacement = Url::try_from("https://example.com/foo/$1/bar").unwrap(); + + let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]); + + let output = remaps.remap(&input).unwrap(); + + assert_eq!( + output, + Url::try_from("https://example.com/foo/2/bar").unwrap() + ); + } + + #[test] + fn test_remap_named_capture() { + let input = Url::try_from("https://example.com/1/2/3").unwrap(); + let input_pattern = Regex::new("https://example.com/.*?/(?P.*?)/.*").unwrap(); + let replacement = Url::try_from("https://example.com/foo/$foo/bar").unwrap(); + + let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]); + + let output = remaps.remap(&input).unwrap(); + + assert_eq!( + output, + Url::try_from("https://example.com/foo/2/bar").unwrap() + ); + } + + #[test] + fn test_remap_named_capture_shorthand() { + let input = Url::try_from("https://example.com/1/2/3").unwrap(); + #[allow(clippy::invalid_regex)] + // Clippy acts up here, but this syntax is actually valid + // See https://docs.rs/regex/latest/regex/index.html#grouping-and-flags + let input_pattern = Regex::new(r"https://example.com/.*?/(?.*?)/.*").unwrap(); + let replacement = Url::try_from("https://example.com/foo/$foo/bar").unwrap(); + + let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]); + + let output = remaps.remap(&input).unwrap(); + + assert_eq!( + output, + Url::try_from("https://example.com/foo/2/bar").unwrap() + ); } } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index d708e9f..3be8865 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -82,7 +82,7 @@ pub enum ErrorKind { InvalidBase(String, String), /// The given input can not be parsed into a valid URI remapping - #[error("Cannot parse into URI remapping, must be a Regex pattern and a URL separated by whitespaces: `{0}`")] + #[error("Error remapping URL: `{0}`")] InvalidUrlRemap(String), /// The given path does not resolve to a valid file