Extend remap feature (#1133)

* wip

* Extend support for remapping

This adds supports for partial remaps and
capture groups to the remap feature.

Fixes #1129
This commit is contained in:
Matthias Endler 2023-07-05 15:05:19 +02:00 committed by GitHub
parent 687190bde9
commit 97573123ef
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 245 additions and 72 deletions

View file

@ -36,6 +36,7 @@ docs: ## Generate and show documentation
.PHONY: lint
lint: ## Run linter
cargo fmt --all -- --check
cargo clippy --all-targets --all-features -- -D warnings
.PHONY: test

View file

@ -65,7 +65,12 @@ method = "get"
headers = []
# Remap URI matching pattern to different URI.
remap = [ "https://example.com http://example.invalid" ]
# This also supports (named) capturing groups.
remap = [
"https://example.com http://example.invalid",
"https://example.com/(.*) http://example.org/$1",
"https://github.com/(?P<org>.*)/(?P<repo>.*) https://gitlab.com/$org/$repo",
]
# Base URL or website root directory to check relative URLs.
base = "https://example.com"

View file

@ -47,7 +47,7 @@ where
let mut request = request?;
// Apply URI remappings (if any)
params.client.remap(&mut request.uri);
params.client.remap(&mut request.uri)?;
// Avoid panic on broken pipe.
// See https://github.com/rust-lang/rust/issues/46016

View file

@ -61,7 +61,7 @@ mod tests {
use headers::HeaderMap;
use regex::Regex;
use reqwest::{header, Url};
use reqwest::header;
use super::*;
@ -89,6 +89,6 @@ mod tests {
pattern.to_string(),
Regex::new("https://example.com").unwrap().to_string()
);
assert_eq!(url, Url::try_from("http://127.0.0.1:8080").unwrap());
assert_eq!(url, "http://127.0.0.1:8080");
}
}

View file

@ -1116,6 +1116,42 @@ mod cli {
Ok(())
}
#[test]
fn test_remap_capture() -> Result<()> {
let mut cmd = main_command();
cmd.arg("--dump")
.arg("--remap")
.arg("https://example.com/(.*) http://example.org/$1")
.arg("--")
.arg("-")
.write_stdin("https://example.com/foo\n")
.env_clear()
.assert()
.success()
.stdout(contains("http://example.org/foo"));
Ok(())
}
#[test]
fn test_remap_named_capture() -> Result<()> {
let mut cmd = main_command();
cmd.arg("--dump")
.arg("--remap")
.arg("https://github.com/(?P<org>.*)/(?P<repo>.*) https://gitlab.com/$org/$repo")
.arg("--")
.arg("-")
.write_stdin("https://github.com/lycheeverse/lychee\n")
.env_clear()
.assert()
.success()
.stdout(contains("https://gitlab.com/lycheeverse/lychee"));
Ok(())
}
#[test]
fn test_excluded_paths() -> Result<()> {
let test_path = fixtures_path().join("exclude-path");

View file

@ -392,7 +392,7 @@ pub struct Client {
/// Optional remapping rules for URIs matching pattern.
remaps: Option<Remaps>,
/// Rules to decided whether each link would be checked or ignored.
/// Rules to decided whether each link should be checked or ignored.
filter: Filter,
/// Maximum number of retries per request before returning an error.
@ -446,47 +446,41 @@ impl Client {
..
} = request.try_into()?;
self.remap(uri);
// Allow filtering based on element and attribute
// if !self.filter.is_allowed(uri) {
// return Ok(Response::new(
// uri.clone(),
// Status::Excluded,
// source,
// ));
// }
// TODO: Allow filtering based on element and attribute
let status = if self.is_excluded(uri) {
Status::Excluded
} else if uri.is_file() {
self.check_file(uri)
} else if uri.is_mail() {
#[cfg(all(feature = "email-check", feature = "native-tls"))]
{
self.check_mail(uri).await
}
self.remap(uri)?;
#[cfg(not(all(feature = "email-check", feature = "native-tls")))]
Status::Excluded
} else {
match self.check_website(uri, credentials).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
if self
.check_website(&uri.to_https()?, credentials)
.await
.is_success()
{
Status::Error(ErrorKind::InsecureURL(uri.clone()))
} else {
Status::Ok(code)
}
}
s => s,
}
if self.is_excluded(uri) {
return Ok(Response::new(uri.clone(), Status::Excluded, source));
}
let status = match uri.scheme() {
_ if uri.is_file() => self.check_file(uri),
_ if uri.is_mail() => self.check_mail(uri).await,
_ => self.check_website(uri, credentials).await?,
};
Ok(Response::new(uri.clone(), status, source))
}
/// Remap `uri` using the client-defined remapping rules.
pub fn remap(&self, uri: &mut Uri) {
// TODO: this should be logged (Lucius, Jan 2023)
///
/// # Errors
///
/// Returns an `Err` if the final, remapped `uri` is not a valid URI.
pub fn remap(&self, uri: &mut Uri) -> Result<()> {
if let Some(ref remaps) = self.remaps {
remaps.remap(&mut uri.url);
debug!("Remapping URI: {}", uri.url);
uri.url = remaps.remap(&uri.url)?;
}
Ok(())
}
/// Returns whether the given `uri` should be ignored from checking.
@ -495,6 +489,38 @@ impl Client {
self.filter.is_excluded(uri)
}
/// Checks the given URI of a website.
///
/// # Errors
///
/// This returns an `Err` if
/// - The URI is invalid.
/// - The request failed.
/// - The response status code is not accepted.
/// - The URI cannot be converted to HTTPS.
pub async fn check_website(
&self,
uri: &Uri,
credentials: &Option<BasicAuthCredentials>,
) -> Result<Status> {
match self.check_website_inner(uri, credentials).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
if self
.check_website_inner(&uri.to_https()?, credentials)
.await
.is_success()
{
Ok(Status::Error(ErrorKind::InsecureURL(uri.clone())))
} else {
// HTTPS is not available for this URI,
// so the original HTTP URL is fine.
Ok(Status::Ok(code))
}
}
s => Ok(s),
}
}
/// Checks the given URI of a website.
///
/// Unsupported schemes will be ignored
@ -505,7 +531,7 @@ impl Client {
/// - The URI is invalid.
/// - The request failed.
/// - The response status code is not accepted.
pub async fn check_website(
pub async fn check_website_inner(
&self,
uri: &Uri,
credentials: &Option<BasicAuthCredentials>,
@ -650,6 +676,11 @@ impl Client {
Status::Ok(StatusCode::OK)
}
}
#[cfg(not(all(feature = "email-check", feature = "native-tls")))]
pub async fn check_mail(&self, uri: &Uri) -> Status {
Status::Excluded
}
}
// Check if the given `Url` would cause `reqwest` to panic.

View file

@ -22,9 +22,9 @@
use std::ops::Index;
use regex::Regex;
use reqwest::Url;
use url::Url;
use crate::ErrorKind;
use crate::{ErrorKind, Result};
/// Rules that remap matching URL patterns.
///
@ -35,28 +35,42 @@ use crate::ErrorKind;
/// # Notes
/// See module level documentation of usage notes.
#[derive(Debug, Clone)]
pub struct Remaps(Vec<(Regex, Url)>);
pub struct Remaps(Vec<(Regex, String)>);
impl Remaps {
/// Create a new remapper
#[must_use]
pub fn new(patterns: Vec<(Regex, Url)>) -> Self {
pub fn new(patterns: Vec<(Regex, String)>) -> Self {
Self(patterns)
}
/// Returns an iterator over the rules.
// `iter_mut` is deliberately avoided.
pub fn iter(&self) -> std::slice::Iter<(Regex, Url)> {
pub fn iter(&self) -> std::slice::Iter<(Regex, String)> {
self.0.iter()
}
/// Remap URL against remapping rules.
pub fn remap(&self, url: &mut Url) {
for (pattern, new_url) in self {
if pattern.is_match(url.as_str()) {
*url = new_url.clone();
///
/// If there is no matching rule, the original URL is returned.
///
/// # Errors
///
/// Returns an `Err` if the remapping rule produces an invalid URL.
#[must_use = "Remapped URLs must be used"]
pub fn remap(&self, original: &Url) -> Result<Url> {
for (pattern, replacement) in self {
if pattern.is_match(original.as_str()) {
let after = pattern.replace_all(original.as_str(), replacement);
let after_url = Url::parse(after.as_ref()).map_err(|_| {
ErrorKind::InvalidUrlRemap(format!(
"The remapping pattern must produce a valid URL, but it is not: {after}"
))
})?;
return Ok(after_url);
}
}
Ok(original.clone())
}
/// Returns `true` if there is no remapping rule defined.
@ -73,9 +87,9 @@ impl Remaps {
}
impl Index<usize> for Remaps {
type Output = (Regex, Url);
type Output = (Regex, String);
fn index(&self, index: usize) -> &(regex::Regex, url::Url) {
fn index(&self, index: usize) -> &(regex::Regex, String) {
&self.0[index]
}
}
@ -100,13 +114,14 @@ impl TryFrom<&[String]> for Remaps {
for remap in remaps {
let params: Vec<_> = remap.split_whitespace().collect();
if params.len() != 2 {
return Err(ErrorKind::InvalidUrlRemap(remap.to_string()));
return Err(ErrorKind::InvalidUrlRemap(
format!("Cannot parse into URI remapping, must be a Regex pattern and a URL separated by whitespaces: {remap}"
)));
}
let pattern = Regex::new(params[0])?;
let url = Url::try_from(params[1])
.map_err(|e| ErrorKind::ParseUrl(e, params[1].to_string()))?;
parsed.push((pattern, url));
let replacement = params[1].to_string();
parsed.push((pattern, replacement));
}
Ok(Remaps::new(parsed))
@ -116,9 +131,9 @@ impl TryFrom<&[String]> for Remaps {
// Implementation for mutable iterator and moving iterator are deliberately
// avoided
impl<'a> IntoIterator for &'a Remaps {
type Item = &'a (Regex, Url);
type Item = &'a (Regex, String);
type IntoIter = std::slice::Iter<'a, (Regex, Url)>;
type IntoIter = std::slice::Iter<'a, (Regex, String)>;
fn into_iter(self) -> Self::IntoIter {
self.0.iter()
@ -127,42 +142,127 @@ impl<'a> IntoIterator for &'a Remaps {
#[cfg(test)]
mod tests {
use url::Url;
use super::*;
#[test]
fn test_remap() {
let pattern = Regex::new("https://example.com").unwrap();
let new_url = Url::try_from("http://127.0.0.1:8080").unwrap();
let remaps = Remaps::new(vec![(pattern, new_url.clone())]);
let input = "https://example.com";
let input_url = Url::try_from(input).unwrap();
let input_pattern = Regex::new(input).unwrap();
let replacement = "http://127.0.0.1:8080";
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
let mut input = Url::try_from("https://example.com").unwrap();
remaps.remap(&mut input);
let output = remaps.remap(&input_url).unwrap();
assert_eq!(input, new_url);
assert_eq!(output, Url::try_from(replacement).unwrap());
}
#[test]
fn test_remap_path() {
let pattern = Regex::new("../../issues").unwrap();
let new_url = Url::try_from("https://example.com").unwrap();
let remaps = Remaps::new(vec![(pattern, new_url.clone())]);
let input = Url::try_from("file://../../issues").unwrap();
let input_pattern = Regex::new(".*?../../issues").unwrap();
let replacement = Url::try_from("https://example.com").unwrap();
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
let mut input = Url::try_from("file://../../issues").unwrap();
remaps.remap(&mut input);
let output = remaps.remap(&input).unwrap();
assert_eq!(input, new_url);
assert_eq!(output, replacement);
}
#[test]
fn test_remap_skip() {
let input = Url::try_from("https://unrelated.example.com").unwrap();
let pattern = Regex::new("https://example.com").unwrap();
let new_url = Url::try_from("http://127.0.0.1:8080").unwrap();
let remaps = Remaps::new(vec![(pattern, new_url)]);
let replacement = Url::try_from("http://127.0.0.1:8080").unwrap();
let remaps = Remaps::new(vec![(pattern, replacement.to_string())]);
let mut input = Url::try_from("https://unrelated.example.com").unwrap();
remaps.remap(&mut input);
let output = remaps.remap(&input).unwrap();
// URL was not modified
assert_eq!(input, input);
assert_eq!(input, output);
}
#[test]
fn test_remap_url_to_file() {
let pattern = Regex::new("https://docs.example.org").unwrap();
let replacement = "file:///Users/user/code/repo/docs/_site";
let remaps = Remaps::new(vec![(pattern, replacement.to_string())]);
let tests = [
(
"https://docs.example.org/integrations/distcp.html",
"file:///Users/user/code/repo/docs/_site/integrations/distcp.html",
),
(
"https://docs.example.org/howto/import.html#working-with-imported-data",
"file:///Users/user/code/repo/docs/_site/howto/import.html#working-with-imported-data",
),
(
"https://docs.example.org/howto/garbage-collection-committed.html",
"file:///Users/user/code/repo/docs/_site/howto/garbage-collection-committed.html",
),
];
for (input, expected) in tests {
let input = Url::parse(input).unwrap();
let output = remaps.remap(&input).unwrap();
assert_eq!(output, Url::parse(expected).unwrap());
}
}
/// This is a partial remap, i.e. the URL is not fully replaced but only
/// part of it. The parts to be replaced are defined by the regex pattern
/// using capture groups.
#[test]
fn test_remap_capture_group() {
let input = Url::try_from("https://example.com/1/2/3").unwrap();
let input_pattern = Regex::new("https://example.com/.*?/(.*?)/.*").unwrap();
let replacement = Url::try_from("https://example.com/foo/$1/bar").unwrap();
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
let output = remaps.remap(&input).unwrap();
assert_eq!(
output,
Url::try_from("https://example.com/foo/2/bar").unwrap()
);
}
#[test]
fn test_remap_named_capture() {
let input = Url::try_from("https://example.com/1/2/3").unwrap();
let input_pattern = Regex::new("https://example.com/.*?/(?P<foo>.*?)/.*").unwrap();
let replacement = Url::try_from("https://example.com/foo/$foo/bar").unwrap();
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
let output = remaps.remap(&input).unwrap();
assert_eq!(
output,
Url::try_from("https://example.com/foo/2/bar").unwrap()
);
}
#[test]
fn test_remap_named_capture_shorthand() {
let input = Url::try_from("https://example.com/1/2/3").unwrap();
#[allow(clippy::invalid_regex)]
// Clippy acts up here, but this syntax is actually valid
// See https://docs.rs/regex/latest/regex/index.html#grouping-and-flags
let input_pattern = Regex::new(r"https://example.com/.*?/(?<foo>.*?)/.*").unwrap();
let replacement = Url::try_from("https://example.com/foo/$foo/bar").unwrap();
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
let output = remaps.remap(&input).unwrap();
assert_eq!(
output,
Url::try_from("https://example.com/foo/2/bar").unwrap()
);
}
}

View file

@ -82,7 +82,7 @@ pub enum ErrorKind {
InvalidBase(String, String),
/// The given input can not be parsed into a valid URI remapping
#[error("Cannot parse into URI remapping, must be a Regex pattern and a URL separated by whitespaces: `{0}`")]
#[error("Error remapping URL: `{0}`")]
InvalidUrlRemap(String),
/// The given path does not resolve to a valid file