mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-13 08:13:14 +00:00
Extend remap feature (#1133)
* wip * Extend support for remapping This adds supports for partial remaps and capture groups to the remap feature. Fixes #1129
This commit is contained in:
parent
687190bde9
commit
97573123ef
8 changed files with 245 additions and 72 deletions
1
Makefile
1
Makefile
|
|
@ -36,6 +36,7 @@ docs: ## Generate and show documentation
|
|||
|
||||
.PHONY: lint
|
||||
lint: ## Run linter
|
||||
cargo fmt --all -- --check
|
||||
cargo clippy --all-targets --all-features -- -D warnings
|
||||
|
||||
.PHONY: test
|
||||
|
|
|
|||
7
fixtures/configs/smoketest.toml
vendored
7
fixtures/configs/smoketest.toml
vendored
|
|
@ -65,7 +65,12 @@ method = "get"
|
|||
headers = []
|
||||
|
||||
# Remap URI matching pattern to different URI.
|
||||
remap = [ "https://example.com http://example.invalid" ]
|
||||
# This also supports (named) capturing groups.
|
||||
remap = [
|
||||
"https://example.com http://example.invalid",
|
||||
"https://example.com/(.*) http://example.org/$1",
|
||||
"https://github.com/(?P<org>.*)/(?P<repo>.*) https://gitlab.com/$org/$repo",
|
||||
]
|
||||
|
||||
# Base URL or website root directory to check relative URLs.
|
||||
base = "https://example.com"
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ where
|
|||
let mut request = request?;
|
||||
|
||||
// Apply URI remappings (if any)
|
||||
params.client.remap(&mut request.uri);
|
||||
params.client.remap(&mut request.uri)?;
|
||||
|
||||
// Avoid panic on broken pipe.
|
||||
// See https://github.com/rust-lang/rust/issues/46016
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ mod tests {
|
|||
|
||||
use headers::HeaderMap;
|
||||
use regex::Regex;
|
||||
use reqwest::{header, Url};
|
||||
use reqwest::header;
|
||||
|
||||
use super::*;
|
||||
|
||||
|
|
@ -89,6 +89,6 @@ mod tests {
|
|||
pattern.to_string(),
|
||||
Regex::new("https://example.com").unwrap().to_string()
|
||||
);
|
||||
assert_eq!(url, Url::try_from("http://127.0.0.1:8080").unwrap());
|
||||
assert_eq!(url, "http://127.0.0.1:8080");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1116,6 +1116,42 @@ mod cli {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_capture() -> Result<()> {
|
||||
let mut cmd = main_command();
|
||||
|
||||
cmd.arg("--dump")
|
||||
.arg("--remap")
|
||||
.arg("https://example.com/(.*) http://example.org/$1")
|
||||
.arg("--")
|
||||
.arg("-")
|
||||
.write_stdin("https://example.com/foo\n")
|
||||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("http://example.org/foo"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_named_capture() -> Result<()> {
|
||||
let mut cmd = main_command();
|
||||
|
||||
cmd.arg("--dump")
|
||||
.arg("--remap")
|
||||
.arg("https://github.com/(?P<org>.*)/(?P<repo>.*) https://gitlab.com/$org/$repo")
|
||||
.arg("--")
|
||||
.arg("-")
|
||||
.write_stdin("https://github.com/lycheeverse/lychee\n")
|
||||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("https://gitlab.com/lycheeverse/lychee"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_excluded_paths() -> Result<()> {
|
||||
let test_path = fixtures_path().join("exclude-path");
|
||||
|
|
|
|||
|
|
@ -392,7 +392,7 @@ pub struct Client {
|
|||
/// Optional remapping rules for URIs matching pattern.
|
||||
remaps: Option<Remaps>,
|
||||
|
||||
/// Rules to decided whether each link would be checked or ignored.
|
||||
/// Rules to decided whether each link should be checked or ignored.
|
||||
filter: Filter,
|
||||
|
||||
/// Maximum number of retries per request before returning an error.
|
||||
|
|
@ -446,47 +446,41 @@ impl Client {
|
|||
..
|
||||
} = request.try_into()?;
|
||||
|
||||
self.remap(uri);
|
||||
// Allow filtering based on element and attribute
|
||||
// if !self.filter.is_allowed(uri) {
|
||||
// return Ok(Response::new(
|
||||
// uri.clone(),
|
||||
// Status::Excluded,
|
||||
// source,
|
||||
// ));
|
||||
// }
|
||||
|
||||
// TODO: Allow filtering based on element and attribute
|
||||
let status = if self.is_excluded(uri) {
|
||||
Status::Excluded
|
||||
} else if uri.is_file() {
|
||||
self.check_file(uri)
|
||||
} else if uri.is_mail() {
|
||||
#[cfg(all(feature = "email-check", feature = "native-tls"))]
|
||||
{
|
||||
self.check_mail(uri).await
|
||||
}
|
||||
self.remap(uri)?;
|
||||
|
||||
#[cfg(not(all(feature = "email-check", feature = "native-tls")))]
|
||||
Status::Excluded
|
||||
} else {
|
||||
match self.check_website(uri, credentials).await {
|
||||
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
|
||||
if self
|
||||
.check_website(&uri.to_https()?, credentials)
|
||||
.await
|
||||
.is_success()
|
||||
{
|
||||
Status::Error(ErrorKind::InsecureURL(uri.clone()))
|
||||
} else {
|
||||
Status::Ok(code)
|
||||
}
|
||||
}
|
||||
s => s,
|
||||
}
|
||||
if self.is_excluded(uri) {
|
||||
return Ok(Response::new(uri.clone(), Status::Excluded, source));
|
||||
}
|
||||
|
||||
let status = match uri.scheme() {
|
||||
_ if uri.is_file() => self.check_file(uri),
|
||||
_ if uri.is_mail() => self.check_mail(uri).await,
|
||||
_ => self.check_website(uri, credentials).await?,
|
||||
};
|
||||
|
||||
Ok(Response::new(uri.clone(), status, source))
|
||||
}
|
||||
|
||||
/// Remap `uri` using the client-defined remapping rules.
|
||||
pub fn remap(&self, uri: &mut Uri) {
|
||||
// TODO: this should be logged (Lucius, Jan 2023)
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an `Err` if the final, remapped `uri` is not a valid URI.
|
||||
pub fn remap(&self, uri: &mut Uri) -> Result<()> {
|
||||
if let Some(ref remaps) = self.remaps {
|
||||
remaps.remap(&mut uri.url);
|
||||
debug!("Remapping URI: {}", uri.url);
|
||||
uri.url = remaps.remap(&uri.url)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns whether the given `uri` should be ignored from checking.
|
||||
|
|
@ -495,6 +489,38 @@ impl Client {
|
|||
self.filter.is_excluded(uri)
|
||||
}
|
||||
|
||||
/// Checks the given URI of a website.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This returns an `Err` if
|
||||
/// - The URI is invalid.
|
||||
/// - The request failed.
|
||||
/// - The response status code is not accepted.
|
||||
/// - The URI cannot be converted to HTTPS.
|
||||
pub async fn check_website(
|
||||
&self,
|
||||
uri: &Uri,
|
||||
credentials: &Option<BasicAuthCredentials>,
|
||||
) -> Result<Status> {
|
||||
match self.check_website_inner(uri, credentials).await {
|
||||
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
|
||||
if self
|
||||
.check_website_inner(&uri.to_https()?, credentials)
|
||||
.await
|
||||
.is_success()
|
||||
{
|
||||
Ok(Status::Error(ErrorKind::InsecureURL(uri.clone())))
|
||||
} else {
|
||||
// HTTPS is not available for this URI,
|
||||
// so the original HTTP URL is fine.
|
||||
Ok(Status::Ok(code))
|
||||
}
|
||||
}
|
||||
s => Ok(s),
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks the given URI of a website.
|
||||
///
|
||||
/// Unsupported schemes will be ignored
|
||||
|
|
@ -505,7 +531,7 @@ impl Client {
|
|||
/// - The URI is invalid.
|
||||
/// - The request failed.
|
||||
/// - The response status code is not accepted.
|
||||
pub async fn check_website(
|
||||
pub async fn check_website_inner(
|
||||
&self,
|
||||
uri: &Uri,
|
||||
credentials: &Option<BasicAuthCredentials>,
|
||||
|
|
@ -650,6 +676,11 @@ impl Client {
|
|||
Status::Ok(StatusCode::OK)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(all(feature = "email-check", feature = "native-tls")))]
|
||||
pub async fn check_mail(&self, uri: &Uri) -> Status {
|
||||
Status::Excluded
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the given `Url` would cause `reqwest` to panic.
|
||||
|
|
|
|||
|
|
@ -22,9 +22,9 @@
|
|||
use std::ops::Index;
|
||||
|
||||
use regex::Regex;
|
||||
use reqwest::Url;
|
||||
use url::Url;
|
||||
|
||||
use crate::ErrorKind;
|
||||
use crate::{ErrorKind, Result};
|
||||
|
||||
/// Rules that remap matching URL patterns.
|
||||
///
|
||||
|
|
@ -35,28 +35,42 @@ use crate::ErrorKind;
|
|||
/// # Notes
|
||||
/// See module level documentation of usage notes.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Remaps(Vec<(Regex, Url)>);
|
||||
pub struct Remaps(Vec<(Regex, String)>);
|
||||
|
||||
impl Remaps {
|
||||
/// Create a new remapper
|
||||
#[must_use]
|
||||
pub fn new(patterns: Vec<(Regex, Url)>) -> Self {
|
||||
pub fn new(patterns: Vec<(Regex, String)>) -> Self {
|
||||
Self(patterns)
|
||||
}
|
||||
|
||||
/// Returns an iterator over the rules.
|
||||
// `iter_mut` is deliberately avoided.
|
||||
pub fn iter(&self) -> std::slice::Iter<(Regex, Url)> {
|
||||
pub fn iter(&self) -> std::slice::Iter<(Regex, String)> {
|
||||
self.0.iter()
|
||||
}
|
||||
|
||||
/// Remap URL against remapping rules.
|
||||
pub fn remap(&self, url: &mut Url) {
|
||||
for (pattern, new_url) in self {
|
||||
if pattern.is_match(url.as_str()) {
|
||||
*url = new_url.clone();
|
||||
///
|
||||
/// If there is no matching rule, the original URL is returned.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an `Err` if the remapping rule produces an invalid URL.
|
||||
#[must_use = "Remapped URLs must be used"]
|
||||
pub fn remap(&self, original: &Url) -> Result<Url> {
|
||||
for (pattern, replacement) in self {
|
||||
if pattern.is_match(original.as_str()) {
|
||||
let after = pattern.replace_all(original.as_str(), replacement);
|
||||
let after_url = Url::parse(after.as_ref()).map_err(|_| {
|
||||
ErrorKind::InvalidUrlRemap(format!(
|
||||
"The remapping pattern must produce a valid URL, but it is not: {after}"
|
||||
))
|
||||
})?;
|
||||
return Ok(after_url);
|
||||
}
|
||||
}
|
||||
Ok(original.clone())
|
||||
}
|
||||
|
||||
/// Returns `true` if there is no remapping rule defined.
|
||||
|
|
@ -73,9 +87,9 @@ impl Remaps {
|
|||
}
|
||||
|
||||
impl Index<usize> for Remaps {
|
||||
type Output = (Regex, Url);
|
||||
type Output = (Regex, String);
|
||||
|
||||
fn index(&self, index: usize) -> &(regex::Regex, url::Url) {
|
||||
fn index(&self, index: usize) -> &(regex::Regex, String) {
|
||||
&self.0[index]
|
||||
}
|
||||
}
|
||||
|
|
@ -100,13 +114,14 @@ impl TryFrom<&[String]> for Remaps {
|
|||
for remap in remaps {
|
||||
let params: Vec<_> = remap.split_whitespace().collect();
|
||||
if params.len() != 2 {
|
||||
return Err(ErrorKind::InvalidUrlRemap(remap.to_string()));
|
||||
return Err(ErrorKind::InvalidUrlRemap(
|
||||
format!("Cannot parse into URI remapping, must be a Regex pattern and a URL separated by whitespaces: {remap}"
|
||||
)));
|
||||
}
|
||||
|
||||
let pattern = Regex::new(params[0])?;
|
||||
let url = Url::try_from(params[1])
|
||||
.map_err(|e| ErrorKind::ParseUrl(e, params[1].to_string()))?;
|
||||
parsed.push((pattern, url));
|
||||
let replacement = params[1].to_string();
|
||||
parsed.push((pattern, replacement));
|
||||
}
|
||||
|
||||
Ok(Remaps::new(parsed))
|
||||
|
|
@ -116,9 +131,9 @@ impl TryFrom<&[String]> for Remaps {
|
|||
// Implementation for mutable iterator and moving iterator are deliberately
|
||||
// avoided
|
||||
impl<'a> IntoIterator for &'a Remaps {
|
||||
type Item = &'a (Regex, Url);
|
||||
type Item = &'a (Regex, String);
|
||||
|
||||
type IntoIter = std::slice::Iter<'a, (Regex, Url)>;
|
||||
type IntoIter = std::slice::Iter<'a, (Regex, String)>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.0.iter()
|
||||
|
|
@ -127,42 +142,127 @@ impl<'a> IntoIterator for &'a Remaps {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use url::Url;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_remap() {
|
||||
let pattern = Regex::new("https://example.com").unwrap();
|
||||
let new_url = Url::try_from("http://127.0.0.1:8080").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, new_url.clone())]);
|
||||
let input = "https://example.com";
|
||||
let input_url = Url::try_from(input).unwrap();
|
||||
let input_pattern = Regex::new(input).unwrap();
|
||||
let replacement = "http://127.0.0.1:8080";
|
||||
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
|
||||
|
||||
let mut input = Url::try_from("https://example.com").unwrap();
|
||||
remaps.remap(&mut input);
|
||||
let output = remaps.remap(&input_url).unwrap();
|
||||
|
||||
assert_eq!(input, new_url);
|
||||
assert_eq!(output, Url::try_from(replacement).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_path() {
|
||||
let pattern = Regex::new("../../issues").unwrap();
|
||||
let new_url = Url::try_from("https://example.com").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, new_url.clone())]);
|
||||
let input = Url::try_from("file://../../issues").unwrap();
|
||||
let input_pattern = Regex::new(".*?../../issues").unwrap();
|
||||
let replacement = Url::try_from("https://example.com").unwrap();
|
||||
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
|
||||
|
||||
let mut input = Url::try_from("file://../../issues").unwrap();
|
||||
remaps.remap(&mut input);
|
||||
let output = remaps.remap(&input).unwrap();
|
||||
|
||||
assert_eq!(input, new_url);
|
||||
assert_eq!(output, replacement);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_skip() {
|
||||
let input = Url::try_from("https://unrelated.example.com").unwrap();
|
||||
let pattern = Regex::new("https://example.com").unwrap();
|
||||
let new_url = Url::try_from("http://127.0.0.1:8080").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, new_url)]);
|
||||
let replacement = Url::try_from("http://127.0.0.1:8080").unwrap();
|
||||
let remaps = Remaps::new(vec![(pattern, replacement.to_string())]);
|
||||
|
||||
let mut input = Url::try_from("https://unrelated.example.com").unwrap();
|
||||
remaps.remap(&mut input);
|
||||
let output = remaps.remap(&input).unwrap();
|
||||
|
||||
// URL was not modified
|
||||
assert_eq!(input, input);
|
||||
assert_eq!(input, output);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_url_to_file() {
|
||||
let pattern = Regex::new("https://docs.example.org").unwrap();
|
||||
let replacement = "file:///Users/user/code/repo/docs/_site";
|
||||
let remaps = Remaps::new(vec![(pattern, replacement.to_string())]);
|
||||
|
||||
let tests = [
|
||||
(
|
||||
"https://docs.example.org/integrations/distcp.html",
|
||||
"file:///Users/user/code/repo/docs/_site/integrations/distcp.html",
|
||||
),
|
||||
(
|
||||
"https://docs.example.org/howto/import.html#working-with-imported-data",
|
||||
"file:///Users/user/code/repo/docs/_site/howto/import.html#working-with-imported-data",
|
||||
),
|
||||
(
|
||||
"https://docs.example.org/howto/garbage-collection-committed.html",
|
||||
"file:///Users/user/code/repo/docs/_site/howto/garbage-collection-committed.html",
|
||||
),
|
||||
];
|
||||
|
||||
for (input, expected) in tests {
|
||||
let input = Url::parse(input).unwrap();
|
||||
let output = remaps.remap(&input).unwrap();
|
||||
assert_eq!(output, Url::parse(expected).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
/// This is a partial remap, i.e. the URL is not fully replaced but only
|
||||
/// part of it. The parts to be replaced are defined by the regex pattern
|
||||
/// using capture groups.
|
||||
#[test]
|
||||
fn test_remap_capture_group() {
|
||||
let input = Url::try_from("https://example.com/1/2/3").unwrap();
|
||||
let input_pattern = Regex::new("https://example.com/.*?/(.*?)/.*").unwrap();
|
||||
let replacement = Url::try_from("https://example.com/foo/$1/bar").unwrap();
|
||||
|
||||
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
|
||||
|
||||
let output = remaps.remap(&input).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
output,
|
||||
Url::try_from("https://example.com/foo/2/bar").unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_named_capture() {
|
||||
let input = Url::try_from("https://example.com/1/2/3").unwrap();
|
||||
let input_pattern = Regex::new("https://example.com/.*?/(?P<foo>.*?)/.*").unwrap();
|
||||
let replacement = Url::try_from("https://example.com/foo/$foo/bar").unwrap();
|
||||
|
||||
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
|
||||
|
||||
let output = remaps.remap(&input).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
output,
|
||||
Url::try_from("https://example.com/foo/2/bar").unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remap_named_capture_shorthand() {
|
||||
let input = Url::try_from("https://example.com/1/2/3").unwrap();
|
||||
#[allow(clippy::invalid_regex)]
|
||||
// Clippy acts up here, but this syntax is actually valid
|
||||
// See https://docs.rs/regex/latest/regex/index.html#grouping-and-flags
|
||||
let input_pattern = Regex::new(r"https://example.com/.*?/(?<foo>.*?)/.*").unwrap();
|
||||
let replacement = Url::try_from("https://example.com/foo/$foo/bar").unwrap();
|
||||
|
||||
let remaps = Remaps::new(vec![(input_pattern, replacement.to_string())]);
|
||||
|
||||
let output = remaps.remap(&input).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
output,
|
||||
Url::try_from("https://example.com/foo/2/bar").unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ pub enum ErrorKind {
|
|||
InvalidBase(String, String),
|
||||
|
||||
/// The given input can not be parsed into a valid URI remapping
|
||||
#[error("Cannot parse into URI remapping, must be a Regex pattern and a URL separated by whitespaces: `{0}`")]
|
||||
#[error("Error remapping URL: `{0}`")]
|
||||
InvalidUrlRemap(String),
|
||||
|
||||
/// The given path does not resolve to a valid file
|
||||
|
|
|
|||
Loading…
Reference in a new issue