Avoid double-encoding already encoded destination paths

E.g. `web%20site` becomes `web site`.
That's because Url::from_file_path will encode the full URL in the end.
This behavior cannot be configured.
See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
This commit is contained in:
Matthias 2021-09-09 01:44:10 +02:00
parent a75cae54b1
commit 93948d7367
6 changed files with 32 additions and 13 deletions

1
Cargo.lock generated
View file

@ -1395,6 +1395,7 @@ dependencies = [
"markup5ever_rcdom",
"openssl-sys",
"path-clean",
"percent-encoding",
"pretty_assertions",
"pulldown-cmark",
"regex",

View file

@ -160,8 +160,8 @@ mod cli {
.env_clear()
.assert()
.success()
.stdout(contains("Total............4"))
.stdout(contains("Successful.......4"));
.stdout(contains("Total............3"))
.stdout(contains("Successful.......3"));
}
#[test]

View file

@ -42,6 +42,7 @@ typed-builder = "0.9.1"
url = { version = "2.2.2", features = ["serde"] }
log = "0.4.14"
path-clean = "0.1.0"
percent-encoding = "2.1.0"
[dev-dependencies]
doc-comment = "0.3.3"

View file

@ -262,7 +262,7 @@ impl Client {
return Status::Ok(StatusCode::OK);
}
}
ErrorKind::InvalidFileUri(uri.clone()).into()
ErrorKind::InvalidFilePath(uri.clone()).into()
}
pub async fn check_mail(&self, uri: &Uri) -> Status {

View file

@ -6,6 +6,7 @@ use html5ever::{
};
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use percent_encoding::percent_decode_str;
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use reqwest::Url;
@ -122,10 +123,16 @@ fn extract_links_from_plaintext(input: &str) -> Vec<String> {
.collect()
}
fn create_uri_from_path(root: &Path, base: &Option<Base>, link: &str) -> Result<Url> {
let link = url::remove_get_params_and_fragment(link);
let path = path::resolve(root, &PathBuf::from(&link), base)?;
Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))
fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Url> {
let dst = url::remove_get_params_and_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
// This behavior cannot be configured.
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
let decoded = percent_decode_str(dst).decode_utf8()?.to_string();
let path = path::resolve(src, &PathBuf::from(decoded), base)?;
Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidUrl(path))
}
#[cfg(test)]

View file

@ -14,6 +14,8 @@ pub enum ErrorKind {
// reading files
/// Any form of I/O error occurred while reading from a given path.
IoError(Option<PathBuf>, std::io::Error),
/// Errors which can occur when attempting to interpret a sequence of u8 as a string
Utf8Error(std::str::Utf8Error),
/// Network error when trying to connect to an endpoint via reqwest.
ReqwestError(reqwest::Error),
/// Network error when trying to connect to an endpoint via hubcaps.
@ -21,9 +23,9 @@ pub enum ErrorKind {
/// The given string can not be parsed into a valid URL, e-mail address, or file path
UrlParseError(String, (url::ParseError, Option<fast_chemail::ParseError>)),
/// The given URI cannot be converted to a file path
InvalidFileUri(Uri),
InvalidFilePath(Uri),
/// The given path cannot be converted to a URI
InvalidPath(PathBuf),
InvalidUrl(PathBuf),
/// The given mail address is unreachable
UnreachableEmailAddress(Uri),
/// The given header could not be parsed.
@ -74,8 +76,9 @@ impl Hash for ErrorKind {
Self::HubcapsError(e) => e.to_string().hash(state),
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
Self::InvalidPath(p) => p.hash(state),
Self::InvalidFileUri(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
Self::InvalidUrl(p) => p.hash(state),
Self::Utf8Error(e) => e.to_string().hash(state),
Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
u.hash(state);
}
Self::InvalidBase(base, e) => (base, e).hash(state),
@ -109,8 +112,8 @@ impl Display for ErrorKind {
Self::UrlParseError(s, (url_err, None)) => {
write!(f, "Cannot parse {} as website url ({})", s, url_err)
}
Self::InvalidFileUri(u) => write!(f, "Invalid file URI: {}", u),
Self::InvalidPath(p) => write!(f, "Invalid path: {}", p.display()),
Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u),
Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()),
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
Self::InvalidHeader(e) => e.fmt(f),
Self::InvalidGlobPattern(e) => e.fmt(f),
@ -124,6 +127,7 @@ impl Display for ErrorKind {
uri
),
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
Self::Utf8Error(e) => e.fmt(f),
}
}
}
@ -143,6 +147,12 @@ impl From<(PathBuf, std::io::Error)> for ErrorKind {
}
}
impl From<std::str::Utf8Error> for ErrorKind {
fn from(e: std::str::Utf8Error) -> Self {
Self::Utf8Error(e)
}
}
impl From<std::io::Error> for ErrorKind {
fn from(e: std::io::Error) -> Self {
Self::IoError(None, e)