Harden URL detection and extend verbatim elements (#899)

Previously remote URLs were incorrectly detected because the
string representation of a path is different than the path itself,
causing the `http` prefix match to be insufficient.

This resulted in unexpected side-effects, such as the
incorrect detection of verbatim mode for remote URLs.

The check now got improved and unit tests were added to avoid
future breakage. On top of that, missing verbatim elements were added
This commit is contained in:
Matthias Endler 2023-01-04 00:38:19 +01:00 committed by GitHub
parent f19ba8b5c4
commit 5654b7c317
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 135 additions and 19 deletions

28
fixtures/TEST_VERBATIM.html vendored Normal file
View file

@ -0,0 +1,28 @@
<!-- Test URLs in verbatim HTML elements -->
<html>
<head>
<title>Verbatim HTML</title>
</head>
<body>
<h1>Verbatim HTML</h1>
<p>Some verbatim HTML elements:</p>
<pre>http://www.example.com/pre</pre>
<code>http://www.example.com/code</code>
<samp> http://www.example.com/samp </samp>
<kbd>http://www.example.com/kbd</kbd>
<var>http://www.example.com/var</var>
<address>http://www.example.com/address</address>
<script>
// http://www.example.com/script
"http://www.example.com/script";
</script>
</body>
</html>

View file

@ -19,6 +19,7 @@ mod cli {
// constant.
const LYCHEE_CACHE_FILE: &str = ".lycheecache";
/// Helper macro to create a mock server which returns a custom status code.
macro_rules! mock_server {
($status:expr $(, $func:tt ($($arg:expr),*))*) => {{
let mock_server = wiremock::MockServer::start().await;
@ -29,11 +30,25 @@ mod cli {
}};
}
/// Helper macro to create a mock server which returns a 200 OK and a custom response body.
macro_rules! mock_response {
($body:expr) => {{
let mock_server = wiremock::MockServer::start().await;
let template = wiremock::ResponseTemplate::new(200).set_body_string($body);
wiremock::Mock::given(wiremock::matchers::method("GET"))
.respond_with(template)
.mount(&mock_server)
.await;
mock_server
}};
}
/// Gets the "main" binary name (e.g. `lychee`)
fn main_command() -> Command {
// this gets the "main" binary name (e.g. `lychee`)
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name")
}
/// Helper function to get the path to the fixtures directory.
fn fixtures_path() -> PathBuf {
Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
@ -82,6 +97,7 @@ mod cli {
}
}
/// Helper macro to test the output of the JSON format.
macro_rules! test_json_output {
($test_file:expr, $expected:expr $(, $arg:expr)*) => {{
let mut cmd = main_command();
@ -846,6 +862,20 @@ mod cli {
Ok(())
}
#[test]
fn test_verbatim_skipped_by_default() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");
cmd.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(is_empty());
Ok(())
}
#[test]
fn test_include_verbatim() -> Result<()> {
let mut cmd = main_command();
@ -863,13 +893,15 @@ mod cli {
Ok(())
}
#[test]
fn test_exclude_verbatim() -> Result<()> {
#[tokio::test]
async fn test_verbatim_skipped_by_default_via_remote_url() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");
let file = fixtures_path().join("TEST_VERBATIM.html");
let body = fs::read_to_string(file)?;
let mock_server = mock_response!(body);
cmd.arg(input)
.arg("--dump")
cmd.arg("--dump")
.arg(mock_server.uri())
.assert()
.success()
.stdout(is_empty());
@ -877,6 +909,28 @@ mod cli {
Ok(())
}
#[tokio::test]
async fn test_include_verbatim_via_remote_url() -> Result<()> {
let mut cmd = main_command();
let file = fixtures_path().join("TEST_VERBATIM.html");
let body = fs::read_to_string(file)?;
let mock_server = mock_response!(body);
cmd.arg("--include-verbatim")
.arg("--dump")
.arg(mock_server.uri())
.assert()
.success()
.stdout(contains("http://www.example.com/pre"))
.stdout(contains("http://www.example.com/code"))
.stdout(contains("http://www.example.com/samp"))
.stdout(contains("http://www.example.com/kbd"))
.stdout(contains("http://www.example.com/var"))
.stdout(contains("http://www.example.com/address"))
.stdout(contains("http://www.example.com/script"));
Ok(())
}
#[test]
fn test_require_https() -> Result<()> {
let mut cmd = main_command();

View file

@ -17,7 +17,18 @@ use plaintext::extract_plaintext;
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
matches!(
name,
"code" | "listing" | "plaintext" | "samp" | "script" | "textarea" | "xmp" | "pre"
"address"
| "code"
| "kbd"
| "listing"
| "noscript"
| "plaintext"
| "pre"
| "samp"
| "script"
| "textarea"
| "var"
| "xmp"
)
}
@ -108,6 +119,7 @@ mod tests {
assert!(is_verbatim_elem("pre"));
assert!(is_verbatim_elem("code"));
assert!(is_verbatim_elem("listing"));
assert!(is_verbatim_elem("script"));
}
#[test]

View file

@ -39,6 +39,7 @@ pub(crate) fn website(url: &str) -> Uri {
Uri::from(Url::parse(url).expect("Expected valid Website URI"))
}
/// Creates a mail URI from a string
pub(crate) fn mail(address: &str) -> Uri {
if address.starts_with("mailto:") {
Url::parse(address)
@ -49,6 +50,7 @@ pub(crate) fn mail(address: &str) -> Uri {
.into()
}
/// Loads a fixture from the `fixtures` directory
pub(crate) fn load_fixture(filename: &str) -> String {
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()

View file

@ -19,18 +19,22 @@ impl Default for FileType {
impl<P: AsRef<Path>> From<P> for FileType {
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
//
// Assume HTML in case of no extension.
//
// This is only reasonable for URLs, not paths on disk. For example,
// a file named `README` without an extension is more likely to be a
// plaintext file.
//
// A better solution would be to also implement `From<Url> for
// FileType`. Unfortunately that's not possible without refactoring, as
// `AsRef<Path>` could be implemented for `Url` in the future, which is
// why `From<Url> for FileType` is not allowed (orphan rule).
//
// As a workaround, we check if the scheme is `http` or `https` and
// assume HTML in that case.
fn from(p: P) -> FileType {
let path = p.as_ref();
// Assume HTML in case of no extension.
// Note: this is only reasonable for URLs; not paths on disk.
// For example, `README` without an extension is more likely to be a plaintext file.
// A better solution would be to also implement `From<Url> for FileType`.
// Unfortunately that's not possible without refactoring, as
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
// `From<Url> for FileType` is not allowed.
// As a workaround, we check if we got a known web-protocol
let is_url = path.starts_with("http");
match path
.extension()
.and_then(std::ffi::OsStr::to_str)
@ -42,12 +46,19 @@ impl<P: AsRef<Path>> From<P> for FileType {
FileType::Markdown
}
Some("htm" | "html") => FileType::Html,
None if is_url => FileType::Html,
_ => FileType::Plaintext,
None if is_url(path) => FileType::Html,
_ => FileType::default(),
}
}
}
/// Helper function to check if a path is likely a URL.
fn is_url(path: &Path) -> bool {
path.to_str().map_or(false, |s| {
s.starts_with("http://") || s.starts_with("https://")
})
}
#[cfg(test)]
mod tests {
use super::*;
@ -73,4 +84,13 @@ mod tests {
FileType::Html
);
}
#[test]
fn test_is_url() {
assert!(is_url(Path::new("http://foo.com")));
assert!(is_url(Path::new("https://foo.com")));
assert!(!is_url(Path::new("foo.com")));
assert!(!is_url(Path::new("foo")));
assert!(!is_url(Path::new("foo/bar")));
}
}