From 92a9bca23f7a16124315d49f5b5e334311e29356 Mon Sep 17 00:00:00 2001 From: MichaIng Date: Sun, 6 Jul 2025 10:46:06 +0200 Subject: [PATCH] feat: skip fragment checking for unsupported MIME types (#1744) * feat: skip fragment checking for unsupported MIME types The remote URL/website checker currently passes all URLs with fragments to the fragment checker as HTML document, even if it is a different or unsupported MIME type. This can cause false fragment checking for Markdown documents, failures for other MIME types, especially binaries, and unnecessary traffic for large downloads, which are always finished completely, if the fragment checker is invoked. This commit checks the Content-Type header of the response: - Only if it is `text/html`, it is passed to the fragment checker as HTML type. - Only if it is `text/markdown`, of `text/plain` and URL path ends on `.md`, it is passed to the fragment checker as Markdown type. - In all other cases, the fragment checker is skipped and the HTTP status is returned. To invoke the fragment checker with a variable document type, a new `FileType` argument is added to the `check_html_fragment()` function. The fragment checker test and fixture are adjusted to match the expected result: checking a binary file via remote URL with fragment is now expected to succeed, since its Content-Type header does not invoke the fragment checker anymore. Signed-off-by: MichaIng * Update fixtures/fragments/file1.md Co-authored-by: MichaIng --------- Signed-off-by: MichaIng Co-authored-by: Matthias Endler --- fixtures/fragments/file1.md | 13 ++------- lychee-bin/tests/cli.rs | 4 +-- lychee-lib/src/checker/website.rs | 46 ++++++++++++++++++++++--------- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/fixtures/fragments/file1.md b/fixtures/fragments/file1.md index 4d99aa0..092ca99 100644 --- a/fixtures/fragments/file1.md +++ b/fixtures/fragments/file1.md @@ -104,17 +104,10 @@ Even with fragment checking enabled, the following links must hence succeed: [Link to remote binary file without fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin) [Link to remote binary file with empty fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin#) -## Local file with fragment +## With fragment -For local files URIs with fragment, the fragment checker is invoked and fails to read the content, -but the file checker emits a warning only. The following link hence must succeed as well: +Fragment checking is skipped if the Content-Type header is not "text/html", "text/markdown", or "text/plain" with ".md" URL path ending. +Hence, despite containing fragments which cannot be checked in binary files, the following links are expected to succeed with a HTTP 200 status: [Link to local binary file with fragment](zero.bin#fragment) - -## Remote URL with fragment - -Right now, there is not MIME/content type based exclusion for fragment checks in the website checker. -Also, other than the file checker, the website checker throws an error if reading the response body fails. -The following link hence must fail: - [Link to remote binary file with fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin#fragment) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 31a360b..7bf9f73 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1915,9 +1915,9 @@ mod cli { "https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin#fragment", )) .stdout(contains("42 Total")) - .stdout(contains("29 OK")) + .stdout(contains("30 OK")) // Failures because of missing fragments or failed binary body scan - .stdout(contains("13 Errors")); + .stdout(contains("12 Errors")); } #[test] diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index f4393b2..ef1a2a5 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -1,5 +1,5 @@ use crate::{ - BasicAuthCredentials, ErrorKind, Status, Uri, + BasicAuthCredentials, ErrorKind, FileType, Status, Uri, chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain}, quirks::Quirks, retry::RetryExt, @@ -9,8 +9,8 @@ use crate::{ use async_trait::async_trait; use http::{Method, StatusCode}; use octocrab::Octocrab; -use reqwest::{Request, Response}; -use std::{collections::HashSet, time::Duration}; +use reqwest::{Request, Response, header::CONTENT_TYPE}; +use std::{collections::HashSet, path::Path, time::Duration}; #[derive(Debug, Clone)] pub(crate) struct WebsiteChecker { @@ -108,7 +108,28 @@ impl WebsiteChecker { && method == Method::GET && response.url().fragment().is_some_and(|x| !x.is_empty()) { - self.check_html_fragment(status, response).await + let Some(content_type) = response + .headers() + .get(CONTENT_TYPE) + .and_then(|header| header.to_str().ok()) + else { + return status; + }; + + let file_type = match content_type { + ct if ct.starts_with("text/html") => FileType::Html, + ct if ct.starts_with("text/markdown") => FileType::Markdown, + ct if ct.starts_with("text/plain") => { + let path = Path::new(response.url().path()); + match path.extension() { + Some(ext) if ext.eq_ignore_ascii_case("md") => FileType::Markdown, + _ => return status, + } + } + _ => return status, + }; + + self.check_html_fragment(status, response, file_type).await } else { status } @@ -117,19 +138,18 @@ impl WebsiteChecker { } } - async fn check_html_fragment(&self, status: Status, response: Response) -> Status { + async fn check_html_fragment( + &self, + status: Status, + response: Response, + file_type: FileType, + ) -> Status { let url = response.url().clone(); match response.text().await { - Ok(text) => { + Ok(content) => { match self .fragment_checker - .check( - FragmentInput { - content: text, - file_type: crate::FileType::Html, - }, - &url, - ) + .check(FragmentInput { content, file_type }, &url) .await { Ok(true) => status,