diff --git a/fixtures/fragments/file1.md b/fixtures/fragments/file1.md index 4d99aa0..092ca99 100644 --- a/fixtures/fragments/file1.md +++ b/fixtures/fragments/file1.md @@ -104,17 +104,10 @@ Even with fragment checking enabled, the following links must hence succeed: [Link to remote binary file without fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin) [Link to remote binary file with empty fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin#) -## Local file with fragment +## With fragment -For local files URIs with fragment, the fragment checker is invoked and fails to read the content, -but the file checker emits a warning only. The following link hence must succeed as well: +Fragment checking is skipped if the Content-Type header is not "text/html", "text/markdown", or "text/plain" with ".md" URL path ending. +Hence, despite containing fragments which cannot be checked in binary files, the following links are expected to succeed with a HTTP 200 status: [Link to local binary file with fragment](zero.bin#fragment) - -## Remote URL with fragment - -Right now, there is not MIME/content type based exclusion for fragment checks in the website checker. -Also, other than the file checker, the website checker throws an error if reading the response body fails. -The following link hence must fail: - [Link to remote binary file with fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin#fragment) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 31a360b..7bf9f73 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1915,9 +1915,9 @@ mod cli { "https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin#fragment", )) .stdout(contains("42 Total")) - .stdout(contains("29 OK")) + .stdout(contains("30 OK")) // Failures because of missing fragments or failed binary body scan - .stdout(contains("13 Errors")); + .stdout(contains("12 Errors")); } #[test] diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index f4393b2..ef1a2a5 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -1,5 +1,5 @@ use crate::{ - BasicAuthCredentials, ErrorKind, Status, Uri, + BasicAuthCredentials, ErrorKind, FileType, Status, Uri, chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain}, quirks::Quirks, retry::RetryExt, @@ -9,8 +9,8 @@ use crate::{ use async_trait::async_trait; use http::{Method, StatusCode}; use octocrab::Octocrab; -use reqwest::{Request, Response}; -use std::{collections::HashSet, time::Duration}; +use reqwest::{Request, Response, header::CONTENT_TYPE}; +use std::{collections::HashSet, path::Path, time::Duration}; #[derive(Debug, Clone)] pub(crate) struct WebsiteChecker { @@ -108,7 +108,28 @@ impl WebsiteChecker { && method == Method::GET && response.url().fragment().is_some_and(|x| !x.is_empty()) { - self.check_html_fragment(status, response).await + let Some(content_type) = response + .headers() + .get(CONTENT_TYPE) + .and_then(|header| header.to_str().ok()) + else { + return status; + }; + + let file_type = match content_type { + ct if ct.starts_with("text/html") => FileType::Html, + ct if ct.starts_with("text/markdown") => FileType::Markdown, + ct if ct.starts_with("text/plain") => { + let path = Path::new(response.url().path()); + match path.extension() { + Some(ext) if ext.eq_ignore_ascii_case("md") => FileType::Markdown, + _ => return status, + } + } + _ => return status, + }; + + self.check_html_fragment(status, response, file_type).await } else { status } @@ -117,19 +138,18 @@ impl WebsiteChecker { } } - async fn check_html_fragment(&self, status: Status, response: Response) -> Status { + async fn check_html_fragment( + &self, + status: Status, + response: Response, + file_type: FileType, + ) -> Status { let url = response.url().clone(); match response.text().await { - Ok(text) => { + Ok(content) => { match self .fragment_checker - .check( - FragmentInput { - content: text, - file_type: crate::FileType::Html, - }, - &url, - ) + .check(FragmentInput { content, file_type }, &url) .await { Ok(true) => status,