From b97025624863561552c5b3e0557319682c321c00 Mon Sep 17 00:00:00 2001 From: MichaIng Date: Fri, 20 Jun 2025 17:47:35 +0200 Subject: [PATCH] fix: skip fragment check if website URL doesn't contain fragment (#1733) * fix: skip fragment check if website URL doesn't contain fragment Signed-off-by: MichaIng * test: add tests for fragment checks with binary data Signed-off-by: MichaIng * fix: skip fragment checking as well if fragment is empty `is_some()` is true as well if the fragment is given but empty, i.e. `#`. While it is an edge case, skip the fragment checker as well in case of an empty fragment. Signed-off-by: MichaIng * test: switch to lycheeverse/master remote URLs Signed-off-by: MichaIng * fix: apply rustfmt annotation Signed-off-by: MichaIng --------- Signed-off-by: MichaIng --- fixtures/fragments/file1.md | 30 ++++++++++++++++++++++++++++++ fixtures/fragments/zero.bin | 1 + lychee-bin/tests/cli.rs | 20 ++++++++++++++++---- lychee-lib/src/checker/file.rs | 5 ++++- lychee-lib/src/checker/website.rs | 6 +++++- 5 files changed, 56 insertions(+), 6 deletions(-) create mode 100644 fixtures/fragments/zero.bin diff --git a/fixtures/fragments/file1.md b/fixtures/fragments/file1.md index d4ec6f5..18d11a1 100644 --- a/fixtures/fragments/file1.md +++ b/fixtures/fragments/file1.md @@ -67,3 +67,33 @@ without related HTML element. Browser will scroll to the top of the page. A link to the non-existing fragment: [try](https://github.com/lycheeverse/lychee#non-existent-anchor). Skip the fragment check for directories like: [empty](empty_dir/). + +# Binary data URLs checks + +Fragment checking tries to scan the (whole) content/response body for HTML element IDs. +This fails for binary data and can cause unnecessary traffic for remote URLs. + +## Without fragment + +Fragment checking is skipped if the URL does not actually contain a fragment. +Even with fragment checking enabled, the following links must hence succeed: + +[Link to local binary file without fragment](zero.bin) +[Link to local binary file with empty fragment](zero.bin#) +[Link to remote binary file without fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin) +[Link to remote binary file with empty fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin#) + +## Local file with fragment + +For local files URIs with fragment, the fragment checker is invoked and fails to read the content, +but the file checker emits a warning only. The following link hence must succeed as well: + +[Link to local binary file with fragment](zero.bin#fragment) + +## Remote URL with fragment + +Right now, there is not MIME/content type based exclusion for fragment checks in the website checker. +Also, other than the file checker, the website checker throws an error if reading the response body fails. +The following link hence must fail: + +[Link to remote binary file with fragment](https://raw.githubusercontent.com/lycheeverse/lychee/master/fixtures/fragments/zero.bin#fragment) diff --git a/fixtures/fragments/zero.bin b/fixtures/fragments/zero.bin new file mode 100644 index 0000000..abfba4e --- /dev/null +++ b/fixtures/fragments/zero.bin @@ -0,0 +1 @@ +âÔ \ No newline at end of file diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 1119a60..0888dfe 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1876,10 +1876,22 @@ mod cli { .stderr(contains( "https://github.com/lycheeverse/lychee#non-existent-anchor", )) - .stdout(contains("28 Total")) - .stdout(contains("23 OK")) - // 4 failures because of missing fragments - .stdout(contains("5 Errors")); + .stderr(contains("fixtures/fragments/zero.bin")) + .stderr(contains("fixtures/fragments/zero.bin#")) + .stderr(contains( + "https://raw.githubusercontent.com/MichaIng/lychee/skip-fragment-check-by-url/fixtures/fragments/zero.bin", + )) + .stderr(contains( + "https://raw.githubusercontent.com/MichaIng/lychee/skip-fragment-check-by-url/fixtures/fragments/zero.bin#", + )) + .stderr(contains("fixtures/fragments/zero.bin#fragment")) + .stderr(contains( + "https://raw.githubusercontent.com/MichaIng/lychee/skip-fragment-check-by-url/fixtures/fragments/zero.bin#fragment", + )) + .stdout(contains("34 Total")) + .stdout(contains("28 OK")) + // Failures because of missing fragments or failed binary body scan + .stdout(contains("6 Errors")); } #[test] diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 0e3c7d1..9ea05a0 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -125,7 +125,10 @@ impl FileChecker { async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status { // Only files can contain content with fragments. // Skip if the uri doesn't have the fragment. - if self.include_fragments && path.is_file() && uri.url.fragment().is_some() { + if self.include_fragments + && path.is_file() + && uri.url.fragment().is_some_and(|x| !x.is_empty()) + { self.check_fragment(path, uri).await } else { Status::Ok(StatusCode::OK) diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index de118be..f4393b2 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -103,7 +103,11 @@ impl WebsiteChecker { match self.reqwest_client.execute(request).await { Ok(response) => { let status = Status::new(&response, &self.accepted); - if self.include_fragments && status.is_success() && method == Method::GET { + if self.include_fragments + && status.is_success() + && method == Method::GET + && response.url().fragment().is_some_and(|x| !x.is_empty()) + { self.check_html_fragment(status, response).await } else { status