Allow overriding accepted status codes for cached URIs (#843)

Fixes #840
2026-05-08 13:54:44 +00:00 · 2022-11-28 12:23:07 +01:00 · 2022-11-28 12:23:07 +01:00 · b479a5810e
commit b479a5810e
parent d4e8bb01c1
8 changed files with 139 additions and 11 deletions
--- a/4
+++ b/4
@ -38,6 +38,10 @@ docs: ## Generate and show documentation
 lint: ## Run linter
 	cargo clippy --all-targets --all-features -- -D warnings

+.PHONY: test
+test: ## Run tests
+	cargo test --all-targets --all-features
+
 .PHONY: screencast
 screencast: ## Create a screencast for the docs
 	svg-term --command 'assets/screencast.sh' --out 'assets/screencast.svg' --width 100 --padding 10 --window
--- a/lychee-bin/src/commands/check.rs
+++ b/lychee-bin/src/commands/check.rs
@ -1,3 +1,4 @@
+use std::collections::HashSet;
 use std::io::{self, Write};
 use std::sync::Arc;
 use std::time::Duration;
@ -30,6 +31,7 @@ where

    let client = params.client;
    let cache = params.cache;
+    let accept = params.cfg.accept;
    // Start receiving requests
    tokio::spawn(async move {
        futures::StreamExt::for_each_concurrent(
@ -37,7 +39,7 @@ where
            max_concurrency,
            |request: Result<Request>| async {
                let request = request.expect("cannot read request");
-                let response = handle(&client, cache.clone(), request).await;
+                let response = handle(&client, cache.clone(), request, accept.clone()).await;

                send_resp
                    .send(response)
@ -112,7 +114,12 @@ where
 }

 /// Handle a single request
-async fn handle(client: &Client, cache: Arc<Cache>, request: Request) -> Response {
+async fn handle(
+    client: &Client,
+    cache: Arc<Cache>,
+    request: Request,
+    accept: Option<HashSet<u16>>,
+) -> Response {
    let uri = request.uri.clone();
    if let Some(v) = cache.get(&uri) {
        // Found a cached request
@ -121,7 +128,11 @@ async fn handle(client: &Client, cache: Arc<Cache>, request: Request) -> Respons
        let status = if client.is_excluded(&uri) {
            Status::Excluded
        } else {
-            Status::from(v.value().status)
+            // Can't impl `Status::from(v.value().status)` here because the
+            // `accepted` status codes might have changed from the previous run
+            // and they may have an impact on the interpretation of the status
+            // code.
+            Status::from_cache_status(v.value().status, accept)
        };
        return Response::new(uri.clone(), status, request.source);
    }
--- a/lychee-bin/src/main.rs
+++ b/lychee-bin/src/main.rs
@ -178,7 +178,7 @@ fn load_cache(cfg: &Config) -> Option<Cache> {
            let elapsed = modified.elapsed().ok()?;
            if elapsed > cfg.max_cache_age {
                eprintln!(
-                    "Cache is too old (age: {}, max age: {}). Discarding",
+                    "Cache is too old (age: {}, max age: {}). Discarding and recreating",
                    humantime::format_duration(elapsed),
                    humantime::format_duration(cfg.max_cache_age)
                );
--- a/lychee-bin/src/parse.rs
+++ b/lychee-bin/src/parse.rs
@ -53,6 +53,11 @@ pub(crate) fn parse_base(src: &str) -> Result<Base, lychee_lib::ErrorKind> {
 }

 /// Parse HTTP status codes into a set of `StatusCode`
+///
+/// Note that this function does not convert the status codes into
+/// `StatusCode` but rather into `u16` to avoid the need for
+/// `http` as a dependency and to support custom status codes, which are
+/// necessary for some websites, which don't adhere to the HTTP spec or IANA.
 pub(crate) fn parse_statuscodes(accept: &str) -> Result<HashSet<u16>> {
    let mut statuscodes = HashSet::new();
    for code in accept.split(',') {
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@ -771,6 +771,81 @@ mod cli {
        Ok(())
    }

+    #[tokio::test]
+    async fn test_lycheecache_accept_custom_status_codes() -> Result<()> {
+        let base_path = fixtures_path().join("cache");
+        let cache_file = base_path.join(LYCHEE_CACHE_FILE);
+
+        // Unconditionally remove cache file if it exists
+        let _ = fs::remove_file(&cache_file);
+
+        let mock_server_ok = mock_server!(StatusCode::OK);
+        let mock_server_teapot = mock_server!(StatusCode::IM_A_TEAPOT);
+        let mock_server_server_error = mock_server!(StatusCode::INTERNAL_SERVER_ERROR);
+
+        let dir = tempfile::tempdir()?;
+        let mut file = File::create(dir.path().join("c.md"))?;
+
+        writeln!(file, "{}", mock_server_ok.uri().as_str())?;
+        writeln!(file, "{}", mock_server_teapot.uri().as_str())?;
+        writeln!(file, "{}", mock_server_server_error.uri().as_str())?;
+
+        let mut cmd = main_command();
+        let test_cmd = cmd
+            .current_dir(&base_path)
+            .arg(dir.path().join("c.md"))
+            .arg("--verbose")
+            .arg("--cache");
+
+        assert!(
+            !cache_file.exists(),
+            "cache file should not exist before this test"
+        );
+
+        // run first without cache to generate the cache file
+        // ignore exit code
+        test_cmd
+            .assert()
+            .failure()
+            .code(2)
+            .stdout(contains(format!(
+                "[418] {}/ | Failed: Network error: I'm a teapot\n",
+                mock_server_teapot.uri()
+            )))
+            .stdout(contains(format!(
+                "[500] {}/ | Failed: Network error: Internal Server Error\n",
+                mock_server_server_error.uri()
+            )));
+
+        // check content of cache file
+        let data = fs::read_to_string(&cache_file)?;
+        assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
+        assert!(data.contains(&format!("{}/,418", mock_server_teapot.uri())));
+        assert!(data.contains(&format!("{}/,500", mock_server_server_error.uri())));
+
+        // run again to verify cache behavior
+        // this time accept 418 and 500 as valid status codes
+        test_cmd
+            .arg("--no-progress")
+            .arg("--accept")
+            .arg("200,418,500")
+            .assert()
+            .success()
+            .stdout(contains(format!(
+                "[418] {}/ | Cached: OK (cached)\n",
+                mock_server_teapot.uri()
+            )))
+            .stdout(contains(format!(
+                "[500] {}/ | Cached: OK (cached)\n",
+                mock_server_server_error.uri()
+            )));
+
+        // clear the cache file
+        fs::remove_file(&cache_file)?;
+
+        Ok(())
+    }
+
    #[test]
    fn test_include_verbatim() -> Result<()> {
        let mut cmd = main_command();
--- a/lychee-lib/src/types/cache.rs
+++ b/lychee-lib/src/types/cache.rs
@ -39,7 +39,9 @@ impl<'de> Deserialize<'de> for CacheStatus {
            other => match other.parse::<u16>() {
                Ok(code) => match code {
                    // classify successful status codes as cache status success
-                    // TODO does not account for status code overrides passed through the 'accept' flag
+                    // Does not account for status code overrides passed through
+                    // the 'accept' flag. Instead, this is handled at a higher level
+                    // when the cache status is converted to a status.
                    200..=299 => Ok(CacheStatus::Ok(code)),
                    // classify redirects, client errors, & server errors as cache status error
                    _ => Ok(CacheStatus::Error(Some(code))),
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@ -93,6 +93,9 @@ pub enum ErrorKind {
    /// Cannot parse the given URI
    #[error("The given URI is invalid: {0}")]
    InvalidURI(Uri),
+    /// The given status code is invalid (not in the range 100-1000)
+    #[error("Invalid status code: {0}")]
+    InvalidStatusCode(u16),
    /// Regex error
    #[error("Error when using regex engine: {0}")]
    Regex(#[from] regex::Error),
@ -195,6 +198,7 @@ impl Hash for ErrorKind {
            Self::InvalidUriRemap(remap) => (remap).hash(state),
            Self::InvalidHeader(e) => e.to_string().hash(state),
            Self::InvalidGlobPattern(e) => e.to_string().hash(state),
+            Self::InvalidStatusCode(c) => c.hash(state),
            Self::Channel(e) => e.to_string().hash(state),
            Self::MissingGitHubToken | Self::InvalidUrlHost => {
                std::mem::discriminant(self).hash(state);
--- a/lychee-lib/src/types/status.rs
+++ b/lychee-lib/src/types/status.rs
@ -84,6 +84,39 @@ impl Status {
        }
    }

+    /// Create a status object from a cached status (from a previous run of
+    /// lychee) and the set of accepted status codes.
+    ///
+    /// The set of accepted status codes can change between runs,
+    /// necessitating more complex logic than just using the cached status.
+    ///
+    /// Note that the accepted status codes are not of type `StatusCode`,
+    /// because they are provided by the user and can be invalid according to
+    /// the HTTP spec and IANA, but the user might still want to accept them.
+    #[must_use]
+    pub fn from_cache_status(s: CacheStatus, accepted: Option<HashSet<u16>>) -> Self {
+        match s {
+            CacheStatus::Ok(code) => {
+                // Not sure if we should change the status based on the
+                // accepted status codes. If we find out that this is
+                // counter-intuitive, we can change it.
+                if accepted.map(|a| a.contains(&code)) == Some(true) {
+                    return Self::Cached(CacheStatus::Ok(code));
+                };
+                Self::Cached(CacheStatus::Error(Some(code)))
+            }
+            CacheStatus::Error(code) => {
+                if let Some(code) = code {
+                    if accepted.map(|a| a.contains(&code)) == Some(true) {
+                        return Self::Cached(CacheStatus::Ok(code));
+                    };
+                }
+                Self::Cached(CacheStatus::Error(code))
+            }
+            _ => Self::Cached(s),
+        }
+    }
+
    /// Return more details about the status (if any)
    ///
    /// Which additional information we can extract depends on the underlying
@ -219,9 +252,3 @@ impl From<reqwest::Error> for Status {
        }
    }
 }
-
-impl From<CacheStatus> for Status {
-    fn from(s: CacheStatus) -> Self {
-        Self::Cached(s)
-    }
-}