From 6fae93f2daeccf7edf618982d8451df5f6540e3a Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 17 Jul 2022 18:40:45 +0200 Subject: [PATCH] Skip caching unsupported and excluded URLs (#692) As discussed in https://github.com/lycheeverse/lychee/issues/647#issuecomment-1170773449, it does not make much sense to cache unsupported and excluded URLs. Unsupported URLs might be supported in the future and caching them would mean they won't get checked then. Excluded URLs were excluded for a reason and should not appear in the cache. Furthermore they might not be excluded in a consecutive run, leading to a false-positive. --- lychee-bin/src/commands/check.rs | 12 +++++--- lychee-bin/tests/cli.rs | 51 ++++++++++++++++++++++++++++++-- lychee-lib/src/client.rs | 5 ++-- lychee-lib/src/types/cache.rs | 8 +++++ lychee-lib/src/types/status.rs | 2 +- 5 files changed, 69 insertions(+), 9 deletions(-) diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 4c98789..3e10887 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -135,10 +135,14 @@ async fn handle(client: &Client, cache: Arc, request: Request) -> Respons // TODO: Handle error as soon as https://github.com/seanmonstar/reqwest/pull/1399 got merged let response = client.check(request).await.expect("cannot check URI"); - // Never cache filesystem access as it is fast already so caching has no - // benefit - if !uri.is_file() { - cache.insert(uri, response.status().into()); + // - Never cache filesystem access as it is fast already so caching has no + // benefit. + // - Skip caching unsupported URLs as they might be supported in a + // future run. + // - Skip caching excluded links; they might not be excluded in the next run + let status = response.status(); + if !uri.is_file() && !status.is_excluded() && !status.is_unsupported() { + cache.insert(uri, status.into()); } response } diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index fea516d..17143a3 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -14,6 +14,11 @@ mod cli { type Result = std::result::Result>; + // The lychee cache file name is used for some tests. + // Since it is currently static and can't be overwritten, declare it as a + // constant. + const LYCHEE_CACHE_FILE: &str = ".lycheecache"; + macro_rules! mock_server { ($status:expr $(, $func:tt ($($arg:expr),*))*) => {{ let mock_server = wiremock::MockServer::start().await; @@ -609,7 +614,10 @@ mod cli { #[tokio::test] async fn test_lycheecache_file() -> Result<()> { let base_path = fixtures_path().join("cache"); - let cache_file = base_path.join(".lycheecache"); + let cache_file = base_path.join(LYCHEE_CACHE_FILE); + + // Unconditionally remove cache file if it exists + let _ = fs::remove_file(&cache_file); let mock_server_ok = mock_server!(StatusCode::OK); let mock_server_err = mock_server!(StatusCode::NOT_FOUND); @@ -649,7 +657,7 @@ mod cli { mock_server_exclude.uri() ))); - // check content of cache file file + // check content of cache file let data = fs::read_to_string(&cache_file)?; assert!(data.contains(&format!("{}/,200", mock_server_ok.uri()))); assert!(data.contains(&format!("{}/,404", mock_server_err.uri()))); @@ -677,6 +685,45 @@ mod cli { Ok(()) } + #[tokio::test] + async fn test_skip_cache_unsupported() -> Result<()> { + let base_path = fixtures_path().join("cache"); + let cache_file = base_path.join(LYCHEE_CACHE_FILE); + + // Unconditionally remove cache file if it exists + let _ = fs::remove_file(&cache_file); + + let unsupported_url = "slack://user".to_string(); + let excluded_url = "https://example.com/"; + + // run first without cache to generate the cache file + main_command() + .current_dir(&base_path) + .write_stdin(format!("{unsupported_url}\n{excluded_url}")) + .arg("--cache") + .arg("--verbose") + .arg("--exclude") + .arg(excluded_url) + .arg("--") + .arg("-") + .assert() + .stderr(contains(format!( + "[IGNORED] {unsupported_url} | Unsupported Error creating request client\n" + ))) + .stderr(contains(format!("[EXCLUDED] {excluded_url} | Excluded\n"))); + + // The cache file should be empty, because the only checked URL is + // unsupported and we don't want to cache that. It might be supported in + // future versions. + let buf = fs::read(&cache_file).unwrap(); + assert!(buf.is_empty()); + + // clear the cache file + fs::remove_file(&cache_file)?; + + Ok(()) + } + #[test] fn test_include_verbatim() -> Result<()> { let mut cmd = main_command(); diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 2fedf97..2ce9949 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -449,8 +449,9 @@ impl Client { return Status::Error(ErrorKind::InvalidURI(uri.clone())); } // This is merely a URI with a scheme that is not supported by - // reqwest yet. It would be safe to pass that to reqwest and it wouldn't - // panic, but it's also unnecessary, because it would simply return an error. + // reqwest yet. It would be safe to pass that to reqwest and it + // wouldn't panic, but it's also unnecessary, because it would + // simply return an error. return Status::Unsupported(ErrorKind::InvalidURI(uri.clone())); } diff --git a/lychee-lib/src/types/cache.rs b/lychee-lib/src/types/cache.rs index 75c089f..2b22c27 100644 --- a/lychee-lib/src/types/cache.rs +++ b/lychee-lib/src/types/cache.rs @@ -16,6 +16,11 @@ pub enum CacheStatus { /// The request was excluded (skipped) Excluded, /// The protocol is not yet supported + // We no longer cache unsupported files as they might be supported in future + // versions. + // Nevertheless, keep for compatibility when deserializing older cache + // files, even though this no longer gets serialized. Can be removed at a + // later point in time. Unsupported, } @@ -27,6 +32,9 @@ impl<'de> Deserialize<'de> for CacheStatus { let status = <&str as Deserialize<'de>>::deserialize(deserializer)?; match status { "Excluded" => Ok(CacheStatus::Excluded), + // Keep for compatibility with older cache files, even though this + // no longer gets serialized. Can be removed at a later point in + // time. "Unsupported" => Ok(CacheStatus::Unsupported), other => match other.parse::() { Ok(code) => match code { diff --git a/lychee-lib/src/types/status.rs b/lychee-lib/src/types/status.rs index 2afcfd0..2b88577 100644 --- a/lychee-lib/src/types/status.rs +++ b/lychee-lib/src/types/status.rs @@ -34,7 +34,7 @@ pub enum Status { /// Resource was excluded from checking Excluded, /// The request type is currently not supported, - /// for example when the URL scheme is `slack://` or `file://` + /// for example when the URL scheme is `slack://`. /// See https://github.com/lycheeverse/lychee/issues/199 Unsupported(ErrorKind), /// Cached request status from previous run