Skip caching unsupported and excluded URLs (#692)

As discussed in https://github.com/lycheeverse/lychee/issues/647#issuecomment-1170773449, it does not make much sense to cache unsupported
and excluded URLs.
Unsupported URLs might be supported in the future and caching them
would mean they won't get checked then. Excluded URLs were
excluded for a reason and should not appear in the cache.
Furthermore they might not be excluded
in a consecutive run, leading to a false-positive.
This commit is contained in:
Matthias 2022-07-17 18:40:45 +02:00 committed by GitHub
parent 9ad53f97a2
commit 6fae93f2da
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 69 additions and 9 deletions

View file

@ -135,10 +135,14 @@ async fn handle(client: &Client, cache: Arc<Cache>, request: Request) -> Respons
// TODO: Handle error as soon as https://github.com/seanmonstar/reqwest/pull/1399 got merged
let response = client.check(request).await.expect("cannot check URI");
// Never cache filesystem access as it is fast already so caching has no
// benefit
if !uri.is_file() {
cache.insert(uri, response.status().into());
// - Never cache filesystem access as it is fast already so caching has no
// benefit.
// - Skip caching unsupported URLs as they might be supported in a
// future run.
// - Skip caching excluded links; they might not be excluded in the next run
let status = response.status();
if !uri.is_file() && !status.is_excluded() && !status.is_unsupported() {
cache.insert(uri, status.into());
}
response
}

View file

@ -14,6 +14,11 @@ mod cli {
type Result<T> = std::result::Result<T, Box<dyn Error>>;
// The lychee cache file name is used for some tests.
// Since it is currently static and can't be overwritten, declare it as a
// constant.
const LYCHEE_CACHE_FILE: &str = ".lycheecache";
macro_rules! mock_server {
($status:expr $(, $func:tt ($($arg:expr),*))*) => {{
let mock_server = wiremock::MockServer::start().await;
@ -609,7 +614,10 @@ mod cli {
#[tokio::test]
async fn test_lycheecache_file() -> Result<()> {
let base_path = fixtures_path().join("cache");
let cache_file = base_path.join(".lycheecache");
let cache_file = base_path.join(LYCHEE_CACHE_FILE);
// Unconditionally remove cache file if it exists
let _ = fs::remove_file(&cache_file);
let mock_server_ok = mock_server!(StatusCode::OK);
let mock_server_err = mock_server!(StatusCode::NOT_FOUND);
@ -649,7 +657,7 @@ mod cli {
mock_server_exclude.uri()
)));
// check content of cache file file
// check content of cache file
let data = fs::read_to_string(&cache_file)?;
assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
assert!(data.contains(&format!("{}/,404", mock_server_err.uri())));
@ -677,6 +685,45 @@ mod cli {
Ok(())
}
#[tokio::test]
async fn test_skip_cache_unsupported() -> Result<()> {
let base_path = fixtures_path().join("cache");
let cache_file = base_path.join(LYCHEE_CACHE_FILE);
// Unconditionally remove cache file if it exists
let _ = fs::remove_file(&cache_file);
let unsupported_url = "slack://user".to_string();
let excluded_url = "https://example.com/";
// run first without cache to generate the cache file
main_command()
.current_dir(&base_path)
.write_stdin(format!("{unsupported_url}\n{excluded_url}"))
.arg("--cache")
.arg("--verbose")
.arg("--exclude")
.arg(excluded_url)
.arg("--")
.arg("-")
.assert()
.stderr(contains(format!(
"[IGNORED] {unsupported_url} | Unsupported Error creating request client\n"
)))
.stderr(contains(format!("[EXCLUDED] {excluded_url} | Excluded\n")));
// The cache file should be empty, because the only checked URL is
// unsupported and we don't want to cache that. It might be supported in
// future versions.
let buf = fs::read(&cache_file).unwrap();
assert!(buf.is_empty());
// clear the cache file
fs::remove_file(&cache_file)?;
Ok(())
}
#[test]
fn test_include_verbatim() -> Result<()> {
let mut cmd = main_command();

View file

@ -449,8 +449,9 @@ impl Client {
return Status::Error(ErrorKind::InvalidURI(uri.clone()));
}
// This is merely a URI with a scheme that is not supported by
// reqwest yet. It would be safe to pass that to reqwest and it wouldn't
// panic, but it's also unnecessary, because it would simply return an error.
// reqwest yet. It would be safe to pass that to reqwest and it
// wouldn't panic, but it's also unnecessary, because it would
// simply return an error.
return Status::Unsupported(ErrorKind::InvalidURI(uri.clone()));
}

View file

@ -16,6 +16,11 @@ pub enum CacheStatus {
/// The request was excluded (skipped)
Excluded,
/// The protocol is not yet supported
// We no longer cache unsupported files as they might be supported in future
// versions.
// Nevertheless, keep for compatibility when deserializing older cache
// files, even though this no longer gets serialized. Can be removed at a
// later point in time.
Unsupported,
}
@ -27,6 +32,9 @@ impl<'de> Deserialize<'de> for CacheStatus {
let status = <&str as Deserialize<'de>>::deserialize(deserializer)?;
match status {
"Excluded" => Ok(CacheStatus::Excluded),
// Keep for compatibility with older cache files, even though this
// no longer gets serialized. Can be removed at a later point in
// time.
"Unsupported" => Ok(CacheStatus::Unsupported),
other => match other.parse::<u16>() {
Ok(code) => match code {

View file

@ -34,7 +34,7 @@ pub enum Status {
/// Resource was excluded from checking
Excluded,
/// The request type is currently not supported,
/// for example when the URL scheme is `slack://` or `file://`
/// for example when the URL scheme is `slack://`.
/// See https://github.com/lycheeverse/lychee/issues/199
Unsupported(ErrorKind),
/// Cached request status from previous run