diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1aceb88..d4174fd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,4 +34,4 @@ ask for some early feedback by creating an issue yourself and asking for feedbac ## Thanks! -No matter how small, we appreciate very contribution. You're awesome! +No matter how small, we appreciate every contribution. You're awesome! diff --git a/Cargo.lock b/Cargo.lock index b05ceef..3118eb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -622,7 +622,7 @@ version = "4.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fddf67631444a3a3e3e5ac51c36a5e01335302de677bd78759eaa90ab1f46644" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro-error", "proc-macro2", "quote", @@ -1110,7 +1110,7 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21cdad81446a7f7dc43f6a77409efeb9733d2fa65553efef6018ef257c959b73" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro2", "quote", "syn 1.0.107", @@ -1122,7 +1122,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c9720bba047d567ffc8a3cba48bf19126600e249ab7f128e9233e6376976a116" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro2", "quote", "syn 1.0.107", @@ -1518,6 +1518,15 @@ dependencies = [ "http", ] +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "heck" version = "0.4.0" @@ -1989,6 +1998,7 @@ dependencies = [ "secrecy", "serde", "serde_json", + "strum", "supports-color", "tabled", "tempfile", @@ -2924,6 +2934,12 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "rustversion" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" + 
[[package]] name = "ryu" version = "1.0.12" @@ -3172,7 +3188,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5e79cdebbabaebb06a9bdbaedc7f159b410461f63611d4d0e3fb0fab8fed850" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro2", "quote", "syn 1.0.107", @@ -3235,6 +3251,28 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strum" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" +dependencies = [ + "heck 0.3.3", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "supports-color" version = "2.0.0" @@ -3284,7 +3322,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "beca1b4eaceb4f2755df858b88d9b9315b7ccfd1ffd0d7a48a52602301f01a57" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro-error", "proc-macro2", "quote", @@ -3724,6 +3762,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + [[package]] name = "unicode-width" version = "0.1.10" diff --git a/README.md b/README.md index 66042eb..1833e75 100644 --- a/README.md +++ b/README.md @@ -250,6 +250,14 @@ Options: --dump Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked + --archive + Specify the use of a specific web archive. 
Can be used in combination with `--suggest` + + [possible values: wayback] + + --suggest + Suggest link replacements for broken links, using a web archive. The web archive can be specified with `--archive` + -m, --max-redirects <MAX_REDIRECTS> Maximum number of allowed redirects diff --git a/lychee-bin/Cargo.toml b/lychee-bin/Cargo.toml index adcdc25..f827508 100644 --- a/lychee-bin/Cargo.toml +++ b/lychee-bin/Cargo.toml @@ -49,6 +49,7 @@ secrecy = { version = "0.8.0", features = ["serde"] } supports-color = "2.0.0" log = "0.4.17" env_logger = "0.10.0" +strum = { version = "0.23.0", features = ["derive"] } [dependencies.clap] version = "4.1.11" diff --git a/lychee-bin/src/archive/mod.rs b/lychee-bin/src/archive/mod.rs new file mode 100644 index 0000000..7246f8a --- /dev/null +++ b/lychee-bin/src/archive/mod.rs @@ -0,0 +1,44 @@ +use reqwest::{Error, Url}; +use serde::{Deserialize, Serialize}; +use std::fmt::Display; +use strum::{Display, EnumIter, EnumString, EnumVariantNames}; + +use crate::color::{color, GREEN, PINK}; + +mod wayback; + +/// A proposed replacement for a broken link: the `original` URL and its archived `suggestion`. +#[derive(Debug, Serialize, Eq, Hash, PartialEq)] +pub(crate) struct Suggestion { + pub(crate) original: Url, + pub(crate) suggestion: Url, +} + +impl Display for Suggestion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + color!(f, PINK, "{}", self.original)?; + write!(f, " ")?; + color!(f, GREEN, "{}", self.suggestion)?; + Ok(()) + } +} + +/// Web archives that can be queried for snapshots of broken links. +#[non_exhaustive] +#[derive(Debug, Deserialize, Default, Clone, Display, EnumIter, EnumString, EnumVariantNames)] +pub(crate) enum Archive { + #[serde(rename = "wayback")] + #[strum(serialize = "wayback", ascii_case_insensitive)] + #[default] + WaybackMachine, +} + +impl Archive { + /// Looks up `original` in this archive and returns the archived URL, if any. + /// Each arm awaits its backend directly, so backends need not share one future type. + pub(crate) async fn get_link(&self, original: &Url) -> Result<Option<Url>, Error> { + match self { + Archive::WaybackMachine => wayback::get_wayback_link(original).await, + } + } +} diff --git a/lychee-bin/src/archive/wayback/mod.rs b/lychee-bin/src/archive/wayback/mod.rs new file mode 
100644 index 0000000..059211d --- /dev/null +++ b/lychee-bin/src/archive/wayback/mod.rs @@ -0,0 +1,84 @@ +use once_cell::sync::Lazy; +use serde::{Deserialize, Deserializer}; + +use http::StatusCode; +use reqwest::{Error, Url}; +static WAYBACK_URL: Lazy<Url> = + Lazy::new(|| Url::parse("https://archive.org/wayback/available").unwrap()); + +/// Looks up the closest archived snapshot of `url` through the Wayback Machine availability API. +/// Returns `Ok(None)` when no snapshot exists; request and JSON decoding failures are returned as errors. +pub(crate) async fn get_wayback_link(url: &Url) -> Result<Option<Url>, Error> { + let mut archive_url: Url = WAYBACK_URL.clone(); + // Percent-encode the target URL so its own `&` or `#` characters cannot split or truncate the query. + archive_url.query_pairs_mut().append_pair("url", url.as_str()); + + let response = reqwest::get(archive_url) + .await? + .json::<InternetArchiveResponse>() + .await?; + + Ok(response + .archived_snapshots + .closest + .map(|closest| closest.url)) +} + +#[derive(Debug, Deserialize, Eq, PartialEq)] +pub(crate) struct InternetArchiveResponse { + pub(crate) url: Url, + pub(crate) archived_snapshots: ArchivedSnapshots, +} + +#[derive(Debug, Deserialize, Eq, PartialEq)] +pub(crate) struct ArchivedSnapshots { + pub(crate) closest: Option<Closest>, +} + +#[derive(Debug, Deserialize, Eq, PartialEq)] +pub(crate) struct Closest { + #[serde(deserialize_with = "from_string")] + pub(crate) status: StatusCode, + pub(crate) available: bool, + pub(crate) url: Url, + pub(crate) timestamp: String, +} + +/// Deserializes an HTTP status code that is encoded as a string. +/// Unparsable or invalid codes are reported as deserialization errors rather than panicking. +fn from_string<'d, D>(deserializer: D) -> Result<StatusCode, D::Error> +where + D: Deserializer<'d>, +{ + let value: &str = Deserialize::deserialize(deserializer)?; + let code = value.parse::<u16>().map_err(<D::Error as serde::de::Error>::custom)?; + StatusCode::from_u16(code).map_err(<D::Error as serde::de::Error>::custom) +} + +#[cfg(test)] +mod tests { + use crate::archive::wayback::get_wayback_link; + use reqwest::Error; + + #[tokio::test] + async fn wayback_suggestion() -> Result<(), Error> { + let url = &"https://example.com".try_into().unwrap(); + let response = get_wayback_link(url).await?; + let suggestion = response.unwrap(); + + assert!(suggestion.as_str().contains("web.archive.org")); + + Ok(()) + } + + #[tokio::test] + async fn wayback_suggestion_unknown_url() -> Result<(), Error> { + let url = &"https://github.com/mre/idiomatic-rust-doesnt-exist-man" + .try_into() 
+ .unwrap(); + + let response = get_wayback_link(url).await?; + assert_eq!(response, None); + Ok(()) + } +} diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index ce97a62..915a97a 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -5,16 +5,19 @@ use std::time::Duration; use indicatif::ProgressBar; use indicatif::ProgressStyle; -use lychee_lib::Result; -use lychee_lib::Status; +use reqwest::Url; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; use tokio_stream::StreamExt; +use lychee_lib::Status; +use lychee_lib::{Client, Request, Response}; +use lychee_lib::{InputSource, Result}; + +use crate::archive::{Archive, Suggestion}; use crate::formatters::response::ResponseFormatter; use crate::verbosity::Verbosity; use crate::{cache::Cache, stats::ResponseStats, ExitCode}; -use lychee_lib::{Client, Request, Response}; use super::CommandParams; @@ -42,7 +45,7 @@ where let pb = if params.cfg.no_progress { None } else { - Some(init_progress_bar()) + Some(init_progress_bar("Extracting links")) }; // Start receiving requests @@ -68,12 +71,21 @@ where // Wait until all responses are received let result = show_results_task.await?; - let (pb, stats) = result?; + let (pb, mut stats) = result?; // Note that print statements may interfere with the progress bar, so this // must go before printing the stats if let Some(pb) = &pb { - pb.finish_and_clear(); + pb.finish_with_message("Finished extracting links"); + } + + if params.cfg.suggest { + suggest_archived_links( + params.cfg.archive.unwrap_or_default(), + &mut stats, + !params.cfg.no_progress, + ) + .await; } let code = if stats.is_success() { @@ -84,6 +96,48 @@ where Ok((stats, cache_ref, code)) } +async fn suggest_archived_links(archive: Archive, stats: &mut ResponseStats, show_progress: bool) { + let failed_urls = &stats + .fail_map + .iter() + .flat_map(|(source, set)| set.iter().map(|entry| (source, entry)).collect::>()) + .filter(|(_, 
response)| { + let uri = &response.uri; + !(uri.is_data() || uri.is_mail() || uri.is_file()) + }) + .map(|(source, response)| (source, response.uri.as_str().try_into().unwrap())) + .collect::>(); + + let bar = if show_progress { + let bar = init_progress_bar("Searching for alternatives"); + bar.set_length(failed_urls.len() as u64); + Some(bar) + } else { + None + }; + + for (input, url) in failed_urls { + if let Ok(Some(suggestion)) = archive.get_link(url).await { + stats + .suggestion_map + .entry((*input).clone()) + .or_default() + .insert(Suggestion { + suggestion, + original: url.clone(), + }); + } + + if let Some(bar) = &bar { + bar.inc(1); + } + } + + if let Some(bar) = &bar { + bar.finish_with_message("Finished searching for alternatives"); + } +} + // drops the `send_req` channel on exit // required for the receiver task to end, which closes send_resp, which allows // the show_results_task to finish @@ -125,7 +179,7 @@ async fn progress_bar_task( Ok((pb, stats)) } -fn init_progress_bar() -> ProgressBar { +fn init_progress_bar(initial_message: &'static str) -> ProgressBar { let bar = ProgressBar::new_spinner().with_style( ProgressStyle::with_template( "{spinner:.197.bright} {pos}/{len:.dim} ETA {eta} {bar:.dim} {wide_msg}", @@ -133,7 +187,7 @@ fn init_progress_bar() -> ProgressBar { .expect("Valid progress bar"), ); bar.set_length(0); - bar.set_message("Extracting links"); + bar.set_message(initial_message); // report status _at least_ every 500ms bar.enable_steady_tick(Duration::from_millis(500)); bar @@ -232,6 +286,7 @@ fn show_progress( #[cfg(test)] mod tests { use log::info; + use lychee_lib::{CacheStatus, InputSource, ResponseBody, Uri}; use crate::formatters; diff --git a/lychee-bin/src/formatters/stats/compact.rs b/lychee-bin/src/formatters/stats/compact.rs index af50ac2..48cbbca 100644 --- a/lychee-bin/src/formatters/stats/compact.rs +++ b/lychee-bin/src/formatters/stats/compact.rs @@ -59,6 +59,14 @@ impl Display for CompactResponseStats { for 
response in responses { writeln!(f, "{}", color_response(response))?; } + + if let Some(suggestions) = &stats.suggestion_map.get(source) { + writeln!(f, "\n\u{2139} Suggestions")?; + for suggestion in *suggestions { + writeln!(f, "{suggestion}")?; + } + } + writeln!(f)?; } diff --git a/lychee-bin/src/formatters/stats/detailed.rs b/lychee-bin/src/formatters/stats/detailed.rs index 1077b80..3cd628a 100644 --- a/lychee-bin/src/formatters/stats/detailed.rs +++ b/lychee-bin/src/formatters/stats/detailed.rs @@ -45,6 +45,14 @@ impl Display for DetailedResponseStats { write!(f, "\n\nErrors in {source}")?; for response in responses { write!(f, "\n{}", color_response(response))?; } + + // Suggestions belong to the input source, so list them once after all of its errors + if let Some(suggestions) = &stats.suggestion_map.get(source) { + writeln!(f, "\nSuggestions in {source}")?; + for suggestion in *suggestions { + writeln!(f, "{suggestion}")?; + } + } } diff --git a/lychee-bin/src/formatters/stats/markdown.rs b/lychee-bin/src/formatters/stats/markdown.rs index 2de6321..f83ce14 100644 --- a/lychee-bin/src/formatters/stats/markdown.rs +++ b/lychee-bin/src/formatters/stats/markdown.rs @@ -1,9 +1,12 @@ -use std::fmt::{self, Display}; +use std::{ + collections::{HashMap, HashSet}, + fmt::{self, Display}, +}; use super::StatsFormatter; use anyhow::Result; use http::StatusCode; -use lychee_lib::{ResponseBody, Status}; +use lychee_lib::{InputSource, ResponseBody, Status}; use std::fmt::Write; use tabled::{object::Segment, Alignment, Modify, Table, Tabled}; @@ -94,28 +97,44 @@ impl Display for MarkdownResponseStats { writeln!(f)?; writeln!(f, "{}", stats_table(&self.0))?; - if !&stats.fail_map.is_empty() { - writeln!(f)?; - writeln!(f, "## Errors per input\n")?; - for (source, responses) in &stats.fail_map { - // Using leading newlines over trailing ones (e.g. `writeln!`) - // lets us avoid extra newlines without any additional logic. 
- writeln!(f, "### Errors in {source}\n")?; - for response in responses { - writeln!( - f, - "{}", - markdown_response(response).map_err(|_e| fmt::Error)? - )?; - } - writeln!(f)?; - } - } + write_stats_per_input(f, "Errors", &stats.fail_map, |response| { + markdown_response(response).map_err(|_e| fmt::Error) + })?; + + write_stats_per_input(f, "Suggestions", &stats.suggestion_map, |suggestion| { + Ok(format!( + "* {} --> {}", + suggestion.original, suggestion.suggestion + )) + })?; Ok(()) } } +fn write_stats_per_input( + f: &mut fmt::Formatter<'_>, + name: &'static str, + map: &HashMap>, + write_stat: F, +) -> fmt::Result +where + T: Display, + F: Fn(&T) -> Result, +{ + if !&map.is_empty() { + writeln!(f, "\n## {name} per input")?; + for (source, responses) in map { + writeln!(f, "\n### {name} in {source}\n")?; + for response in responses { + writeln!(f, "{}", write_stat(response)?)?; + } + } + } + + Ok(()) +} + pub(crate) struct Markdown; impl Markdown { @@ -136,6 +155,9 @@ mod tests { use http::StatusCode; use lychee_lib::{CacheStatus, InputSource, Response, ResponseBody, Status, Uri}; + use reqwest::Url; + + use crate::archive::Suggestion; use super::*; @@ -205,6 +227,14 @@ mod tests { }, ); stats.add(response); + stats + .suggestion_map + .entry((InputSource::Stdin).clone()) + .or_default() + .insert(Suggestion { + suggestion: Url::parse("https://example.com/suggestion").unwrap(), + original: Url::parse("https://example.com/original").unwrap(), + }); let summary = MarkdownResponseStats(stats); let expected = r#"## Summary @@ -224,6 +254,11 @@ mod tests { * [404] [http://127.0.0.1/](http://127.0.0.1/) | Cached: Error (cached) +## Suggestions per input + +### Suggestions in stdin + +* https://example.com/original --> https://example.com/suggestion "#; assert_eq!(summary.to_string(), expected.to_string()); } diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index cdf67c6..45a050c 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -76,6 
+76,7 @@ use ring as _; // required for apple silicon use lychee_lib::Collector; +mod archive; mod cache; mod client; mod color; @@ -109,7 +110,7 @@ enum ExitCode { } /// Ignore lines starting with this marker in `.lycheeignore` files -const LYCHEEINGORE_COMMENT_MARKER: &str = "#"; +const LYCHEEIGNORE_COMMENT_MARKER: &str = "#"; fn main() -> Result<()> { #[cfg(feature = "tokio-console")] @@ -128,7 +129,7 @@ fn read_lines(file: &File) -> Result> { Ok(lines .into_iter() .filter(|line| { - !line.is_empty() && !line.trim_start().starts_with(LYCHEEINGORE_COMMENT_MARKER) + !line.is_empty() && !line.trim_start().starts_with(LYCHEEIGNORE_COMMENT_MARKER) }) .collect()) } diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 64f12d3..fb4377b 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -1,7 +1,8 @@ +use crate::archive::Archive; use crate::parse::{parse_base, parse_statuscodes}; use crate::verbosity::Verbosity; use anyhow::{anyhow, Context, Error, Result}; -use clap::{arg, Parser}; +use clap::{arg, builder::TypedValueParser, Parser}; use const_format::{concatcp, formatcp}; use lychee_lib::{ Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, @@ -11,6 +12,7 @@ use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; use std::path::Path; use std::{collections::HashSet, fs, path::PathBuf, str::FromStr, time::Duration}; +use strum::VariantNames; pub(crate) const LYCHEE_IGNORE_FILE: &str = ".lycheeignore"; pub(crate) const LYCHEE_CACHE_FILE: &str = ".lycheecache"; @@ -179,6 +181,18 @@ pub(crate) struct Config { #[serde(default)] pub(crate) dump: bool, + /// Specify the use of a specific web archive. + /// Can be used in combination with `--suggest` + #[arg(long, value_parser = clap::builder::PossibleValuesParser::new(Archive::VARIANTS).map(|s| s.parse::().unwrap()))] + #[serde(default)] + pub(crate) archive: Option, + + /// Suggest link replacements for broken links, using a web archive. 
+ /// The web archive can be specified with `--archive` + #[arg(long)] + #[serde(default)] + pub(crate) suggest: bool, + /// Maximum number of allowed redirects #[arg(short, long, default_value = &MAX_REDIRECTS_STR)] #[serde(default = "max_redirects")] diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index 2a91b52..ccf856b 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -1,5 +1,6 @@ use std::collections::{HashMap, HashSet}; +use crate::archive::Suggestion; use lychee_lib::{CacheStatus, InputSource, Response, ResponseBody, Status}; use serde::Serialize; @@ -17,6 +18,7 @@ pub(crate) struct ResponseStats { pub(crate) cached: usize, pub(crate) success_map: HashMap>, pub(crate) fail_map: HashMap>, + pub(crate) suggestion_map: HashMap>, pub(crate) excluded_map: HashMap>, } diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 4d55ab0..59a9e72 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -78,6 +78,7 @@ mod cli { cached: usize, success_map: HashMap>, fail_map: HashMap>, + suggestion_map: HashMap>, excluded_map: HashMap>, } @@ -97,6 +98,7 @@ mod cli { "cached": {}, "success_map": {:?}, "fail_map": {:?}, + "suggestion_map": {:?}, "excluded_map": {:?} }}"#, self.detailed_stats, @@ -110,6 +112,7 @@ mod cli { self.errors, self.cached, self.success_map, self.fail_map, + self.suggestion_map, self.excluded_map ) @@ -508,7 +511,7 @@ mod cli { .assert() .success(); - let expected = r#"{"detailed_stats":false,"total":11,"successful":11,"unknown":0,"unsupported":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"cached":0,"success_map":{},"fail_map":{},"excluded_map":{}}"#; + let expected = r#"{"detailed_stats":false,"total":11,"successful":11,"unknown":0,"unsupported":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"cached":0,"success_map":{},"fail_map":{},"suggestion_map":{},"excluded_map":{}}"#; let output = fs::read_to_string(&outfile)?; assert_eq!(output.split_whitespace().collect::(), expected); 
fs::remove_file(outfile)?;