Wayback integration (#1003)

Adds support for suggesting archived URLs for broken links.
Uses Wayback Machine as the archive provider.
This commit is contained in:
Thomas 2023-03-28 00:45:06 +02:00 committed by GitHub
parent f02576810b
commit 994b2852cd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 336 additions and 37 deletions

View file

@ -34,4 +34,4 @@ ask for some early feedback by creating an issue yourself and asking for feedbac
## Thanks!
No matter how small, we appreciate very contribution. You're awesome!
No matter how small, we appreciate every contribution. You're awesome!

54
Cargo.lock generated
View file

@ -622,7 +622,7 @@ version = "4.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fddf67631444a3a3e3e5ac51c36a5e01335302de677bd78759eaa90ab1f46644"
dependencies = [
"heck",
"heck 0.4.0",
"proc-macro-error",
"proc-macro2",
"quote",
@ -1110,7 +1110,7 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21cdad81446a7f7dc43f6a77409efeb9733d2fa65553efef6018ef257c959b73"
dependencies = [
"heck",
"heck 0.4.0",
"proc-macro2",
"quote",
"syn 1.0.107",
@ -1122,7 +1122,7 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9720bba047d567ffc8a3cba48bf19126600e249ab7f128e9233e6376976a116"
dependencies = [
"heck",
"heck 0.4.0",
"proc-macro2",
"quote",
"syn 1.0.107",
@ -1518,6 +1518,15 @@ dependencies = [
"http",
]
[[package]]
name = "heck"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "heck"
version = "0.4.0"
@ -1989,6 +1998,7 @@ dependencies = [
"secrecy",
"serde",
"serde_json",
"strum",
"supports-color",
"tabled",
"tempfile",
@ -2924,6 +2934,12 @@ dependencies = [
"windows-sys 0.42.0",
]
[[package]]
name = "rustversion"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06"
[[package]]
name = "ryu"
version = "1.0.12"
@ -3172,7 +3188,7 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5e79cdebbabaebb06a9bdbaedc7f159b410461f63611d4d0e3fb0fab8fed850"
dependencies = [
"heck",
"heck 0.4.0",
"proc-macro2",
"quote",
"syn 1.0.107",
@ -3235,6 +3251,28 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "strum"
version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb"
dependencies = [
"strum_macros",
]
[[package]]
name = "strum_macros"
version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
dependencies = [
"heck 0.3.3",
"proc-macro2",
"quote",
"rustversion",
"syn",
]
[[package]]
name = "supports-color"
version = "2.0.0"
@ -3284,7 +3322,7 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "beca1b4eaceb4f2755df858b88d9b9315b7ccfd1ffd0d7a48a52602301f01a57"
dependencies = [
"heck",
"heck 0.4.0",
"proc-macro-error",
"proc-macro2",
"quote",
@ -3724,6 +3762,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-segmentation"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]]
name = "unicode-width"
version = "0.1.10"

View file

@ -250,6 +250,14 @@ Options:
--dump
Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked
--archive <ARCHIVE>
Specify the use of a specific web archive. Can be used in combination with `--suggest`
[possible values: wayback]
--suggest
Suggest link replacements for broken links, using a web archive. The web archive can be specified with `--archive`
-m, --max-redirects <MAX_REDIRECTS>
Maximum number of allowed redirects

View file

@ -49,6 +49,7 @@ secrecy = { version = "0.8.0", features = ["serde"] }
supports-color = "2.0.0"
log = "0.4.17"
env_logger = "0.10.0"
strum = {version = "0.23.0" , features = ["derive"] }
[dependencies.clap]
version = "4.1.11"

View file

@ -0,0 +1,42 @@
use reqwest::{Error, Url};
use serde::{Deserialize, Serialize};
use std::fmt::Display;
use strum::{Display, EnumIter, EnumString, EnumVariantNames};
use crate::color::{color, GREEN, PINK};
mod wayback;
/// A replacement suggestion for a broken link: the original (broken) URL
/// paired with an archived URL that may serve as a substitute.
#[derive(Debug, Serialize, Eq, Hash, PartialEq)]
pub(crate) struct Suggestion {
/// The broken URL as it was found in the input.
pub(crate) original: Url,
/// The archived URL proposed as a replacement.
pub(crate) suggestion: Url,
}
impl Display for Suggestion {
    /// Renders the suggestion as `<original> <replacement>`, coloring the
    /// broken link pink and the proposed archive link green.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        color!(f, PINK, "{}", self.original)?;
        write!(f, " ")?;
        color!(f, GREEN, "{}", self.suggestion)
    }
}
/// Supported web-archive providers used to look up archived snapshots of
/// broken links.
///
/// Marked `#[non_exhaustive]` so further providers can be added without
/// breaking downstream matches.
#[non_exhaustive]
#[derive(Debug, Deserialize, Default, Clone, Display, EnumIter, EnumString, EnumVariantNames)]
pub(crate) enum Archive {
/// The Internet Archive's Wayback Machine — the default provider.
/// Parses case-insensitively from the string "wayback" (CLI/config).
#[serde(rename = "wayback")]
#[strum(serialize = "wayback", ascii_case_insensitive)]
#[default]
WaybackMachine,
}
impl Archive {
    /// Queries this archive provider for an archived snapshot of `original`.
    ///
    /// Returns `Ok(None)` when the provider holds no snapshot of the URL, and
    /// an `Err` when the lookup request itself fails.
    pub(crate) async fn get_link(&self, original: &Url) -> Result<Option<Url>, Error> {
        match self {
            Archive::WaybackMachine => wayback::get_wayback_link(original).await,
        }
    }
}

View file

@ -0,0 +1,79 @@
use once_cell::sync::Lazy;
use serde::{Deserialize, Deserializer};
use http::StatusCode;
use reqwest::{Error, Url};
/// Base endpoint of the Wayback Machine availability API, parsed once lazily.
/// The `unwrap` cannot fire: the literal is a valid absolute URL.
static WAYBACK_URL: Lazy<Url> =
Lazy::new(|| Url::parse("https://archive.org/wayback/available").unwrap());
/// Asks the Wayback Machine availability API whether it has a snapshot of
/// `url`.
///
/// Returns the URL of the closest capture, or `None` if the page was never
/// archived. Errors are network/deserialization failures from `reqwest`.
pub(crate) async fn get_wayback_link(url: &Url) -> Result<Option<Url>, Error> {
    // Build `https://archive.org/wayback/available?url=<url>`.
    let mut query_url = WAYBACK_URL.clone();
    query_url.set_query(Some(&format!("url={url}")));
    let response: InternetArchiveResponse = reqwest::get(query_url).await?.json().await?;
    let closest = response.archived_snapshots.closest;
    Ok(closest.map(|snapshot| snapshot.url))
}
/// Top-level response body of the Wayback availability API
/// (`https://archive.org/wayback/available`).
#[derive(Debug, Deserialize, Eq, PartialEq)]
pub(crate) struct InternetArchiveResponse {
/// The queried URL, echoed back by the API.
pub(crate) url: Url,
/// Container for the closest archived snapshot, if any exists.
pub(crate) archived_snapshots: ArchivedSnapshots,
}
/// Wrapper around the optional `closest` snapshot returned by the API.
#[derive(Debug, Deserialize, Eq, PartialEq)]
pub(crate) struct ArchivedSnapshots {
/// The capture closest in time to the request; `None` if never archived.
pub(crate) closest: Option<Closest>,
}
/// Metadata of a single archived capture.
#[derive(Debug, Deserialize, Eq, PartialEq)]
pub(crate) struct Closest {
/// HTTP status of the capture; the API sends it as a string, so it is
/// converted via the `from_string` helper below.
#[serde(deserialize_with = "from_string")]
pub(crate) status: StatusCode,
/// Whether the snapshot is available for retrieval.
pub(crate) available: bool,
/// URL of the archived copy (on `web.archive.org`).
pub(crate) url: Url,
/// Capture timestamp — a string in the API's own format
/// (presumably `YYYYMMDDhhmmss`; TODO confirm against the API docs).
pub(crate) timestamp: String,
}
fn from_string<'d, D>(deserializer: D) -> Result<StatusCode, D::Error>
where
D: Deserializer<'d>,
{
let value: &str = Deserialize::deserialize(deserializer)?;
let result = value.parse::<u16>().unwrap();
Ok(StatusCode::from_u16(result).unwrap())
}
#[cfg(test)]
mod tests {
use crate::archive::wayback::get_wayback_link;
use reqwest::Error;
// NOTE(review): both tests hit the live archive.org API, so they require
// network access and may be flaky offline or under rate limiting.
// A well-known, long-archived URL should yield a web.archive.org snapshot.
#[tokio::test]
async fn wayback_suggestion() -> Result<(), Error> {
let url = &"https://example.com".try_into().unwrap();
let response = get_wayback_link(url).await?;
let suggestion = response.unwrap();
assert!(suggestion.as_str().contains("web.archive.org"));
Ok(())
}
// A URL that was never archived should produce no suggestion.
#[tokio::test]
async fn wayback_suggestion_unknown_url() -> Result<(), Error> {
let url = &"https://github.com/mre/idiomatic-rust-doesnt-exist-man"
.try_into()
.unwrap();
let response = get_wayback_link(url).await?;
assert_eq!(response, None);
Ok(())
}
}

View file

@ -5,16 +5,19 @@ use std::time::Duration;
use indicatif::ProgressBar;
use indicatif::ProgressStyle;
use lychee_lib::Result;
use lychee_lib::Status;
use reqwest::Url;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tokio_stream::StreamExt;
use lychee_lib::Status;
use lychee_lib::{Client, Request, Response};
use lychee_lib::{InputSource, Result};
use crate::archive::{Archive, Suggestion};
use crate::formatters::response::ResponseFormatter;
use crate::verbosity::Verbosity;
use crate::{cache::Cache, stats::ResponseStats, ExitCode};
use lychee_lib::{Client, Request, Response};
use super::CommandParams;
@ -42,7 +45,7 @@ where
let pb = if params.cfg.no_progress {
None
} else {
Some(init_progress_bar())
Some(init_progress_bar("Extracting links"))
};
// Start receiving requests
@ -68,12 +71,21 @@ where
// Wait until all responses are received
let result = show_results_task.await?;
let (pb, stats) = result?;
let (pb, mut stats) = result?;
// Note that print statements may interfere with the progress bar, so this
// must go before printing the stats
if let Some(pb) = &pb {
pb.finish_and_clear();
pb.finish_with_message("Finished extracting links");
}
if params.cfg.suggest {
suggest_archived_links(
params.cfg.archive.unwrap_or_default(),
&mut stats,
!params.cfg.no_progress,
)
.await;
}
let code = if stats.is_success() {
@ -84,6 +96,48 @@ where
Ok((stats, cache_ref, code))
}
/// Looks up archived alternatives for every failed URL in `stats` and records
/// the results in `stats.suggestion_map`, keyed by the originating input.
///
/// URLs with non-archivable schemes (`data:`, `mailto:`, `file:`) are skipped.
/// A progress bar tracks the lookups when `show_progress` is `true`.
async fn suggest_archived_links(archive: Archive, stats: &mut ResponseStats, show_progress: bool) {
    // Collect the candidates up front so the progress bar gets an exact length.
    let mut failed_urls: Vec<(&InputSource, Url)> = Vec::new();
    for (source, responses) in &stats.fail_map {
        for response in responses {
            let uri = &response.uri;
            // No web archive can hold copies of these schemes.
            if uri.is_data() || uri.is_mail() || uri.is_file() {
                continue;
            }
            failed_urls.push((source, uri.as_str().try_into().unwrap()));
        }
    }
    let bar = show_progress.then(|| {
        let bar = init_progress_bar("Searching for alternatives");
        bar.set_length(failed_urls.len() as u64);
        bar
    });
    for (input, url) in &failed_urls {
        if let Ok(Some(suggestion)) = archive.get_link(url).await {
            stats
                .suggestion_map
                .entry((*input).clone())
                .or_default()
                .insert(Suggestion {
                    suggestion,
                    original: url.clone(),
                });
        }
        if let Some(bar) = &bar {
            bar.inc(1);
        }
    }
    if let Some(bar) = &bar {
        bar.finish_with_message("Finished searching for alternatives");
    }
}
// drops the `send_req` channel on exit
// required for the receiver task to end, which closes send_resp, which allows
// the show_results_task to finish
@ -125,7 +179,7 @@ async fn progress_bar_task(
Ok((pb, stats))
}
fn init_progress_bar() -> ProgressBar {
fn init_progress_bar(initial_message: &'static str) -> ProgressBar {
let bar = ProgressBar::new_spinner().with_style(
ProgressStyle::with_template(
"{spinner:.197.bright} {pos}/{len:.dim} ETA {eta} {bar:.dim} {wide_msg}",
@ -133,7 +187,7 @@ fn init_progress_bar() -> ProgressBar {
.expect("Valid progress bar"),
);
bar.set_length(0);
bar.set_message("Extracting links");
bar.set_message(initial_message);
// report status _at least_ every 500ms
bar.enable_steady_tick(Duration::from_millis(500));
bar
@ -232,6 +286,7 @@ fn show_progress(
#[cfg(test)]
mod tests {
use log::info;
use lychee_lib::{CacheStatus, InputSource, ResponseBody, Uri};
use crate::formatters;

View file

@ -59,6 +59,14 @@ impl Display for CompactResponseStats {
for response in responses {
writeln!(f, "{}", color_response(response))?;
}
if let Some(suggestions) = &stats.suggestion_map.get(source) {
writeln!(f, "\n\u{2139} Suggestions")?;
for suggestion in *suggestions {
writeln!(f, "{suggestion}")?;
}
}
writeln!(f)?;
}

View file

@ -45,6 +45,13 @@ impl Display for DetailedResponseStats {
write!(f, "\n\nErrors in {source}")?;
for response in responses {
write!(f, "\n{}", color_response(response))?;
if let Some(suggestions) = &stats.suggestion_map.get(source) {
writeln!(f, "\nSuggestions in {source}")?;
for suggestion in *suggestions {
writeln!(f, "{suggestion}")?;
}
}
}
}

View file

@ -1,9 +1,12 @@
use std::fmt::{self, Display};
use std::{
collections::{HashMap, HashSet},
fmt::{self, Display},
};
use super::StatsFormatter;
use anyhow::Result;
use http::StatusCode;
use lychee_lib::{ResponseBody, Status};
use lychee_lib::{InputSource, ResponseBody, Status};
use std::fmt::Write;
use tabled::{object::Segment, Alignment, Modify, Table, Tabled};
@ -94,28 +97,44 @@ impl Display for MarkdownResponseStats {
writeln!(f)?;
writeln!(f, "{}", stats_table(&self.0))?;
if !&stats.fail_map.is_empty() {
writeln!(f)?;
writeln!(f, "## Errors per input\n")?;
for (source, responses) in &stats.fail_map {
// Using leading newlines over trailing ones (e.g. `writeln!`)
// lets us avoid extra newlines without any additional logic.
writeln!(f, "### Errors in {source}\n")?;
for response in responses {
writeln!(
f,
"{}",
markdown_response(response).map_err(|_e| fmt::Error)?
)?;
}
writeln!(f)?;
}
}
write_stats_per_input(f, "Errors", &stats.fail_map, |response| {
markdown_response(response).map_err(|_e| fmt::Error)
})?;
write_stats_per_input(f, "Suggestions", &stats.suggestion_map, |suggestion| {
Ok(format!(
"* {} --> {}",
suggestion.original, suggestion.suggestion
))
})?;
Ok(())
}
}
/// Writes a `## <name> per input` markdown section: for each input source in
/// `map`, a `### <name> in <source>` heading followed by one line per entry,
/// rendered through `write_stat`.
///
/// Writes nothing at all when `map` is empty.
fn write_stats_per_input<T, F>(
    f: &mut fmt::Formatter<'_>,
    name: &'static str,
    map: &HashMap<InputSource, HashSet<T>>,
    write_stat: F,
) -> fmt::Result
where
    T: Display,
    F: Fn(&T) -> Result<String, std::fmt::Error>,
{
    if map.is_empty() {
        return Ok(());
    }
    writeln!(f, "\n## {name} per input")?;
    for (source, entries) in map {
        writeln!(f, "\n### {name} in {source}\n")?;
        for entry in entries {
            let line = write_stat(entry)?;
            writeln!(f, "{line}")?;
        }
    }
    Ok(())
}
pub(crate) struct Markdown;
impl Markdown {
@ -136,6 +155,9 @@ mod tests {
use http::StatusCode;
use lychee_lib::{CacheStatus, InputSource, Response, ResponseBody, Status, Uri};
use reqwest::Url;
use crate::archive::Suggestion;
use super::*;
@ -205,6 +227,14 @@ mod tests {
},
);
stats.add(response);
stats
.suggestion_map
.entry((InputSource::Stdin).clone())
.or_default()
.insert(Suggestion {
suggestion: Url::parse("https://example.com/suggestion").unwrap(),
original: Url::parse("https://example.com/original").unwrap(),
});
let summary = MarkdownResponseStats(stats);
let expected = r#"## Summary
@ -224,6 +254,11 @@ mod tests {
* [404] [http://127.0.0.1/](http://127.0.0.1/) | Cached: Error (cached)
## Suggestions per input
### Suggestions in stdin
* https://example.com/original --> https://example.com/suggestion
"#;
assert_eq!(summary.to_string(), expected.to_string());
}

View file

@ -76,6 +76,7 @@ use ring as _; // required for apple silicon
use lychee_lib::Collector;
mod archive;
mod cache;
mod client;
mod color;
@ -109,7 +110,7 @@ enum ExitCode {
}
/// Ignore lines starting with this marker in `.lycheeignore` files
const LYCHEEINGORE_COMMENT_MARKER: &str = "#";
const LYCHEEIGNORE_COMMENT_MARKER: &str = "#";
fn main() -> Result<()> {
#[cfg(feature = "tokio-console")]
@ -128,7 +129,7 @@ fn read_lines(file: &File) -> Result<Vec<String>> {
Ok(lines
.into_iter()
.filter(|line| {
!line.is_empty() && !line.trim_start().starts_with(LYCHEEINGORE_COMMENT_MARKER)
!line.is_empty() && !line.trim_start().starts_with(LYCHEEIGNORE_COMMENT_MARKER)
})
.collect())
}

View file

@ -1,7 +1,8 @@
use crate::archive::Archive;
use crate::parse::{parse_base, parse_statuscodes};
use crate::verbosity::Verbosity;
use anyhow::{anyhow, Context, Error, Result};
use clap::{arg, Parser};
use clap::{arg, builder::TypedValueParser, Parser};
use const_format::{concatcp, formatcp};
use lychee_lib::{
Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS,
@ -11,6 +12,7 @@ use secrecy::{ExposeSecret, SecretString};
use serde::Deserialize;
use std::path::Path;
use std::{collections::HashSet, fs, path::PathBuf, str::FromStr, time::Duration};
use strum::VariantNames;
pub(crate) const LYCHEE_IGNORE_FILE: &str = ".lycheeignore";
pub(crate) const LYCHEE_CACHE_FILE: &str = ".lycheecache";
@ -179,6 +181,18 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) dump: bool,
/// Specify the use of a specific web archive.
/// Can be used in combination with `--suggest`
#[arg(long, value_parser = clap::builder::PossibleValuesParser::new(Archive::VARIANTS).map(|s| s.parse::<Archive>().unwrap()))]
#[serde(default)]
pub(crate) archive: Option<Archive>,
/// Suggest link replacements for broken links, using a web archive.
/// The web archive can be specified with `--archive`
#[arg(long)]
#[serde(default)]
pub(crate) suggest: bool,
/// Maximum number of allowed redirects
#[arg(short, long, default_value = &MAX_REDIRECTS_STR)]
#[serde(default = "max_redirects")]

View file

@ -1,5 +1,6 @@
use std::collections::{HashMap, HashSet};
use crate::archive::Suggestion;
use lychee_lib::{CacheStatus, InputSource, Response, ResponseBody, Status};
use serde::Serialize;
@ -17,6 +18,7 @@ pub(crate) struct ResponseStats {
pub(crate) cached: usize,
pub(crate) success_map: HashMap<InputSource, HashSet<ResponseBody>>,
pub(crate) fail_map: HashMap<InputSource, HashSet<ResponseBody>>,
pub(crate) suggestion_map: HashMap<InputSource, HashSet<Suggestion>>,
pub(crate) excluded_map: HashMap<InputSource, HashSet<ResponseBody>>,
}

View file

@ -78,6 +78,7 @@ mod cli {
cached: usize,
success_map: HashMap<InputSource, HashSet<ResponseBody>>,
fail_map: HashMap<InputSource, HashSet<ResponseBody>>,
suggestion_map: HashMap<InputSource, HashSet<ResponseBody>>,
excluded_map: HashMap<InputSource, HashSet<ResponseBody>>,
}
@ -97,6 +98,7 @@ mod cli {
"cached": {},
"success_map": {:?},
"fail_map": {:?},
"suggestion_map": {:?},
"excluded_map": {:?}
}}"#,
self.detailed_stats,
@ -110,6 +112,7 @@ mod cli {
self.errors,
self.cached,
self.success_map,
self.suggestion_map,
self.fail_map,
self.excluded_map
)
@ -508,7 +511,7 @@ mod cli {
.assert()
.success();
let expected = r#"{"detailed_stats":false,"total":11,"successful":11,"unknown":0,"unsupported":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"cached":0,"success_map":{},"fail_map":{},"excluded_map":{}}"#;
let expected = r#"{"detailed_stats":false,"total":11,"successful":11,"unknown":0,"unsupported":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"cached":0,"success_map":{},"fail_map":{},"suggestion_map":{},"excluded_map":{}}"#;
let output = fs::read_to_string(&outfile)?;
assert_eq!(output.split_whitespace().collect::<String>(), expected);
fs::remove_file(outfile)?;