From 31b2525a8d0422030161f01d92c85be94bacc492 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 6 Jun 2025 22:24:10 +0200 Subject: [PATCH] Move archive functionality to library (#1720) * Bump flake 1.83.0 -> 1.87.0 * Move archive functionality into lychee-lib * Create example, update name and docs * Split function & update tests * Remove trailing slashes in API calls & update tests * Apply lint suggestions * Rename function * Move module * Add cargo-nextest to devShell to support 'make test' --- Cargo.lock | 10 + README.md | 9 +- examples/archive/Cargo.toml | 19 ++ examples/archive/LICENSE-APACHE | 201 ++++++++++++++++++++ examples/archive/LICENSE-MIT | 21 ++ examples/archive/archive.rs | 18 ++ flake.lock | 18 +- flake.nix | 1 + lychee-bin/src/archive/mod.rs | 46 ----- lychee-bin/src/archive/wayback/mod.rs | 137 ------------- lychee-bin/src/commands/check.rs | 5 +- lychee-bin/src/formatters/mod.rs | 1 + lychee-bin/src/formatters/stats/markdown.rs | 3 +- lychee-bin/src/formatters/suggestion.rs | 23 +++ lychee-bin/src/main.rs | 1 - lychee-bin/src/options.rs | 7 +- lychee-bin/src/stats.rs | 3 +- lychee-lib/Cargo.toml | 2 + lychee-lib/src/archive/mod.rs | 38 ++++ lychee-lib/src/archive/wayback/mod.rs | 177 +++++++++++++++++ lychee-lib/src/lib.rs | 2 + 21 files changed, 535 insertions(+), 207 deletions(-) create mode 100644 examples/archive/Cargo.toml create mode 100644 examples/archive/LICENSE-APACHE create mode 100644 examples/archive/LICENSE-MIT create mode 100644 examples/archive/archive.rs delete mode 100644 lychee-bin/src/archive/mod.rs delete mode 100644 lychee-bin/src/archive/wayback/mod.rs create mode 100644 lychee-bin/src/formatters/suggestion.rs create mode 100644 lychee-lib/src/archive/mod.rs create mode 100644 lychee-lib/src/archive/wayback/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 3567332..2ff6f7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -128,6 +128,15 @@ version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" +[[package]] +name = "archive" +version = "0.1.0" +dependencies = [ + "lychee-lib", + "tokio", + "url", +] + [[package]] name = "ascii_utils" version = "0.9.3" @@ -2566,6 +2575,7 @@ dependencies = [ "serde_json", "serde_with", "shellexpand", + "strum", "tempfile", "thiserror 2.0.12", "tokio", diff --git a/README.md b/README.md index eafe2c2..18cf0f7 100644 --- a/README.md +++ b/README.md @@ -461,9 +461,9 @@ Options: -H, --header Set custom header for requests - Some websites require custom headers to be passed in order to return valid responses. + Some websites require custom headers to be passed in order to return valid responses. You can specify custom headers in the format 'Name: Value'. For example, 'Accept: text/html'. - This is the same format that other tools like curl or wget use. + This is the same format that other tools like curl or wget use. Multiple headers can be specified by using the flag multiple times. -a, --accept @@ -666,10 +666,9 @@ let client = lychee_lib::ClientBuilder::builder() ``` All options that you set will be used for all link checks. -See the [builder -documentation](https://docs.rs/lychee-lib/latest/lychee_lib/struct.ClientBuilder.html) +See the [builder documentation](https://docs.rs/lychee-lib/latest/lychee_lib/struct.ClientBuilder.html) for all options. For more information, check out the [examples](examples) -folder. +directory. The examples can be run with `cargo run --example `. 
## GitHub Action Usage diff --git a/examples/archive/Cargo.toml b/examples/archive/Cargo.toml new file mode 100644 index 0000000..e114a6d --- /dev/null +++ b/examples/archive/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "archive" +version = "0.1.0" +edition = "2024" + +[[example]] +name = "archive" +path = "archive.rs" + +[dependencies] +lychee-lib = { path = "../../lychee-lib", default-features = false } +tokio = { version = "1.45.1", features = ["full"] } +url = "2.5.4" + +[features] +email-check = ["lychee-lib/email-check"] +native-tls = ["lychee-lib/native-tls"] +rustls-tls = ["lychee-lib/rustls-tls"] +default = ["native-tls", "email-check"] diff --git a/examples/archive/LICENSE-APACHE b/examples/archive/LICENSE-APACHE new file mode 100644 index 0000000..f51e79e --- /dev/null +++ b/examples/archive/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright 2020 The lychee maintainers + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/examples/archive/LICENSE-MIT b/examples/archive/LICENSE-MIT new file mode 100644 index 0000000..65a3ce3 --- /dev/null +++ b/examples/archive/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 The lychee maintainers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/examples/archive/archive.rs b/examples/archive/archive.rs new file mode 100644 index 0000000..1d096f0 --- /dev/null +++ b/examples/archive/archive.rs @@ -0,0 +1,18 @@ +use lychee_lib::archive::Archive; +use std::{error::Error, time::Duration}; +use url::Url; + +#[tokio::main] +async fn main() -> Result<(), Box<dyn Error>> { + let archive = Archive::WaybackMachine; + let url = Url::parse("https://example.com")?; + let result = archive + .get_archive_snapshot(&url, Duration::from_secs(10)) + .await?; + + if let Some(replacement) = result { + println!("Good news! {} can be replaced with {}", url, replacement); + } + + Ok(()) +} diff --git a/flake.lock b/flake.lock index 098a46b..b17a3aa 100644 --- a/flake.lock +++ b/flake.lock @@ -2,11 +2,11 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1733759999, - "narHash": "sha256-463SNPWmz46iLzJKRzO3Q2b0Aurff3U1n0nYItxq7jU=", + "lastModified": 1748370509, + "narHash": "sha256-QlL8slIgc16W5UaI3w7xHQEP+Qmv/6vSNTpoZrrSlbk=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "a73246e2eef4c6ed172979932bc80e1404ba2d56", + "rev": "4faa5f5321320e49a78ae7848582f684d64783e9", "type": "github" }, "original": { @@ -18,11 +18,11 @@ }, "nixpkgs_2": { "locked": { - "lastModified": 1728538411, - "narHash": "sha256-f0SBJz1eZ2yOuKUr5CA9BHULGXVSn6miBuUWdTyhUhU=", + "lastModified": 1744536153, + "narHash": "sha256-awS2zRgF4uTwrOKwwiJcByDzDOdo3Q1rPZbiHQg/N38=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "b69de56fac8c2b6f8fd27f2eca01dcda8e0a4221", + "rev": "18dd725c29603f582cf1900e0d25f9f1063dbf11", "type": "github" }, "original": { @@ -43,11 +43,11 @@ "nixpkgs": "nixpkgs_2" }, "locked": { - "lastModified": 1733798086, - "narHash": "sha256-XHIh0h84xDnjkqampyNI/r2FAkKmwbL719ZsygiJHKE=", + "lastModified": 1748486227, + "narHash": "sha256-veMuFa9cq/XgUXp1S57oC8K0TIw3XyZWL2jIyGWlW0c=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "8a19e07800d64462913f3dbf5c9a20ea7b50e6cd", + "rev": "4bf1892eb81113e868efe67982b64f1da15c8c5a", "type": 
"github" }, "original": { diff --git a/flake.nix b/flake.nix index 02516d2..3757101 100644 --- a/flake.nix +++ b/flake.nix @@ -43,6 +43,7 @@ pkgs.pkg-config pkgs.openssl rust + pkgs.cargo-nextest ]; }; } diff --git a/lychee-bin/src/archive/mod.rs b/lychee-bin/src/archive/mod.rs deleted file mode 100644 index 98e41bf..0000000 --- a/lychee-bin/src/archive/mod.rs +++ /dev/null @@ -1,46 +0,0 @@ -use reqwest::{Error, Url}; -use serde::{Deserialize, Serialize}; -use std::{fmt::Display, time::Duration}; -use strum::{Display, EnumIter, EnumString, VariantNames}; - -use crate::color::{GREEN, PINK, color}; - -mod wayback; - -#[derive(Debug, Serialize, Eq, Hash, PartialEq)] -pub(crate) struct Suggestion { - pub(crate) original: Url, - pub(crate) suggestion: Url, -} - -impl Display for Suggestion { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - color!(f, PINK, "{}", self.original)?; - write!(f, " ")?; - color!(f, GREEN, "{}", self.suggestion)?; - Ok(()) - } -} - -#[non_exhaustive] -#[derive(Debug, Deserialize, Default, Clone, Display, EnumIter, EnumString, VariantNames)] -pub(crate) enum Archive { - #[serde(rename = "wayback")] - #[strum(serialize = "wayback", ascii_case_insensitive)] - #[default] - WaybackMachine, -} - -impl Archive { - pub(crate) async fn get_link( - &self, - original: &Url, - timeout: Duration, - ) -> Result, Error> { - let function = match self { - Archive::WaybackMachine => wayback::get_wayback_link, - }; - - function(original, timeout).await - } -} diff --git a/lychee-bin/src/archive/wayback/mod.rs b/lychee-bin/src/archive/wayback/mod.rs deleted file mode 100644 index 577cf09..0000000 --- a/lychee-bin/src/archive/wayback/mod.rs +++ /dev/null @@ -1,137 +0,0 @@ -use std::sync::LazyLock; -use std::time::Duration; - -use serde::de::Error as SerdeError; -use serde::{Deserialize, Deserializer}; - -use http::StatusCode; -use reqwest::{Client, Error, Url}; -static WAYBACK_URL: LazyLock = - LazyLock::new(|| 
Url::parse("https://archive.org/wayback/available").unwrap()); - -pub(crate) async fn get_wayback_link(url: &Url, timeout: Duration) -> Result, Error> { - let mut archive_url: Url = WAYBACK_URL.clone(); - archive_url.set_query(Some(&format!("url={url}"))); - - let response = Client::builder() - .timeout(timeout) - .build()? - .get(archive_url) - .send() - .await? - .json::() - .await?; - - Ok(response - .archived_snapshots - .closest - .map(|closest| closest.url)) -} - -#[derive(Debug, Deserialize, Eq, PartialEq)] -pub(crate) struct InternetArchiveResponse { - pub(crate) url: Url, - pub(crate) archived_snapshots: ArchivedSnapshots, -} - -#[derive(Debug, Deserialize, Eq, PartialEq)] -pub(crate) struct ArchivedSnapshots { - pub(crate) closest: Option, -} - -#[derive(Debug, Deserialize, Eq, PartialEq)] -pub(crate) struct Closest { - #[serde(deserialize_with = "from_string")] - pub(crate) status: StatusCode, - pub(crate) available: bool, - pub(crate) url: Url, - pub(crate) timestamp: String, -} - -fn from_string<'d, D>(deserializer: D) -> Result -where - D: Deserializer<'d>, -{ - let value: &str = Deserialize::deserialize(deserializer)?; - let result = value - .parse::() - .map_err(|e| D::Error::custom(e.to_string()))?; - StatusCode::from_u16(result).map_err(|e| D::Error::custom(e.to_string())) -} - -#[cfg(test)] -mod tests { - use crate::archive::wayback::get_wayback_link; - use reqwest::{Error, Url}; - use std::{error::Error as StdError, time::Duration}; - use tokio::time::sleep; - - // This test is currently ignored because it is flaky. - // The Wayback Machine does not always return a suggestion. - // We can consider mocking the endpoint in the future. 
- #[tokio::test] - #[ignore = "Wayback Machine currently has certificate issues"] - async fn wayback_suggestion() -> Result<(), Box> { - let target_url = "https://example.com".parse::()?; - - // Extract domain from target_url without the scheme and trailing slash - let expected_ending = (target_url.host_str().ok_or("Invalid target URL")?).to_string(); - - // This test can be flaky, because the wayback machine does not always - // return a suggestion. Retry a few times if needed. - for _ in 0..3 { - match get_wayback_link(&target_url, Duration::from_secs(20)).await { - Ok(Some(suggested_url)) => { - // Ensure the host is correct - let host = suggested_url - .host_str() - .ok_or("Suggestion doesn't have a host")?; - assert_eq!(host, "web.archive.org"); - - // Extract the actual archived URL from the Wayback URL - let archived_url = suggested_url - .path() - .trim_start_matches("/web/") - .split_once('/') - .map(|x| x.1) - .ok_or("Failed to extract archived URL from Wayback suggestion")?; - - // Check the ending of the suggested URL without considering trailing slash - if !archived_url - .trim_end_matches('/') - .ends_with(&expected_ending) - { - return Err(format!( - "Expected suggestion '{archived_url}' to end with '{expected_ending}'" - ) - .into()); - } - - return Ok(()); - } - Ok(None) => { - // No suggestion was returned, wait and retry - sleep(Duration::from_secs(1)).await; - } - Err(e) => { - // Propagate other errors - return Err(format!("Error retrieving Wayback link: {e}").into()); - } - } - } - - Err("Did not get a valid Wayback Machine suggestion after multiple attempts.".into()) - } - - #[tokio::test] - #[ignore = "Wayback Machine currently has certificate issues"] - async fn wayback_suggestion_unknown_url() -> Result<(), Error> { - let url = &"https://github.com/mre/idiomatic-rust-doesnt-exist-man" - .try_into() - .unwrap(); - - let response = get_wayback_link(url, Duration::from_secs(20)).await?; - assert_eq!(response, None); - Ok(()) - } -} diff --git 
a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 05c9359..b622f29 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -10,13 +10,14 @@ use reqwest::Url; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; +use lychee_lib::archive::Archive; use lychee_lib::{Client, ErrorKind, Request, Response, Uri}; use lychee_lib::{InputSource, Result}; use lychee_lib::{ResponseBody, Status}; -use crate::archive::{Archive, Suggestion}; use crate::formatters::get_response_formatter; use crate::formatters::response::ResponseFormatter; +use crate::formatters::suggestion::Suggestion; use crate::options::OutputMode; use crate::parse::parse_duration_secs; use crate::verbosity::Verbosity; @@ -141,7 +142,7 @@ async fn suggest_archived_links( let suggestions = Mutex::new(&mut stats.suggestion_map); futures::stream::iter(failed_urls) - .map(|(input, url)| (input, url, archive.get_link(url, timeout))) + .map(|(input, url)| (input, url, archive.get_archive_snapshot(url, timeout))) .for_each_concurrent(max_concurrency, |(input, url, future)| async { if let Ok(Some(suggestion)) = future.await { suggestions diff --git a/lychee-bin/src/formatters/mod.rs b/lychee-bin/src/formatters/mod.rs index ff8712d..ddb82cf 100644 --- a/lychee-bin/src/formatters/mod.rs +++ b/lychee-bin/src/formatters/mod.rs @@ -3,6 +3,7 @@ pub(crate) mod duration; pub(crate) mod log; pub(crate) mod response; pub(crate) mod stats; +pub(crate) mod suggestion; use self::{response::ResponseFormatter, stats::StatsFormatter}; use crate::options::{OutputMode, StatsFormat}; diff --git a/lychee-bin/src/formatters/stats/markdown.rs b/lychee-bin/src/formatters/stats/markdown.rs index 1461ca3..1f6ea7f 100644 --- a/lychee-bin/src/formatters/stats/markdown.rs +++ b/lychee-bin/src/formatters/stats/markdown.rs @@ -154,12 +154,11 @@ impl StatsFormatter for Markdown { #[cfg(test)] mod tests { - use http::StatusCode; use lychee_lib::{CacheStatus, InputSource, Response, 
ResponseBody, Status, Uri}; use reqwest::Url; - use crate::archive::Suggestion; + use crate::formatters::suggestion::Suggestion; use super::*; diff --git a/lychee-bin/src/formatters/suggestion.rs b/lychee-bin/src/formatters/suggestion.rs new file mode 100644 index 0000000..a3c2d29 --- /dev/null +++ b/lychee-bin/src/formatters/suggestion.rs @@ -0,0 +1,23 @@ +use std::fmt::Display; + +use crate::color::{GREEN, PINK, color}; +use serde::Serialize; +use url::Url; + +#[derive(Debug, Serialize, Eq, Hash, PartialEq)] +/// A suggestion on how to replace a broken link with a link hosted by a web archive service. +pub(crate) struct Suggestion { + /// The original `Url` that was identified to be broken + pub(crate) original: Url, + /// The suggested `Url` replacement, which should remediate the broken link with the use of a digital archive service. + pub(crate) suggestion: Url, +} + +impl Display for Suggestion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + color!(f, PINK, "{}", self.original)?; + write!(f, " ")?; + color!(f, GREEN, "{}", self.suggestion)?; + Ok(()) + } +} diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 08e1d65..b4af979 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -79,7 +79,6 @@ use lychee_lib::BasicAuthExtractor; use lychee_lib::Collector; use lychee_lib::CookieJar; -mod archive; mod cache; mod client; mod commands; diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 630573a..62313af 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -1,4 +1,3 @@ -use crate::archive::Archive; use crate::parse::parse_base; use crate::verbosity::Verbosity; use anyhow::{Context, Error, Result, anyhow}; @@ -12,7 +11,7 @@ use http::{ use lychee_lib::{ Base, BasicAuthSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions, - FileType, Input, StatusCodeExcluder, StatusCodeSelector, + 
FileType, Input, StatusCodeExcluder, StatusCodeSelector, archive::Archive, }; use reqwest::tls; use secrecy::{ExposeSecret, SecretString}; @@ -579,9 +578,9 @@ Example: --fallback-extensions html,htm,php,asp,aspx,jsp,cgi" value_name = "HEADER:VALUE", long_help = "Set custom header for requests -Some websites require custom headers to be passed in order to return valid responses. +Some websites require custom headers to be passed in order to return valid responses. You can specify custom headers in the format 'Name: Value'. For example, 'Accept: text/html'. -This is the same format that other tools like curl or wget use. +This is the same format that other tools like curl or wget use. Multiple headers can be specified by using the flag multiple times." )] #[serde(default)] diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index f339080..87cd144 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -3,10 +3,11 @@ use std::collections::{HashMap, HashSet}; -use crate::archive::Suggestion; use lychee_lib::{CacheStatus, InputSource, Response, ResponseBody, Status}; use serde::Serialize; +use crate::formatters::suggestion::Suggestion; + /// Response statistics /// /// This struct contains various counters for the responses received during a diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 55abaa4..bd6fe89 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -42,6 +42,7 @@ reqwest = { version = "0.12.15", default-features = false, features = [ "gzip", "trust-dns", "cookies", + "json" ] } reqwest_cookie_store = { version = "0.8.0", features = ["serde"] } # Make build work on Apple Silicon. 
@@ -53,6 +54,7 @@ secrecy = "0.10.3" serde = { version = "1.0.219", features = ["derive"] } serde_with = "3.12.0" shellexpand = "3.1.1" +strum = { version = "0.27.1", features = ["derive"] } thiserror = "2.0.12" tokio = { version = "1.45.1", features = ["full"] } toml = "0.8.22" diff --git a/lychee-lib/src/archive/mod.rs b/lychee-lib/src/archive/mod.rs new file mode 100644 index 0000000..76e1dcb --- /dev/null +++ b/lychee-lib/src/archive/mod.rs @@ -0,0 +1,38 @@ +use reqwest::{Error, Url}; +use serde::Deserialize; +use std::time::Duration; +use strum::{Display, EnumIter, EnumString, VariantNames}; + +mod wayback; + +#[non_exhaustive] +#[derive(Debug, Deserialize, Default, Clone, Display, EnumIter, EnumString, VariantNames)] +/// The different supported online archive sites for restoring broken links. +pub enum Archive { + #[serde(rename = "wayback")] + #[strum(serialize = "wayback", ascii_case_insensitive)] + #[default] + /// The most prominent digital archive provided by the [Internet Archive](https://archive.org) + WaybackMachine, +} + +impl Archive { + /// Query the `Archive` to try and find the latest snapshot of the specified `url`. + /// Returns `None` if the specified `url` hasn't been archived in the past. + /// + /// # Errors + /// + /// Returns an error if the `reqwest` client cannot be built, the request itself fails + /// or the API response cannot be parsed.
+ pub async fn get_archive_snapshot( + &self, + url: &Url, + timeout: Duration, + ) -> Result<Option<Url>, Error> { + let function = match self { + Archive::WaybackMachine => wayback::get_archive_snapshot, + }; + + function(url, timeout).await + } +} diff --git a/lychee-lib/src/archive/wayback/mod.rs b/lychee-lib/src/archive/wayback/mod.rs new file mode 100644 index 0000000..1572104 --- /dev/null +++ b/lychee-lib/src/archive/wayback/mod.rs @@ -0,0 +1,177 @@ +use std::sync::LazyLock; +use std::time::Duration; + +use serde::de::Error as SerdeError; +use serde::{Deserialize, Deserializer}; + +use http::StatusCode; +use reqwest::{Client, Error, Url}; + +static WAYBACK_URL: LazyLock<Url> = + LazyLock::new(|| Url::parse("https://archive.org/wayback/available").unwrap()); + +pub(crate) async fn get_archive_snapshot( + url: &Url, + timeout: Duration, +) -> Result<Option<Url>, Error> { + get_archive_snapshot_internal(url, timeout, WAYBACK_URL.clone()).await +} + +async fn get_archive_snapshot_internal( + url: &Url, + timeout: Duration, + mut api: Url, +) -> Result<Option<Url>, Error> { + let url = url.to_string(); + + // The Wayback API doesn't return any snapshots for URLs with trailing slashes + let stripped = url.strip_suffix("/").unwrap_or(&url); + api.set_query(Some(&format!("url={stripped}"))); + + let response = Client::builder() + .timeout(timeout) + .build()? + .get(api) + .send() + .await?
+ .json::<InternetArchiveResponse>() + .await?; + + Ok(response + .archived_snapshots + .closest + .map(|closest| closest.url)) +} + +#[derive(Debug, Deserialize, Eq, PartialEq)] +pub(crate) struct InternetArchiveResponse { + pub(crate) url: Url, + pub(crate) archived_snapshots: ArchivedSnapshots, +} + +#[derive(Debug, Deserialize, Eq, PartialEq)] +pub(crate) struct ArchivedSnapshots { + pub(crate) closest: Option<Closest>, +} + +#[derive(Debug, Deserialize, Eq, PartialEq)] +pub(crate) struct Closest { + #[serde(deserialize_with = "from_string")] + pub(crate) status: StatusCode, + pub(crate) available: bool, + pub(crate) url: Url, + pub(crate) timestamp: String, +} + +fn from_string<'d, D>(deserializer: D) -> Result<StatusCode, D::Error> +where + D: Deserializer<'d>, +{ + let value: &str = Deserialize::deserialize(deserializer)?; + let result = value + .parse::<u16>() + .map_err(|e| D::Error::custom(e.to_string()))?; + StatusCode::from_u16(result).map_err(|e| D::Error::custom(e.to_string())) +} + +#[cfg(test)] +mod tests { + use crate::archive::wayback::{get_archive_snapshot, get_archive_snapshot_internal}; + use http::StatusCode; + use reqwest::{Client, Error, Url}; + use std::{error::Error as StdError, time::Duration}; + use wiremock::matchers::query_param; + + const TIMEOUT: Duration = Duration::from_secs(20); + + #[tokio::test] + /// Test retrieval by mocking the Wayback API. + /// We mock their API because unfortunately it happens quite often that the + /// `archived_snapshots` field is empty because the API is unreliable. + /// This way we avoid flaky tests.
+ async fn wayback_suggestion_mocked() -> Result<(), Box<dyn StdError>> { + let mock_server = wiremock::MockServer::start().await; + let api_url = mock_server.uri(); + let api_response = wiremock::ResponseTemplate::new(StatusCode::OK).set_body_raw( + r#" + { + "url": "https://google.com/jobs.html", + "archived_snapshots": { + "closest": { + "available": true, + "url": "http://web.archive.org/web/20130919044612/http://example.com/", + "timestamp": "20130919044612", + "status": "200" + } + } + } + "#, + "application/json", + ); + + let url_to_restore = "https://example.com".parse::<Url>()?; + wiremock::Mock::given(wiremock::matchers::method("GET")) + .and(query_param( + "url", + url_to_restore.as_str().strip_suffix("/").unwrap(), + )) + .respond_with(api_response) + .mount(&mock_server) + .await; + + let result = + get_archive_snapshot_internal(&url_to_restore, TIMEOUT, api_url.parse()?).await; + + assert_eq!( + result?, + Some("http://web.archive.org/web/20130919044612/http://example.com/".parse()?) + ); + + Ok(()) + } + + #[tokio::test] + /// Their API documentation mentions when the last changes occurred. + /// Because we mock their API in previous tests we try to detect breaking API changes with this test. + async fn wayback_api_no_breaking_changes() -> Result<(), Error> { + let api_docs_url = "https://archive.org/help/wayback_api.php"; + let html = Client::builder() + .timeout(TIMEOUT) + .build()? + .get(api_docs_url) + .send() + .await? + .text() + .await?; + + assert!(html.contains("Updated on September, 24, 2013")); + Ok(()) + } + + #[ignore] + #[tokio::test] + /// This tests the real Wayback API without any mocks. + /// It is flaky because the API does not reliably return snapshots, + /// i.e. the `archived_snapshots` field is unreliable. + /// That's why the test is ignored. For development and documentation this test is still useful.
+ async fn wayback_suggestion_real() -> Result<(), Box<dyn StdError>> { + let url = &"https://example.com".try_into()?; + let response = get_archive_snapshot(url, TIMEOUT).await?; + assert_eq!( + response, + Some("http://web.archive.org/web/20250603204626/http://www.example.com/".parse()?) + ); + Ok(()) + } + + #[tokio::test] + /// This tests the real Wayback API without any mocks. + /// The flakiness of the API shouldn't affect this test because it originates from + /// the `archived_snapshots` field. + async fn wayback_suggestion_real_unknown() -> Result<(), Box<dyn StdError>> { + let url = &"https://github.com/mre/idiomatic-rust-doesnt-exist-man".try_into()?; + let response = get_archive_snapshot(url, TIMEOUT).await?; + assert_eq!(response, None); + Ok(()) + } +} diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 7fa0007..8311d03 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -50,6 +50,8 @@ #[cfg(doctest)] doc_comment::doctest!("../../README.md"); +/// Check online archives to try and restore broken links +pub mod archive; mod basic_auth; pub mod chain; mod checker;