mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-11 15:23:11 +00:00
* Update flake * Fix clippy's new suggestions * Do not ignore tests any longer since they work by now * Add ignore reason
177 lines
5.8 KiB
Rust
177 lines
5.8 KiB
Rust
use std::sync::LazyLock;
|
|
use std::time::Duration;
|
|
|
|
use serde::de::Error as SerdeError;
|
|
use serde::{Deserialize, Deserializer};
|
|
|
|
use http::StatusCode;
|
|
use reqwest::{Client, Error, Url};
|
|
|
|
static WAYBACK_URL: LazyLock<Url> =
|
|
LazyLock::new(|| Url::parse("https://archive.org/wayback/available").unwrap());
|
|
|
|
pub(crate) async fn get_archive_snapshot(
|
|
url: &Url,
|
|
timeout: Duration,
|
|
) -> Result<Option<Url>, Error> {
|
|
get_archive_snapshot_internal(url, timeout, WAYBACK_URL.clone()).await
|
|
}
|
|
|
|
async fn get_archive_snapshot_internal(
|
|
url: &Url,
|
|
timeout: Duration,
|
|
mut api: Url,
|
|
) -> Result<Option<Url>, Error> {
|
|
let url = url.to_string();
|
|
|
|
// The Wayback API doesn't return any snapshots for URLs with trailing slashes
|
|
let stripped = url.strip_suffix("/").unwrap_or(&url);
|
|
api.set_query(Some(&format!("url={stripped}")));
|
|
|
|
let response = Client::builder()
|
|
.timeout(timeout)
|
|
.build()?
|
|
.get(api)
|
|
.send()
|
|
.await?
|
|
.json::<InternetArchiveResponse>()
|
|
.await?;
|
|
|
|
Ok(response
|
|
.archived_snapshots
|
|
.closest
|
|
.map(|closest| closest.url))
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Eq, PartialEq)]
|
|
pub(crate) struct InternetArchiveResponse {
|
|
pub(crate) url: Url,
|
|
pub(crate) archived_snapshots: ArchivedSnapshots,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Eq, PartialEq)]
|
|
pub(crate) struct ArchivedSnapshots {
|
|
pub(crate) closest: Option<Closest>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize, Eq, PartialEq)]
|
|
pub(crate) struct Closest {
|
|
#[serde(deserialize_with = "from_string")]
|
|
pub(crate) status: StatusCode,
|
|
pub(crate) available: bool,
|
|
pub(crate) url: Url,
|
|
pub(crate) timestamp: String,
|
|
}
|
|
|
|
fn from_string<'d, D>(deserializer: D) -> Result<StatusCode, D::Error>
|
|
where
|
|
D: Deserializer<'d>,
|
|
{
|
|
let value: &str = Deserialize::deserialize(deserializer)?;
|
|
let result = value
|
|
.parse::<u16>()
|
|
.map_err(|e| D::Error::custom(e.to_string()))?;
|
|
StatusCode::from_u16(result).map_err(|e| D::Error::custom(e.to_string()))
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use crate::archive::wayback::{get_archive_snapshot, get_archive_snapshot_internal};
|
|
use http::StatusCode;
|
|
use reqwest::{Client, Error, Url};
|
|
use std::{error::Error as StdError, time::Duration};
|
|
use wiremock::matchers::query_param;
|
|
|
|
const TIMEOUT: Duration = Duration::from_secs(20);
|
|
|
|
#[tokio::test]
|
|
/// Test retrieval by mocking the Wayback API.
|
|
/// We mock their API beacuse unfortuantely it happens quite often that the
|
|
/// `archived_snapshots` field is empty because the API is unreliable.
|
|
/// This way we avoid flaky tests.
|
|
async fn wayback_suggestion_mocked() -> Result<(), Box<dyn StdError>> {
|
|
let mock_server = wiremock::MockServer::start().await;
|
|
let api_url = mock_server.uri();
|
|
let api_response = wiremock::ResponseTemplate::new(StatusCode::OK).set_body_raw(
|
|
r#"
|
|
{
|
|
"url": "https://google.com/jobs.html",
|
|
"archived_snapshots": {
|
|
"closest": {
|
|
"available": true,
|
|
"url": "http://web.archive.org/web/20130919044612/http://example.com/",
|
|
"timestamp": "20130919044612",
|
|
"status": "200"
|
|
}
|
|
}
|
|
}
|
|
"#,
|
|
"application/json",
|
|
);
|
|
|
|
let url_to_restore = "https://example.com".parse::<Url>()?;
|
|
wiremock::Mock::given(wiremock::matchers::method("GET"))
|
|
.and(query_param(
|
|
"url",
|
|
url_to_restore.as_str().strip_suffix("/").unwrap(),
|
|
))
|
|
.respond_with(api_response)
|
|
.mount(&mock_server)
|
|
.await;
|
|
|
|
let result =
|
|
get_archive_snapshot_internal(&url_to_restore, TIMEOUT, api_url.parse()?).await;
|
|
|
|
assert_eq!(
|
|
result?,
|
|
Some("http://web.archive.org/web/20130919044612/http://example.com/".parse()?)
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[tokio::test]
|
|
/// Their API documentation mentions when the last changes occurred.
|
|
/// Because we mock their API in previous tests we try to detect breaking API changes with this test.
|
|
async fn wayback_api_no_breaking_changes() -> Result<(), Error> {
|
|
let api_docs_url = "https://archive.org/help/wayback_api.php";
|
|
let html = Client::builder()
|
|
.timeout(TIMEOUT)
|
|
.build()?
|
|
.get(api_docs_url)
|
|
.send()
|
|
.await?
|
|
.text()
|
|
.await?;
|
|
|
|
assert!(html.contains("Updated on September, 24, 2013"));
|
|
Ok(())
|
|
}
|
|
|
|
#[ignore = "
|
|
It is flaky because the API does not reliably return snapshots,
|
|
i.e. the `archived_snapshots` field is unreliable.
|
|
That's why the test is ignored. For development and documentation this test is still useful."]
|
|
#[tokio::test]
|
|
/// This tests the real Wayback API without any mocks.
|
|
async fn wayback_suggestion_real() -> Result<(), Box<dyn StdError>> {
|
|
let url = &"https://example.com".try_into()?;
|
|
let response = get_archive_snapshot(url, TIMEOUT).await?;
|
|
assert_eq!(
|
|
response,
|
|
Some("http://web.archive.org/web/20250603204626/http://www.example.com/".parse()?)
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
#[tokio::test]
|
|
/// This tests the real Wayback API without any mocks.
|
|
/// The flakyness of the API shouldn't affect this test because it originates from
|
|
/// the `archived_snapshots` field.
|
|
async fn wayback_suggestion_real_unknown() -> Result<(), Box<dyn StdError>> {
|
|
let url = &"https://github.com/mre/idiomatic-rust-doesnt-exist-man".try_into()?;
|
|
let response = get_archive_snapshot(url, TIMEOUT).await?;
|
|
assert_eq!(response, None);
|
|
Ok(())
|
|
}
|
|
}
|