mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-03 19:34:45 +00:00
Add support for website quirks
This commit is contained in:
commit
bcb3933b22
5 changed files with 174 additions and 9 deletions
2
fixtures/TEST_QUIRKS.txt
Normal file
2
fixtures/TEST_QUIRKS.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7
|
||||
https://twitter.com/zarfeblong/status/1339742840142872577
|
||||
|
|
@ -13,6 +13,7 @@ use url::Url;
|
|||
use crate::filter::Excludes;
|
||||
use crate::filter::Filter;
|
||||
use crate::filter::Includes;
|
||||
use crate::quirks::Quirks;
|
||||
use crate::types::{Response, Status};
|
||||
use crate::uri::Uri;
|
||||
use crate::Request;
|
||||
|
|
@ -22,11 +23,18 @@ const DEFAULT_MAX_REDIRECTS: usize = 5;
|
|||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Client {
|
||||
/// The underlying reqwest client instance that handles the HTTP requests
|
||||
reqwest_client: reqwest::Client,
|
||||
/// Github API client
|
||||
github: Option<Github>,
|
||||
/// Filtered domain handling
|
||||
filter: Filter,
|
||||
/// The default request HTTP method to use
|
||||
method: reqwest::Method,
|
||||
/// The set of accepted HTTP status codes for valid URIs
|
||||
accepted: Option<HashSet<reqwest::StatusCode>>,
|
||||
/// Override behavior for certain known issues with URIs
|
||||
quirks: Quirks,
|
||||
}
|
||||
|
||||
/// A link checker using an API token for Github links
|
||||
|
|
@ -152,10 +160,13 @@ impl ClientBuilder {
|
|||
|
||||
let filter = Filter::new(Some(includes), Some(excludes), scheme);
|
||||
|
||||
let quirks = Quirks::default();
|
||||
|
||||
Ok(Client {
|
||||
reqwest_client,
|
||||
github,
|
||||
filter,
|
||||
quirks,
|
||||
method: self.method.clone().unwrap_or(reqwest::Method::GET),
|
||||
accepted: self.accepted.clone().unwrap_or(None),
|
||||
})
|
||||
|
|
@ -171,17 +182,22 @@ impl Client {
|
|||
if self.filter.excluded(&request) {
|
||||
return Ok(Response::new(request.uri, Status::Excluded, request.source));
|
||||
}
|
||||
let status = match request.uri {
|
||||
|
||||
let status = self.check_main(&request).await?;
|
||||
Ok(Response::new(request.uri, status, request.source))
|
||||
}
|
||||
|
||||
async fn check_main(&self, request: &Request) -> Result<Status> {
|
||||
Ok(match request.uri {
|
||||
Uri::Website(ref url) => self.check_website(&url).await,
|
||||
Uri::Mail(ref address) => {
|
||||
// TODO: We should not be using a HTTP status code for mail
|
||||
match self.valid_mail(&address).await {
|
||||
match self.check_mail(&address).await {
|
||||
true => Status::Ok(http::StatusCode::OK),
|
||||
false => Status::Error(format!("Invalid mail address: {}", address), None),
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Response::new(request.uri, status, request.source))
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn check_website(&self, url: &Url) -> Status {
|
||||
|
|
@ -230,11 +246,17 @@ impl Client {
|
|||
}
|
||||
|
||||
async fn check_default(&self, url: &Url) -> Status {
|
||||
let request = self
|
||||
let request = match self
|
||||
.reqwest_client
|
||||
.request(self.method.clone(), url.as_str());
|
||||
let res = request.send().await;
|
||||
match res {
|
||||
.request(self.method.clone(), url.to_owned())
|
||||
.build()
|
||||
{
|
||||
Ok(r) => r,
|
||||
Err(e) => return e.into(),
|
||||
};
|
||||
let request = self.quirks.apply(request);
|
||||
|
||||
match self.reqwest_client.execute(request).await {
|
||||
Ok(response) => Status::new(response.status(), self.accepted.clone()),
|
||||
Err(e) => e.into(),
|
||||
}
|
||||
|
|
@ -248,7 +270,7 @@ impl Client {
|
|||
Ok((owner.as_str().into(), repo.as_str().into()))
|
||||
}
|
||||
|
||||
pub async fn valid_mail(&self, address: &str) -> bool {
|
||||
pub async fn check_mail(&self, address: &str) -> bool {
|
||||
let input = CheckEmailInput::new(vec![address.to_string()]);
|
||||
let results = check_email(&input).await;
|
||||
let result = results.get(0);
|
||||
|
|
@ -369,6 +391,20 @@ mod test {
|
|||
assert!(res.is_failure());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_youtube() {
|
||||
// This is applying a quirk. See the quirks module.
|
||||
let client: Client = ClientBuilder::default().build().unwrap();
|
||||
assert!(client.check("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7")
|
||||
.await
|
||||
.unwrap()
|
||||
.status.is_success());
|
||||
assert!(client.check("https://www.youtube.com/watch?v=invalidNlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7")
|
||||
.await
|
||||
.unwrap()
|
||||
.status.is_failure());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_non_github() {
|
||||
let template = ResponseTemplate::new(200);
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ doctest!("../README.md");
|
|||
mod client;
|
||||
mod client_pool;
|
||||
mod filter;
|
||||
mod quirks;
|
||||
mod types;
|
||||
mod uri;
|
||||
|
||||
|
|
|
|||
111
src/quirks/mod.rs
Normal file
111
src/quirks/mod.rs
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
use http::{header, Method};
|
||||
use regex::Regex;
|
||||
use reqwest::{Request, Url};
|
||||
|
||||
/// Sadly, some pages only return plaintext results when they think Googlebot is crawling them, so this user agent is sent to such pages.
|
||||
const GOOGLEBOT: &str = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://google.com/bot.html)";
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Quirk {
|
||||
pub pattern: Regex,
|
||||
pub rewrite: fn(Request) -> Request,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Quirks {
|
||||
quirks: Vec<Quirk>,
|
||||
}
|
||||
|
||||
impl Default for Quirks {
|
||||
fn default() -> Self {
|
||||
let quirks = vec![
|
||||
Quirk {
|
||||
// Twitter cut off the ability to read a tweet by fetching its
|
||||
// URL with a normal HTTP GET. Only Googlebot will get a plain
|
||||
// HTML response.
|
||||
// See https://twitter.com/zarfeblong/status/1339742840142872577
|
||||
pattern: Regex::new(r"^(https?://)?(www\.)?twitter.com").unwrap(),
|
||||
rewrite: |request| {
|
||||
let mut out = request;
|
||||
*out.method_mut() = Method::HEAD;
|
||||
out.headers_mut()
|
||||
.insert(header::USER_AGENT, GOOGLEBOT.parse().unwrap());
|
||||
out
|
||||
},
|
||||
},
|
||||
Quirk {
|
||||
// The https://www.youtube.com/oembed API will return 404 for
|
||||
// missing videos and can be used to check youtube links.
|
||||
// See https://stackoverflow.com/a/19377429/270334
|
||||
pattern: Regex::new(r"^(https?://)?(www\.)?(youtube\.com|youtu\.?be)").unwrap(),
|
||||
rewrite: |request| {
|
||||
let mut out = request;
|
||||
let original_url = out.url();
|
||||
let urlencoded: String =
|
||||
url::form_urlencoded::byte_serialize(original_url.as_str().as_bytes())
|
||||
.collect();
|
||||
let mut url = Url::parse("https://www.youtube.com/oembed").unwrap();
|
||||
url.set_query(Some(&format!("url={}", urlencoded)));
|
||||
*out.url_mut() = url;
|
||||
out
|
||||
},
|
||||
},
|
||||
];
|
||||
Self { quirks }
|
||||
}
|
||||
}
|
||||
|
||||
impl Quirks {
|
||||
/// Apply quirks to a given request. Only the first quirk regex pattern
|
||||
/// matching the URL will be applied. The rest will be discarded for
|
||||
/// simplicity reasons. This limitation might be lifted in the future.
|
||||
pub fn apply(&self, request: Request) -> Request {
|
||||
for quirk in &self.quirks {
|
||||
if quirk.pattern.is_match(request.url().as_str()) {
|
||||
return (quirk.rewrite)(request);
|
||||
}
|
||||
}
|
||||
// Request was not modified
|
||||
request
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Sends a fresh GET request for `url` through the default quirks and
    /// returns the resulting request.
    fn apply_default_quirks(url: &Url) -> Request {
        let unmodified = Request::new(Method::GET, url.clone());
        Quirks::default().apply(unmodified)
    }

    #[test]
    fn test_twitter_request() {
        let tweet =
            Url::parse("https://twitter.com/zarfeblong/status/1339742840142872577").unwrap();
        let rewritten = apply_default_quirks(&tweet);

        // The URL stays the same, while method and user agent are overridden.
        assert_eq!(rewritten.url(), &tweet);
        assert_eq!(rewritten.method(), Method::HEAD);
        assert_eq!(
            rewritten.headers().get(header::USER_AGENT).unwrap(),
            &GOOGLEBOT
        );
    }

    #[test]
    fn test_youtube_request() {
        let video = Url::parse("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").unwrap();
        let rewritten = apply_default_quirks(&video);

        // The request is redirected to the oembed endpoint; the method is kept.
        let oembed = Url::parse("https://www.youtube.com/oembed?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DNlKuICiT470%26list%3DPLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ%26index%3D7").unwrap();
        assert_eq!(rewritten.url(), &oembed);
        assert_eq!(rewritten.method(), Method::GET);
    }

    #[test]
    fn test_no_quirk_applied() {
        let plain = Url::parse("https://endler.dev").unwrap();
        let rewritten = apply_default_quirks(&plain);

        // No pattern matches, so the request comes back untouched.
        assert_eq!(rewritten.url(), &plain);
        assert_eq!(rewritten.method(), Method::GET);
    }
}
|
||||
15
tests/cli.rs
15
tests/cli.rs
|
|
@ -52,6 +52,21 @@ mod cli {
|
|||
.stdout(contains("Errors...........0"));
|
||||
}
|
||||
|
||||
#[test]
fn test_quirks() {
    let mut cmd = main_command();
    let test_quirks_path = fixtures_path().join("TEST_QUIRKS.txt");

    // Run verbosely against the quirks fixture and require a successful exit.
    let mut report = cmd
        .arg("--verbose")
        .arg(test_quirks_path)
        .assert()
        .success();

    // Each summary line must show up on stdout, checked in this order.
    let expected_lines = [
        "Total............2",
        "Excluded.........0",
        "Successful.......2",
        "Errors...........0",
    ];
    for line in expected_lines.iter() {
        report = report.stdout(contains(*line));
    }
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_failure_404_link() {
|
||||
let mut cmd = main_command();
|
||||
|
|
|
|||
Loading…
Reference in a new issue