Add support for website quirks

This commit is contained in:
Matthias 2021-03-29 00:42:25 +02:00 committed by GitHub
commit bcb3933b22
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 174 additions and 9 deletions

2
fixtures/TEST_QUIRKS.txt Normal file
View file

@ -0,0 +1,2 @@
https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7
https://twitter.com/zarfeblong/status/1339742840142872577

View file

@ -13,6 +13,7 @@ use url::Url;
use crate::filter::Excludes;
use crate::filter::Filter;
use crate::filter::Includes;
use crate::quirks::Quirks;
use crate::types::{Response, Status};
use crate::uri::Uri;
use crate::Request;
@ -22,11 +23,18 @@ const DEFAULT_MAX_REDIRECTS: usize = 5;
#[derive(Debug, Clone)]
pub struct Client {
/// The underlying reqwest client instance that performs the HTTP requests
reqwest_client: reqwest::Client,
/// Optional GitHub API client (`None` when no such client is available)
github: Option<Github>,
/// Filter deciding which requests are excluded from checking
filter: Filter,
/// The default HTTP method used for requests
method: reqwest::Method,
/// The set of HTTP status codes accepted as valid; `None` means no explicit set was configured
accepted: Option<HashSet<reqwest::StatusCode>>,
/// Request rewrites that work around known issues with certain URIs
quirks: Quirks,
}
/// A link checker using an API token for Github links
@ -152,10 +160,13 @@ impl ClientBuilder {
let filter = Filter::new(Some(includes), Some(excludes), scheme);
let quirks = Quirks::default();
Ok(Client {
reqwest_client,
github,
filter,
quirks,
method: self.method.clone().unwrap_or(reqwest::Method::GET),
accepted: self.accepted.clone().unwrap_or(None),
})
@ -171,17 +182,22 @@ impl Client {
if self.filter.excluded(&request) {
return Ok(Response::new(request.uri, Status::Excluded, request.source));
}
let status = match request.uri {
let status = self.check_main(&request).await?;
Ok(Response::new(request.uri, status, request.source))
}
async fn check_main(&self, request: &Request) -> Result<Status> {
Ok(match request.uri {
Uri::Website(ref url) => self.check_website(&url).await,
Uri::Mail(ref address) => {
// TODO: We should not be using a HTTP status code for mail
match self.valid_mail(&address).await {
match self.check_mail(&address).await {
true => Status::Ok(http::StatusCode::OK),
false => Status::Error(format!("Invalid mail address: {}", address), None),
}
}
};
Ok(Response::new(request.uri, status, request.source))
})
}
pub async fn check_website(&self, url: &Url) -> Status {
@ -230,11 +246,17 @@ impl Client {
}
async fn check_default(&self, url: &Url) -> Status {
let request = self
let request = match self
.reqwest_client
.request(self.method.clone(), url.as_str());
let res = request.send().await;
match res {
.request(self.method.clone(), url.to_owned())
.build()
{
Ok(r) => r,
Err(e) => return e.into(),
};
let request = self.quirks.apply(request);
match self.reqwest_client.execute(request).await {
Ok(response) => Status::new(response.status(), self.accepted.clone()),
Err(e) => e.into(),
}
@ -248,7 +270,7 @@ impl Client {
Ok((owner.as_str().into(), repo.as_str().into()))
}
pub async fn valid_mail(&self, address: &str) -> bool {
pub async fn check_mail(&self, address: &str) -> bool {
let input = CheckEmailInput::new(vec![address.to_string()]);
let results = check_email(&input).await;
let result = results.get(0);
@ -369,6 +391,20 @@ mod test {
assert!(res.is_failure());
}
#[tokio::test]
async fn test_youtube() {
    // Both requests below are rewritten by a quirk; see the quirks module.
    let existing_video = "https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7";
    let missing_video = "https://www.youtube.com/watch?v=invalidNlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7";

    let client: Client = ClientBuilder::default().build().unwrap();

    // A real video id must be reported as reachable ...
    let found = client.check(existing_video).await.unwrap();
    assert!(found.status.is_success());

    // ... while a bogus video id must be reported as a failure.
    let not_found = client.check(missing_video).await.unwrap();
    assert!(not_found.status.is_failure());
}
#[tokio::test]
async fn test_non_github() {
let template = ResponseTemplate::new(200);

View file

@ -42,6 +42,7 @@ doctest!("../README.md");
mod client;
mod client_pool;
mod filter;
mod quirks;
mod types;
mod uri;

111
src/quirks/mod.rs Normal file
View file

@ -0,0 +1,111 @@
use http::{header, Method};
use regex::Regex;
use reqwest::{Request, Url};
/// User-agent string that identifies the client as Googlebot.
///
/// Sadly, some pages only return plaintext results if Google is trying to crawl them.
const GOOGLEBOT: &str = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://google.com/bot.html)";
/// A single site-specific override: a URL pattern together with the request
/// rewrite to run when that pattern matches.
#[derive(Debug, Clone)]
pub struct Quirk {
/// Regular expression that is tested against the request URL.
pub pattern: Regex,
/// Function that takes ownership of the request and returns the rewritten one.
pub rewrite: fn(Request) -> Request,
}
/// An ordered collection of [`Quirk`]s.
#[derive(Debug, Clone)]
pub struct Quirks {
/// The quirks, in the order in which they are tried.
quirks: Vec<Quirk>,
}
impl Default for Quirks {
    /// Builds the built-in quirk table.
    ///
    /// Two quirks are registered, in this order:
    ///
    /// 1. Twitter: the request is turned into a `HEAD` request carrying the
    ///    Googlebot user agent; the URL is left untouched.
    /// 2. YouTube: the request URL is replaced by the oEmbed endpoint with the
    ///    original URL passed (percent-encoded) as the `url` query parameter.
    ///
    /// # Panics
    ///
    /// Panics only if one of the hard-coded patterns, URLs or header values is
    /// invalid, which would be a programming error in this function.
    fn default() -> Self {
        let quirks = vec![
            Quirk {
                // Twitter cut off the ability to read a tweet by fetching its
                // URL with a normal HTTP GET. Only Googlebot will get a plain
                // HTML response.
                // See https://twitter.com/zarfeblong/status/1339742840142872577
                //
                // The dot before `com` is escaped so that it only matches a
                // literal `.`; left unescaped it matches any character and the
                // quirk would also fire for hosts such as `twitterxcom...`.
                pattern: Regex::new(r"^(https?://)?(www\.)?twitter\.com")
                    .expect("hard-coded Twitter pattern is a valid regex"),
                rewrite: |request| {
                    let mut out = request;
                    *out.method_mut() = Method::HEAD;
                    // `from_static` avoids parsing the constant at runtime.
                    out.headers_mut().insert(
                        header::USER_AGENT,
                        header::HeaderValue::from_static(GOOGLEBOT),
                    );
                    out
                },
            },
            Quirk {
                // The https://www.youtube.com/oembed API will return 404 for
                // missing videos and can be used to check youtube links.
                // See https://stackoverflow.com/a/19377429/270334
                //
                // NOTE(review): `youtu\.?be` also matches the bare prefix
                // `youtube`, so any host starting with `youtube` is rewritten,
                // not only `youtube.com` and `youtu.be` - confirm this is
                // intended before tightening the pattern.
                pattern: Regex::new(r"^(https?://)?(www\.)?(youtube\.com|youtu\.?be)")
                    .expect("hard-coded YouTube pattern is a valid regex"),
                rewrite: |request| {
                    let mut out = request;
                    let original_url = out.url();
                    // Percent-encode the complete original URL so it can be
                    // embedded as a single query parameter value.
                    let urlencoded: String =
                        url::form_urlencoded::byte_serialize(original_url.as_str().as_bytes())
                            .collect();
                    let mut url = Url::parse("https://www.youtube.com/oembed")
                        .expect("hard-coded oEmbed endpoint is a valid URL");
                    url.set_query(Some(&format!("url={}", urlencoded)));
                    *out.url_mut() = url;
                    out
                },
            },
        ];
        Self { quirks }
    }
}
impl Quirks {
    /// Rewrites `request` according to the first quirk whose pattern matches
    /// the request URL.
    ///
    /// At most one quirk is ever applied: the quirks are tried in order and
    /// the search stops at the first match. This keeps things simple and might
    /// be relaxed in the future. When no pattern matches, the request is
    /// returned unchanged.
    pub fn apply(&self, request: Request) -> Request {
        let first_match = self
            .quirks
            .iter()
            .find(|candidate| candidate.pattern.is_match(request.url().as_str()));

        match first_match {
            Some(quirk) => (quirk.rewrite)(request),
            // No quirk is registered for this URL.
            None => request,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Sends a fresh `GET` request for `url` through the default quirk set
    /// and returns the resulting request.
    fn rewritten(url: Url) -> Request {
        Quirks::default().apply(Request::new(Method::GET, url))
    }

    #[test]
    fn test_twitter_request() {
        let tweet =
            Url::parse("https://twitter.com/zarfeblong/status/1339742840142872577").unwrap();

        let modified = rewritten(tweet.clone());

        // The URL stays as it is; only the method and user agent change.
        assert_eq!(modified.url(), &tweet);
        assert_eq!(modified.method(), Method::HEAD);
        assert_eq!(
            modified.headers().get(header::USER_AGENT).unwrap(),
            &GOOGLEBOT
        );
    }

    #[test]
    fn test_youtube_request() {
        let video = Url::parse("https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7").unwrap();
        let expected_url = Url::parse("https://www.youtube.com/oembed?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DNlKuICiT470%26list%3DPLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ%26index%3D7").unwrap();

        let modified = rewritten(video);

        // The request now targets the oEmbed endpoint; the method is untouched.
        assert_eq!(modified.url(), &expected_url);
        assert_eq!(modified.method(), Method::GET);
    }

    #[test]
    fn test_no_quirk_applied() {
        let plain = Url::parse("https://endler.dev").unwrap();

        let modified = rewritten(plain.clone());

        // Without a matching quirk the request passes through unchanged.
        assert_eq!(modified.url(), &plain);
        assert_eq!(modified.method(), Method::GET);
    }
}

View file

@ -52,6 +52,21 @@ mod cli {
.stdout(contains("Errors...........0"));
}
#[test]
fn test_quirks() {
    // Run the binary in verbose mode against the quirks fixture file.
    let fixture = fixtures_path().join("TEST_QUIRKS.txt");
    let mut cmd = main_command();
    let run = cmd.arg("--verbose").arg(fixture).assert().success();

    // Both links in the fixture have to be checked successfully.
    run.stdout(contains("Total............2"))
        .stdout(contains("Excluded.........0"))
        .stdout(contains("Successful.......2"))
        .stdout(contains("Errors...........0"));
}
#[tokio::test]
async fn test_failure_404_link() {
let mut cmd = main_command();