From 1e7c1709ffd351834f5f2ff70549fa84c0613a87 Mon Sep 17 00:00:00 2001
From: Matthias Endler
Date: Mon, 22 Feb 2021 14:21:04 +0100
Subject: [PATCH] working prototype

---
Review notes (text here is ignored by `git am`):

* Line breaks, plus the `<...>` type arguments that had been stripped from
  this patch, are restored. The arguments on the `github` and `accepted`
  context lines are inferred from their doc comments - confirm them against
  the tree. The author e-mail on `From:` was also lost and must be re-added.
* `check_default` now honours `Quirk::method` and no longer prints to stdout.
* The Twitter pattern escapes its dot; the YouTube rewrite percent-encodes
  the original URL.
* The commit id and the result blob ids on the `index` lines predate these
  changes.

 src/client.rs     | 37 ++++++++++++++++++--
 src/lib.rs        |  1 +
 src/quirks/mod.rs | 88 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 2 deletions(-)
 create mode 100644 src/quirks/mod.rs

diff --git a/src/client.rs b/src/client.rs
index cb9bebc..68e16f1 100644
--- a/src/client.rs
+++ b/src/client.rs
@@ -13,6 +13,7 @@ use url::Url;
 use crate::filter::Excludes;
 use crate::filter::Filter;
 use crate::filter::Includes;
+use crate::quirks::Quirks;
 use crate::types::{Response, Status};
 use crate::uri::Uri;
 use crate::Request;
@@ -22,11 +23,18 @@ const DEFAULT_MAX_REDIRECTS: usize = 5;
 
 #[derive(Debug, Clone)]
 pub struct Client {
+    /// The underlying reqwest client instance that handles the HTTP requests
     reqwest_client: reqwest::Client,
+    /// Github API client
     github: Option<Github>,
+    /// Filtered domain handling
     filter: Filter,
+    /// The default request HTTP method to use
     method: reqwest::Method,
+    /// The set of accepted HTTP status codes for valid URIs
     accepted: Option<HashSet<reqwest::StatusCode>>,
+    /// Override behavior for certain known issues with URIs
+    quirks: Quirks,
 }
 
 /// A link checker using an API token for Github links
@@ -152,10 +160,13 @@ impl ClientBuilder {
 
         let filter = Filter::new(Some(includes), Some(excludes), scheme);
 
+        let quirks = Quirks::init();
+
         Ok(Client {
             reqwest_client,
             github,
             filter,
+            quirks,
             method: self.method.clone().unwrap_or(reqwest::Method::GET),
             accepted: self.accepted.clone().unwrap_or(None),
         })
@@ -171,6 +182,7 @@ impl Client {
         if self.filter.excluded(&request) {
             return Ok(Response::new(request.uri, Status::Excluded, request.source));
         }
+
         let status = self.check_main(&request).await?;
         Ok(Response::new(request.uri, status, request.source))
     }
@@ -233,9 +245,30 @@ impl Client {
     }
 
+    /// Checks `url` with a plain HTTP request, applying the first matching quirk.
     async fn check_default(&self, url: &Url) -> Status {
-        let request = self
+        // Start from the client defaults; a matching quirk may override each part.
+        let mut method = self.method.clone();
+        let mut target = url.clone();
+        let mut extra_headers = None;
+
+        if let Some(quirk) = self.quirks.rewrite(url) {
+            // The quirk's method must win, otherwise `Quirk::method` is dead data.
+            if let Some(quirk_method) = quirk.method {
+                method = quirk_method;
+            }
+            if let Some(rewrite_url) = quirk.url {
+                target = rewrite_url(url.clone());
+            }
+            extra_headers = quirk.headers;
+        }
+
+        let mut request = self
             .reqwest_client
-            .request(self.method.clone(), url.as_str());
+            .request(method, target.as_str());
+        if let Some(headers) = extra_headers {
+            request = request.headers(headers);
+        }
+
         let res = request.send().await;
         match res {
             Ok(response) => Status::new(response.status(), self.accepted.clone()),
diff --git a/src/lib.rs b/src/lib.rs
index 4afc504..5dfbe56 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -42,6 +42,7 @@ doctest!("../README.md");
 mod client;
 mod client_pool;
 mod filter;
+mod quirks;
 mod types;
 mod uri;
 
diff --git a/src/quirks/mod.rs b/src/quirks/mod.rs
new file mode 100644
index 0000000..e67b1ea
--- /dev/null
+++ b/src/quirks/mod.rs
@@ -0,0 +1,88 @@
+use headers::HeaderMap;
+use http::{header::USER_AGENT, Method};
+use regex::Regex;
+use reqwest::Url;
+
+/// Sadly some pages only return plaintext results if Google is trying to crawl them.
+const GOOGLEBOT: &str = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://google.com/bot.html)";
+
+// Adapted from https://github.com/bluss/maplit for HeaderMaps
+macro_rules! headers {
+    (@single $($x:tt)*) => (());
+    (@count $($rest:expr),*) => (<[()]>::len(&[$(headers!(@single $rest)),*]));
+
+    ($($key:expr => $value:expr,)+) => { headers!($($key => $value),+) };
+    ($($key:expr => $value:expr),*) => {
+        {
+            let _cap = headers!(@count $($key),*);
+            let mut _map = headers::HeaderMap::with_capacity(_cap);
+            $(
+                let _ = _map.insert($key, $value);
+            )*
+            _map
+        }
+    };
+}
+
+/// A request override applied to every URL that matches `pattern`.
+#[derive(Debug, Clone)]
+pub struct Quirk {
+    /// URLs matching this pattern trigger the quirk
+    pub pattern: Regex,
+    /// HTTP method to use instead of the client default
+    pub method: Option<Method>,
+    /// Headers to add to the outgoing request
+    pub headers: Option<HeaderMap>,
+    /// Maps the original URL to the URL that is actually requested
+    pub url: Option<fn(Url) -> Url>,
+}
+
+/// The built-in quirks, tried in declaration order.
+#[derive(Debug, Clone)]
+pub struct Quirks {
+    quirks: Vec<Quirk>,
+}
+
+impl Quirks {
+    /// Builds the list of known quirks.
+    pub fn init() -> Self {
+        let quirks = vec![
+            Quirk {
+                // The dot is escaped so that only the literal host matches.
+                pattern: Regex::new(r"^(https?://)?(www\.)?twitter\.com").unwrap(),
+                method: Some(Method::HEAD),
+                headers: Some(headers!(
+                    USER_AGENT => GOOGLEBOT
+                        .parse()
+                        .unwrap(),
+                )),
+                url: None,
+            },
+            Quirk {
+                // https://stackoverflow.com/a/19377429/270334
+                pattern: Regex::new(r"^(https?://)?(www\.)?(youtube\.com|youtu\.?be)").unwrap(),
+                method: Some(Method::HEAD),
+                headers: None,
+                url: Some(|orig_url: Url| {
+                    // Percent-encode the original URL so that its own
+                    // `&` and `#` cannot leak into the oEmbed query string.
+                    Url::parse_with_params(
+                        "https://www.youtube.com/oembed",
+                        &[("url", orig_url.as_str())],
+                    )
+                    .expect("static oEmbed endpoint is a valid URL")
+                }),
+            },
+        ];
+
+        Self { quirks }
+    }
+
+    /// Returns a copy of the first quirk whose pattern matches `url`, if any.
+    pub fn rewrite(&self, url: &Url) -> Option<Quirk> {
+        self.quirks
+            .iter()
+            .find(|quirk| quirk.pattern.is_match(url.as_str()))
+            .cloned()
+    }
+}