working prototype

This commit is contained in:
Matthias Endler 2021-02-22 14:21:04 +01:00
parent 4f68f54237
commit 1e7c1709ff
3 changed files with 111 additions and 2 deletions

View file

@ -13,6 +13,7 @@ use url::Url;
use crate::filter::Excludes;
use crate::filter::Filter;
use crate::filter::Includes;
use crate::quirks::Quirks;
use crate::types::{Response, Status};
use crate::uri::Uri;
use crate::Request;
@ -22,11 +23,18 @@ const DEFAULT_MAX_REDIRECTS: usize = 5;
/// Client that checks individual requests, bundling the HTTP client together
/// with the configuration that controls how a check is performed.
#[derive(Debug, Clone)]
pub struct Client {
/// The underlying reqwest client instance that handles the HTTP requests
reqwest_client: reqwest::Client,
/// Optional GitHub API client (`None` when no such client is configured)
github: Option<Github>,
/// Filter consulted before a check to decide whether a request is excluded
filter: Filter,
/// The default request HTTP method to use
method: reqwest::Method,
/// The set of accepted HTTP status codes for valid URIs
accepted: Option<HashSet<reqwest::StatusCode>>,
/// Override behavior (extra headers, rewritten URLs) for URIs with known issues
quirks: Quirks,
}
/// A link checker using an API token for Github links
@ -152,10 +160,13 @@ impl ClientBuilder {
let filter = Filter::new(Some(includes), Some(excludes), scheme);
let quirks = Quirks::init();
Ok(Client {
reqwest_client,
github,
filter,
quirks,
method: self.method.clone().unwrap_or(reqwest::Method::GET),
accepted: self.accepted.clone().unwrap_or(None),
})
@ -171,6 +182,7 @@ impl Client {
if self.filter.excluded(&request) {
return Ok(Response::new(request.uri, Status::Excluded, request.source));
}
let status = self.check_main(&request).await?;
Ok(Response::new(request.uri, status, request.source))
}
@ -233,9 +245,27 @@ impl Client {
}
async fn check_default(&self, url: &Url) -> Status {
let request = self
let final_method = self.method.clone();
let mut final_url = url.to_owned();
let mut additional_headers = None;
let quirk = self.quirks.rewrite(url);
if let Some(quirk) = quirk {
println!("Applying quirk: {:?}", quirk);
if let Some(url_func) = quirk.url {
final_url = url_func(url.to_owned());
}
additional_headers = quirk.headers;
}
println!("Final url: {:?}", final_url);
let mut request = self
.reqwest_client
.request(self.method.clone(), url.as_str());
.request(final_method, final_url.as_str());
if let Some(headers) = additional_headers {
request = request.headers(headers);
}
let res = request.send().await;
match res {
Ok(response) => Status::new(response.status(), self.accepted.clone()),

View file

@ -42,6 +42,7 @@ doctest!("../README.md");
mod client;
mod client_pool;
mod filter;
mod quirks;
mod types;
mod uri;

78
src/quirks/mod.rs Normal file
View file

@ -0,0 +1,78 @@
use headers::HeaderMap;
use http::{header::USER_AGENT, Method};
use regex::Regex;
use reqwest::Url;
/// User-agent string that identifies the client as Googlebot.
///
/// Sadly, some pages only return plaintext results when they believe that
/// Google is crawling them, so this value is sent for such sites.
const GOOGLEBOT: &str = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://google.com/bot.html)";
// Adapted from https://github.com/bluss/maplit for HeaderMaps
//
// Builds a `headers::HeaderMap` from a list of `key => value` pairs, e.g.
// `headers!(KEY_A => value_a, KEY_B => value_b)`. A trailing comma is allowed.
macro_rules! headers {
// Internal helper: turns any token sequence into a unit value.
(@single $($x:tt)*) => (());
// Internal helper: counts the given expressions by taking the length of an
// array that holds one unit value per expression.
(@count $($rest:expr),*) => (<[()]>::len(&[$(headers!(@single $rest)),*]));
// Entry point with a trailing comma: forwards to the arm below without it.
($($key:expr => $value:expr,)+) => { headers!($($key => $value),+) };
// Main entry point: creates a map sized for the number of pairs and inserts
// every pair in order.
($($key:expr => $value:expr),*) => {
{
let _cap = headers!(@count $($key),*);
let mut _map = headers::HeaderMap::with_capacity(_cap);
$(
// The value returned by `insert` is deliberately discarded.
let _ = _map.insert($key, $value);
)*
_map
}
};
}
/// A special-case rule for URLs that need something other than the default
/// request handling.
#[derive(Debug, Clone)]
pub struct Quirk {
/// Regular expression tested against the URL string to decide whether the quirk applies
pub pattern: Regex,
/// HTTP method associated with the quirk, if any
pub method: Option<reqwest::Method>,
/// Additional request headers associated with the quirk, if any
pub headers: Option<HeaderMap>,
/// Function mapping the original URL to the URL that should be requested instead, if any
pub url: Option<fn(Url) -> Url>,
}
/// A collection of [`Quirk`] rules.
#[derive(Debug, Clone)]
pub struct Quirks {
/// The known quirks, kept in the order in which they were registered
quirks: Vec<Quirk>,
}
impl Quirks {
    /// Builds the built-in list of quirks.
    ///
    /// # Panics
    ///
    /// Panics if one of the hard-coded regular expressions, the user-agent
    /// header value, or the oEmbed base URL fails to parse. All of these are
    /// literals in this function, so a panic indicates a programming error.
    pub fn init() -> Self {
        let quirks = vec![
            Quirk {
                // The dot is escaped so that only the literal `twitter.com`
                // matches; an unescaped `.` accepts any character there.
                pattern: Regex::new(r"^(https?://)?(www\.)?twitter\.com")
                    .expect("hard-coded Twitter pattern is a valid regex"),
                method: Some(Method::HEAD),
                headers: Some(headers!(
                    USER_AGENT => GOOGLEBOT
                        .parse()
                        .expect("GOOGLEBOT is a valid header value"),
                )),
                url: None,
            },
            Quirk {
                // https://stackoverflow.com/a/19377429/270334
                pattern: Regex::new(r"^(https?://)?(www\.)?(youtube\.com|youtu\.?be)")
                    .expect("hard-coded YouTube pattern is a valid regex"),
                method: Some(Method::HEAD),
                headers: None,
                // Request YouTube's oEmbed endpoint for the video instead of
                // the original URL.
                url: Some(|orig_url: Url| {
                    let mut url = Url::parse("https://www.youtube.com/oembed")
                        .expect("hard-coded oEmbed endpoint is a valid URL");
                    // `append_pair` percent-encodes the value, so `&`, `=`
                    // and `?` inside the original URL stay part of the `url`
                    // parameter instead of leaking into the oEmbed query.
                    url.query_pairs_mut()
                        .append_pair("url", orig_url.as_str());
                    url
                }),
            },
        ];
        Self { quirks }
    }

    /// Returns a copy of the first quirk whose pattern matches `url`, or
    /// `None` when no quirk applies.
    pub fn rewrite(&self, url: &Url) -> Option<Quirk> {
        self.quirks
            .iter()
            .find(|quirk| quirk.pattern.is_match(url.as_str()))
            .cloned()
    }
}