mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-14 18:30:59 +00:00
working prototype
This commit is contained in:
parent
4f68f54237
commit
1e7c1709ff
3 changed files with 111 additions and 2 deletions
|
|
@ -13,6 +13,7 @@ use url::Url;
|
|||
use crate::filter::Excludes;
|
||||
use crate::filter::Filter;
|
||||
use crate::filter::Includes;
|
||||
use crate::quirks::Quirks;
|
||||
use crate::types::{Response, Status};
|
||||
use crate::uri::Uri;
|
||||
use crate::Request;
|
||||
|
|
@ -22,11 +23,18 @@ const DEFAULT_MAX_REDIRECTS: usize = 5;
|
|||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Client {
|
||||
/// The underlying reqwest client instance that handles the HTTP requests
|
||||
reqwest_client: reqwest::Client,
|
||||
/// Github API client
|
||||
github: Option<Github>,
|
||||
/// Filtered domain handling
|
||||
filter: Filter,
|
||||
/// The default request HTTP method to use
|
||||
method: reqwest::Method,
|
||||
/// The set of accepted HTTP status codes for valid URIs
|
||||
accepted: Option<HashSet<reqwest::StatusCode>>,
|
||||
/// Override behavior for certain known issues with URIs
|
||||
quirks: Quirks,
|
||||
}
|
||||
|
||||
/// A link checker using an API token for Github links
|
||||
|
|
@ -152,10 +160,13 @@ impl ClientBuilder {
|
|||
|
||||
let filter = Filter::new(Some(includes), Some(excludes), scheme);
|
||||
|
||||
let quirks = Quirks::init();
|
||||
|
||||
Ok(Client {
|
||||
reqwest_client,
|
||||
github,
|
||||
filter,
|
||||
quirks,
|
||||
method: self.method.clone().unwrap_or(reqwest::Method::GET),
|
||||
accepted: self.accepted.clone().unwrap_or(None),
|
||||
})
|
||||
|
|
@ -171,6 +182,7 @@ impl Client {
|
|||
if self.filter.excluded(&request) {
|
||||
return Ok(Response::new(request.uri, Status::Excluded, request.source));
|
||||
}
|
||||
|
||||
let status = self.check_main(&request).await?;
|
||||
Ok(Response::new(request.uri, status, request.source))
|
||||
}
|
||||
|
|
@ -233,9 +245,27 @@ impl Client {
|
|||
}
|
||||
|
||||
async fn check_default(&self, url: &Url) -> Status {
|
||||
let request = self
|
||||
let final_method = self.method.clone();
|
||||
let mut final_url = url.to_owned();
|
||||
let mut additional_headers = None;
|
||||
|
||||
let quirk = self.quirks.rewrite(url);
|
||||
if let Some(quirk) = quirk {
|
||||
println!("Applying quirk: {:?}", quirk);
|
||||
if let Some(url_func) = quirk.url {
|
||||
final_url = url_func(url.to_owned());
|
||||
}
|
||||
additional_headers = quirk.headers;
|
||||
}
|
||||
|
||||
println!("Final url: {:?}", final_url);
|
||||
let mut request = self
|
||||
.reqwest_client
|
||||
.request(self.method.clone(), url.as_str());
|
||||
.request(final_method, final_url.as_str());
|
||||
if let Some(headers) = additional_headers {
|
||||
request = request.headers(headers);
|
||||
}
|
||||
|
||||
let res = request.send().await;
|
||||
match res {
|
||||
Ok(response) => Status::new(response.status(), self.accepted.clone()),
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ doctest!("../README.md");
|
|||
mod client;
|
||||
mod client_pool;
|
||||
mod filter;
|
||||
mod quirks;
|
||||
mod types;
|
||||
mod uri;
|
||||
|
||||
|
|
|
|||
78
src/quirks/mod.rs
Normal file
78
src/quirks/mod.rs
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
use headers::HeaderMap;
|
||||
use http::{header::USER_AGENT, Method};
|
||||
use regex::Regex;
|
||||
use reqwest::Url;
|
||||
|
||||
/// User agent string that identifies the client as Googlebot.
///
/// Sadly, some pages only return plaintext results if Google is trying to crawl them.
const GOOGLEBOT: &str = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://google.com/bot.html)";
|
||||
|
||||
// Adapted from https://github.com/bluss/maplit for HeaderMaps
//
// Builds a `headers::HeaderMap` from `key => value` pairs, for example
// `headers!(USER_AGENT => value)`. A trailing comma after the last pair is
// accepted.
macro_rules! headers {
    // Internal helper: swallows its tokens and yields a unit value.
    (@single $($x:tt)*) => { () };
    // Internal helper: counts the expressions via the length of a slice of units,
    // without evaluating any of them.
    (@count $($rest:expr),*) => {
        <[()]>::len(&[$(headers!(@single $rest)),*])
    };

    // Pairs with a trailing comma are forwarded to the comma-separated form.
    ($($key:expr => $value:expr,)+) => {
        headers!($($key => $value),+)
    };
    // Main form: pre-size the map for the number of pairs, then insert each one.
    ($($key:expr => $value:expr),*) => {{
        let _capacity = headers!(@count $($key),*);
        let mut _header_map = headers::HeaderMap::with_capacity(_capacity);
        $(
            let _ = _header_map.insert($key, $value);
        )*
        _header_map
    }};
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Quirk {
|
||||
pub pattern: Regex,
|
||||
pub method: Option<reqwest::Method>,
|
||||
pub headers: Option<HeaderMap>,
|
||||
pub url: Option<fn(Url) -> Url>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Quirks {
|
||||
quirks: Vec<Quirk>,
|
||||
}
|
||||
|
||||
impl Quirks {
|
||||
pub fn init() -> Self {
|
||||
let quirks = vec![
|
||||
Quirk {
|
||||
pattern: Regex::new(r"^(https?://)?(www\.)?twitter.com").unwrap(),
|
||||
method: Some(Method::HEAD),
|
||||
headers: Some(headers!(
|
||||
USER_AGENT => GOOGLEBOT
|
||||
.parse()
|
||||
.unwrap(),
|
||||
)),
|
||||
url: None,
|
||||
},
|
||||
Quirk {
|
||||
// https://stackoverflow.com/a/19377429/270334
|
||||
pattern: Regex::new(r"^(https?://)?(www\.)?(youtube\.com|youtu\.?be)").unwrap(),
|
||||
method: Some(Method::HEAD),
|
||||
headers: None,
|
||||
url: Some(|orig_url| {
|
||||
let mut url = Url::parse("https://www.youtube.com/oembed?").unwrap();
|
||||
url.set_query(Some(&format!("url={}", orig_url.as_str())));
|
||||
url
|
||||
}),
|
||||
},
|
||||
];
|
||||
|
||||
Self { quirks }
|
||||
}
|
||||
|
||||
pub fn rewrite(&self, url: &Url) -> Option<Quirk> {
|
||||
for quirk in &self.quirks {
|
||||
if quirk.pattern.is_match(url.as_str()) {
|
||||
return Some(quirk.clone());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
Loading…
Reference in a new issue