mirror of
https://github.com/Hopiu/lychee.git
synced 2026-05-19 11:11:06 +00:00
Split up file checking into file discovery and validation of path exists
This commit is contained in:
parent
bfa3b1b6a1
commit
887f1b9589
8 changed files with 114 additions and 63 deletions
|
|
@ -20,8 +20,7 @@ use typed_builder::TypedBuilder;
|
|||
use crate::{
|
||||
filter::{Excludes, Filter, Includes},
|
||||
quirks::Quirks,
|
||||
uri::Uri,
|
||||
ErrorKind, Request, Response, Result, Status,
|
||||
ErrorKind, Request, Response, Result, Status, Uri,
|
||||
};
|
||||
|
||||
const DEFAULT_MAX_REDIRECTS: usize = 5;
|
||||
|
|
@ -178,6 +177,8 @@ impl Client {
|
|||
let Request { uri, source } = Request::try_from(request)?;
|
||||
let status = if self.filter.is_excluded(&uri) {
|
||||
Status::Excluded
|
||||
} else if uri.is_file() {
|
||||
self.check_file(&uri).await
|
||||
} else if uri.is_mail() {
|
||||
self.check_mail(&uri).await
|
||||
} else {
|
||||
|
|
@ -250,6 +251,15 @@ impl Client {
|
|||
}
|
||||
}
|
||||
|
||||
pub async fn check_file(&self, uri: &Uri) -> Status {
|
||||
if let Ok(path) = uri.inner.to_file_path() {
|
||||
if path.exists() {
|
||||
return Status::Ok(StatusCode::OK);
|
||||
}
|
||||
}
|
||||
ErrorKind::InvalidFileUri(uri.clone()).into()
|
||||
}
|
||||
|
||||
pub async fn check_mail(&self, uri: &Uri) -> Status {
|
||||
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
|
||||
let result = &(check_email(&input).await)[0];
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use crate::{extract::extract_links, uri::Uri, Base, Input, Request, Result};
|
||||
use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Collector keeps the state of link collection
|
||||
|
|
|
|||
|
|
@ -8,11 +8,12 @@ use linkify::LinkFinder;
|
|||
use log::info;
|
||||
use markup5ever_rcdom::{Handle, NodeData, RcDom};
|
||||
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::{
|
||||
fs_tree,
|
||||
fs,
|
||||
types::{FileType, InputContent},
|
||||
Base, Input, Request, Result, Uri,
|
||||
Base, ErrorKind, Input, Request, Result, Uri,
|
||||
};
|
||||
|
||||
// Use LinkFinder here to offload the actual link searching in plaintext.
|
||||
|
|
@ -113,27 +114,27 @@ pub(crate) fn extract_links(
|
|||
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
|
||||
};
|
||||
|
||||
// Only keep legit URLs. This sorts out things like anchors.
|
||||
// Silently ignore the parse failures for now.
|
||||
// Only keep legit URLs. For example this filters out anchors.
|
||||
let mut requests: HashSet<Request> = HashSet::new();
|
||||
for link in links {
|
||||
if let Ok(uri) = Uri::try_from(link.as_str()) {
|
||||
requests.insert(Request::new(uri, input_content.input.clone()));
|
||||
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
|
||||
Request::new(uri, input_content.input.clone())
|
||||
} else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
|
||||
requests.insert(Request::new(
|
||||
Uri { url: new_url },
|
||||
input_content.input.clone(),
|
||||
));
|
||||
Request::new(Uri { inner: new_url }, input_content.input.clone())
|
||||
} else if let Input::FsPath(root) = &input_content.input {
|
||||
if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base) {
|
||||
let input_content = Input::path_content(path)?;
|
||||
requests.extend(extract_links(&input_content, base)?);
|
||||
} else {
|
||||
info!("Cannot find path to {} in filesystem", &link);
|
||||
}
|
||||
let link = fs::sanitize(link);
|
||||
let path = fs::resolve(&root, &PathBuf::from(&link), base)?;
|
||||
Request::new(
|
||||
Uri {
|
||||
inner: Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?,
|
||||
},
|
||||
input_content.input.clone(),
|
||||
)
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &link);
|
||||
}
|
||||
continue;
|
||||
};
|
||||
requests.insert(req);
|
||||
}
|
||||
Ok(requests)
|
||||
}
|
||||
|
|
@ -151,10 +152,7 @@ mod test {
|
|||
use pretty_assertions::assert_eq;
|
||||
use url::Url;
|
||||
|
||||
use super::{
|
||||
extract_links, extract_links_from_html, extract_links_from_markdown,
|
||||
extract_links_from_plaintext, find_links,
|
||||
};
|
||||
use super::*;
|
||||
use crate::{
|
||||
test_utils::{mail, website},
|
||||
Uri,
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr};
|
|||
pub use excludes::Excludes;
|
||||
pub use includes::Includes;
|
||||
|
||||
use crate::uri::Uri;
|
||||
use crate::Uri;
|
||||
|
||||
/// Pre-defined exclusions for known false-positives
|
||||
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
|
||||
|
|
|
|||
|
|
@ -1,35 +1,49 @@
|
|||
use crate::{Base, ErrorKind, Result};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
pub(crate) fn find(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
|
||||
if dst.exists() {
|
||||
return Ok(dst.to_path_buf());
|
||||
}
|
||||
if dst.is_dir() {
|
||||
return Err(ErrorKind::FileNotFound(dst.into()));
|
||||
// Returns the base if it is a valid `PathBuf`
|
||||
fn get_base_dir(base: &Option<Base>) -> Option<PathBuf> {
|
||||
base.as_ref().and_then(|b| b.dir())
|
||||
}
|
||||
|
||||
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
|
||||
if dst.is_relative() {
|
||||
// Find `dst` in the parent directory of `src`
|
||||
if let Some(parent) = src.parent() {
|
||||
let rel_path = parent.join(dst.to_path_buf());
|
||||
return Ok(rel_path);
|
||||
}
|
||||
}
|
||||
if dst.is_absolute() {
|
||||
// Absolute local links (leading slash) require the base_url to
|
||||
// define the document root.
|
||||
if let Some(base_dir) = base.as_ref().and_then(|b| b.dir()) {
|
||||
let absolute = base_dir.join(dst.to_path_buf());
|
||||
if absolute.exists() {
|
||||
return Ok(absolute);
|
||||
}
|
||||
}
|
||||
}
|
||||
if dst.is_relative() {
|
||||
// Find `dst` in the `root` path
|
||||
if let Some(parent) = src.parent() {
|
||||
let relative = parent.join(dst.to_path_buf());
|
||||
if relative.exists() {
|
||||
return Ok(relative);
|
||||
}
|
||||
if let Some(base_dir) = get_base_dir(base) {
|
||||
let abs_path = join(base_dir, dst);
|
||||
return Ok(abs_path);
|
||||
}
|
||||
}
|
||||
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
|
||||
}
|
||||
|
||||
// A cumbersome way to concatenate paths without checking their
|
||||
// existence on disk. See https://github.com/rust-lang/rust/issues/16507
|
||||
fn join(base: PathBuf, dst: &Path) -> PathBuf {
|
||||
let mut abs = base.into_os_string();
|
||||
let target_str = dst.as_os_str();
|
||||
abs.push(target_str);
|
||||
PathBuf::from(abs)
|
||||
}
|
||||
|
||||
/// A little helper function to remove the get parameters from a URL link.
|
||||
/// The link is not a URL but a String as that link may not have a base domain.
|
||||
pub(crate) fn sanitize(link: String) -> String {
|
||||
let path = match link.split_once('?') {
|
||||
Some((path, _params)) => path,
|
||||
None => link.as_str(),
|
||||
};
|
||||
path.to_string()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test_fs_tree {
|
||||
use std::fs::File;
|
||||
|
|
@ -37,6 +51,31 @@ mod test_fs_tree {
|
|||
use super::*;
|
||||
use crate::Result;
|
||||
|
||||
#[test]
|
||||
fn test_sanitize() {
|
||||
assert_eq!(sanitize("/".to_string()), "/".to_string());
|
||||
assert_eq!(
|
||||
sanitize("index.html?foo=bar".to_string()),
|
||||
"index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
sanitize("/index.html?foo=bar".to_string()),
|
||||
"/index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
sanitize("/index.html?foo=bar&baz=zorx?bla=blub".to_string()),
|
||||
"/index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
sanitize("https://example.org/index.html?foo=bar".to_string()),
|
||||
"https://example.org/index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
sanitize("test.png?foo=bar".to_string()),
|
||||
"test.png".to_string()
|
||||
);
|
||||
}
|
||||
|
||||
// dummy root
|
||||
// /path/to/foo.html
|
||||
#[test]
|
||||
|
|
@ -151,15 +190,4 @@ mod test_fs_tree {
|
|||
assert_eq!(find(&root, &dst, &None)?, dst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// /path/to/index.html
|
||||
// /other/path/to
|
||||
#[test]
|
||||
fn test_dst_is_dir() -> Result<()> {
|
||||
let root = PathBuf::from("/path/to/");
|
||||
let dir = tempfile::tempdir()?;
|
||||
File::create(&dir)?;
|
||||
assert!(find(&root, &dir.into_path(), &None).is_err());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -50,10 +50,9 @@ mod client;
|
|||
mod client_pool;
|
||||
/// A pool of clients, to handle concurrent checks
|
||||
pub mod collector;
|
||||
mod fs_tree;
|
||||
mod fs;
|
||||
mod quirks;
|
||||
mod types;
|
||||
mod uri;
|
||||
|
||||
/// Functionality to extract URIs from inputs
|
||||
pub mod extract;
|
||||
|
|
@ -78,6 +77,5 @@ pub use crate::{
|
|||
client_pool::ClientPool,
|
||||
collector::Collector,
|
||||
filter::{Excludes, Filter, Includes},
|
||||
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status},
|
||||
uri::Uri,
|
||||
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
|
||||
};
|
||||
|
|
|
|||
|
|
@ -10,15 +10,20 @@ use crate::Uri;
|
|||
#[derive(Debug)]
|
||||
#[non_exhaustive]
|
||||
pub enum ErrorKind {
|
||||
// TODO: maybe need to be splitted; currently first slot is Some only for reading files
|
||||
// TODO: maybe needs to be split; currently first element is `Some` only for
|
||||
// reading files
|
||||
/// Any form of I/O error occurred while reading from a given path.
|
||||
IoError(Option<PathBuf>, std::io::Error),
|
||||
/// Network error when trying to connect to an endpoint via reqwest.
|
||||
ReqwestError(reqwest::Error),
|
||||
/// Network error when trying to connect to an endpoint via hubcaps.
|
||||
HubcapsError(hubcaps::Error),
|
||||
/// The given string can not be parsed into a valid URL or e-mail address
|
||||
/// The given string can not be parsed into a valid URL, e-mail address, or file path
|
||||
UrlParseError(String, (url::ParseError, Option<fast_chemail::ParseError>)),
|
||||
/// The given URI cannot be converted to a file path
|
||||
InvalidFileUri(Uri),
|
||||
/// The given path cannot be converted to a URI
|
||||
InvalidPath(PathBuf),
|
||||
/// The given mail address is unreachable
|
||||
UnreachableEmailAddress(Uri),
|
||||
/// The given header could not be parsed.
|
||||
|
|
@ -70,10 +75,12 @@ impl Hash for ErrorKind {
|
|||
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
|
||||
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
|
||||
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
|
||||
Self::InvalidFileUri(u) => u.hash(state),
|
||||
Self::InvalidPath(p) => p.hash(state),
|
||||
Self::UnreachableEmailAddress(u) => u.hash(state),
|
||||
Self::InvalidHeader(e) => e.to_string().hash(state),
|
||||
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
|
||||
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
|
||||
ErrorKind::InvalidBase(base, e) => (base, e).hash(state),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -101,6 +108,8 @@ impl Display for ErrorKind {
|
|||
Self::UrlParseError(s, (url_err, None)) => {
|
||||
write!(f, "Cannot parse {} as website url ({})", s, url_err)
|
||||
}
|
||||
Self::InvalidFileUri(u) => write!(f, "Invalid file URI: {}", u),
|
||||
Self::InvalidPath(p) => write!(f, "Invalid path: {}", p.display()),
|
||||
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
|
||||
Self::InvalidHeader(e) => e.fmt(f),
|
||||
Self::InvalidGlobPattern(e) => e.fmt(f),
|
||||
|
|
@ -157,6 +166,12 @@ impl From<hubcaps::errors::Error> for ErrorKind {
|
|||
}
|
||||
}
|
||||
|
||||
impl From<url::ParseError> for ErrorKind {
|
||||
fn from(e: url::ParseError) -> Self {
|
||||
Self::UrlParseError("Cannot parse URL".to_string(), (e, None))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<(String, url::ParseError)> for ErrorKind {
|
||||
fn from(value: (String, url::ParseError)) -> Self {
|
||||
Self::UrlParseError(value.0, (value.1, None))
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ mod input;
|
|||
mod request;
|
||||
mod response;
|
||||
mod status;
|
||||
mod uri;
|
||||
|
||||
pub use base::Base;
|
||||
pub use error::ErrorKind;
|
||||
|
|
@ -15,6 +16,7 @@ pub use input::{Input, InputContent};
|
|||
pub use request::Request;
|
||||
pub use response::{Response, ResponseBody};
|
||||
pub use status::Status;
|
||||
pub use uri::Uri;
|
||||
|
||||
/// The lychee `Result` type
|
||||
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;
|
||||
|
|
|
|||
Loading…
Reference in a new issue