Split up file checking into file discovery and validation of path exists

This commit is contained in:
Matthias 2021-07-01 01:44:12 +02:00
parent bfa3b1b6a1
commit 887f1b9589
8 changed files with 114 additions and 63 deletions

View file

@ -20,8 +20,7 @@ use typed_builder::TypedBuilder;
use crate::{
filter::{Excludes, Filter, Includes},
quirks::Quirks,
uri::Uri,
ErrorKind, Request, Response, Result, Status,
ErrorKind, Request, Response, Result, Status, Uri,
};
const DEFAULT_MAX_REDIRECTS: usize = 5;
@ -178,6 +177,8 @@ impl Client {
let Request { uri, source } = Request::try_from(request)?;
let status = if self.filter.is_excluded(&uri) {
Status::Excluded
} else if uri.is_file() {
self.check_file(&uri).await
} else if uri.is_mail() {
self.check_mail(&uri).await
} else {
@ -250,6 +251,15 @@ impl Client {
}
}
pub async fn check_file(&self, uri: &Uri) -> Status {
if let Ok(path) = uri.inner.to_file_path() {
if path.exists() {
return Status::Ok(StatusCode::OK);
}
}
ErrorKind::InvalidFileUri(uri.clone()).into()
}
pub async fn check_mail(&self, uri: &Uri) -> Status {
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
let result = &(check_email(&input).await)[0];

View file

@ -1,4 +1,4 @@
use crate::{extract::extract_links, uri::Uri, Base, Input, Request, Result};
use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
use std::collections::HashSet;
/// Collector keeps the state of link collection

View file

@ -8,11 +8,12 @@ use linkify::LinkFinder;
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use reqwest::Url;
use crate::{
fs_tree,
fs,
types::{FileType, InputContent},
Base, Input, Request, Result, Uri,
Base, ErrorKind, Input, Request, Result, Uri,
};
// Use LinkFinder here to offload the actual link searching in plaintext.
@ -113,27 +114,27 @@ pub(crate) fn extract_links(
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
// Only keep legit URLs. This sorts out things like anchors.
// Silently ignore the parse failures for now.
// Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
if let Ok(uri) = Uri::try_from(link.as_str()) {
requests.insert(Request::new(uri, input_content.input.clone()));
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
Request::new(uri, input_content.input.clone())
} else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
requests.insert(Request::new(
Uri { url: new_url },
input_content.input.clone(),
));
Request::new(Uri { inner: new_url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base) {
let input_content = Input::path_content(path)?;
requests.extend(extract_links(&input_content, base)?);
} else {
info!("Cannot find path to {} in filesystem", &link);
}
let link = fs::sanitize(link);
let path = fs::resolve(&root, &PathBuf::from(&link), base)?;
Request::new(
Uri {
inner: Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?,
},
input_content.input.clone(),
)
} else {
info!("Handling of {} not implemented yet", &link);
}
continue;
};
requests.insert(req);
}
Ok(requests)
}
@ -151,10 +152,7 @@ mod test {
use pretty_assertions::assert_eq;
use url::Url;
use super::{
extract_links, extract_links_from_html, extract_links_from_markdown,
extract_links_from_plaintext, find_links,
};
use super::*;
use crate::{
test_utils::{mail, website},
Uri,

View file

@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr};
pub use excludes::Excludes;
pub use includes::Includes;
use crate::uri::Uri;
use crate::Uri;
/// Pre-defined exclusions for known false-positives
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];

View file

@ -1,35 +1,49 @@
use crate::{Base, ErrorKind, Result};
use std::path::{Path, PathBuf};
pub(crate) fn find(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
if dst.exists() {
return Ok(dst.to_path_buf());
}
if dst.is_dir() {
return Err(ErrorKind::FileNotFound(dst.into()));
// Returns the base if it is a valid `PathBuf`
fn get_base_dir(base: &Option<Base>) -> Option<PathBuf> {
base.as_ref().and_then(|b| b.dir())
}
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
if dst.is_relative() {
// Find `dst` in the parent directory of `src`
if let Some(parent) = src.parent() {
let rel_path = parent.join(dst.to_path_buf());
return Ok(rel_path);
}
}
if dst.is_absolute() {
// Absolute local links (leading slash) require the base_url to
// define the document root.
if let Some(base_dir) = base.as_ref().and_then(|b| b.dir()) {
let absolute = base_dir.join(dst.to_path_buf());
if absolute.exists() {
return Ok(absolute);
}
}
}
if dst.is_relative() {
// Find `dst` in the `root` path
if let Some(parent) = src.parent() {
let relative = parent.join(dst.to_path_buf());
if relative.exists() {
return Ok(relative);
}
if let Some(base_dir) = get_base_dir(base) {
let abs_path = join(base_dir, dst);
return Ok(abs_path);
}
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
}
// A cumbersome way to concatenate paths without checking their
// existence on disk. See https://github.com/rust-lang/rust/issues/16507
fn join(base: PathBuf, dst: &Path) -> PathBuf {
let mut abs = base.into_os_string();
let target_str = dst.as_os_str();
abs.push(target_str);
PathBuf::from(abs)
}
/// A little helper function to remove the get parameters from a URL link.
/// The link is not a URL but a String as that link may not have a base domain.
pub(crate) fn sanitize(link: String) -> String {
let path = match link.split_once('?') {
Some((path, _params)) => path,
None => link.as_str(),
};
path.to_string()
}
#[cfg(test)]
mod test_fs_tree {
use std::fs::File;
@ -37,6 +51,31 @@ mod test_fs_tree {
use super::*;
use crate::Result;
#[test]
fn test_sanitize() {
assert_eq!(sanitize("/".to_string()), "/".to_string());
assert_eq!(
sanitize("index.html?foo=bar".to_string()),
"index.html".to_string()
);
assert_eq!(
sanitize("/index.html?foo=bar".to_string()),
"/index.html".to_string()
);
assert_eq!(
sanitize("/index.html?foo=bar&baz=zorx?bla=blub".to_string()),
"/index.html".to_string()
);
assert_eq!(
sanitize("https://example.org/index.html?foo=bar".to_string()),
"https://example.org/index.html".to_string()
);
assert_eq!(
sanitize("test.png?foo=bar".to_string()),
"test.png".to_string()
);
}
// dummy root
// /path/to/foo.html
#[test]
@ -151,15 +190,4 @@ mod test_fs_tree {
assert_eq!(find(&root, &dst, &None)?, dst);
Ok(())
}
// /path/to/index.html
// /other/path/to
#[test]
fn test_dst_is_dir() -> Result<()> {
let root = PathBuf::from("/path/to/");
let dir = tempfile::tempdir()?;
File::create(&dir)?;
assert!(find(&root, &dir.into_path(), &None).is_err());
Ok(())
}
}

View file

@ -50,10 +50,9 @@ mod client;
mod client_pool;
/// A pool of clients, to handle concurrent checks
pub mod collector;
mod fs_tree;
mod fs;
mod quirks;
mod types;
mod uri;
/// Functionality to extract URIs from inputs
pub mod extract;
@ -78,6 +77,5 @@ pub use crate::{
client_pool::ClientPool,
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status},
uri::Uri,
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
};

View file

@ -10,15 +10,20 @@ use crate::Uri;
#[derive(Debug)]
#[non_exhaustive]
pub enum ErrorKind {
// TODO: maybe need to be splitted; currently first slot is Some only for reading files
// TODO: maybe needs to be split; currently first element is `Some` only for
// reading files
/// Any form of I/O error occurred while reading from a given path.
IoError(Option<PathBuf>, std::io::Error),
/// Network error when trying to connect to an endpoint via reqwest.
ReqwestError(reqwest::Error),
/// Network error when trying to connect to an endpoint via hubcaps.
HubcapsError(hubcaps::Error),
/// The given string can not be parsed into a valid URL or e-mail address
/// The given string can not be parsed into a valid URL, e-mail address, or file path
UrlParseError(String, (url::ParseError, Option<fast_chemail::ParseError>)),
/// The given URI cannot be converted to a file path
InvalidFileUri(Uri),
/// The given path cannot be converted to a URI
InvalidPath(PathBuf),
/// The given mail address is unreachable
UnreachableEmailAddress(Uri),
/// The given header could not be parsed.
@ -70,10 +75,12 @@ impl Hash for ErrorKind {
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
Self::InvalidFileUri(u) => u.hash(state),
Self::InvalidPath(p) => p.hash(state),
Self::UnreachableEmailAddress(u) => u.hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
ErrorKind::InvalidBase(base, e) => (base, e).hash(state),
}
}
}
@ -101,6 +108,8 @@ impl Display for ErrorKind {
Self::UrlParseError(s, (url_err, None)) => {
write!(f, "Cannot parse {} as website url ({})", s, url_err)
}
Self::InvalidFileUri(u) => write!(f, "Invalid file URI: {}", u),
Self::InvalidPath(p) => write!(f, "Invalid path: {}", p.display()),
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
Self::InvalidHeader(e) => e.fmt(f),
Self::InvalidGlobPattern(e) => e.fmt(f),
@ -157,6 +166,12 @@ impl From<hubcaps::errors::Error> for ErrorKind {
}
}
impl From<url::ParseError> for ErrorKind {
fn from(e: url::ParseError) -> Self {
Self::UrlParseError("Cannot parse URL".to_string(), (e, None))
}
}
impl From<(String, url::ParseError)> for ErrorKind {
fn from(value: (String, url::ParseError)) -> Self {
Self::UrlParseError(value.0, (value.1, None))

View file

@ -7,6 +7,7 @@ mod input;
mod request;
mod response;
mod status;
mod uri;
pub use base::Base;
pub use error::ErrorKind;
@ -15,6 +16,7 @@ pub use input::{Input, InputContent};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
pub use uri::Uri;
/// The lychee `Result` type
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;