From 887f1b9589ec3cc7ab23d00e74c4986e7033c02e Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 1 Jul 2021 01:44:12 +0200 Subject: [PATCH] Split up file checking into file discovery and validation of path exists --- lychee-lib/src/client.rs | 14 ++++- lychee-lib/src/collector.rs | 2 +- lychee-lib/src/extract.rs | 40 ++++++------- lychee-lib/src/filter/mod.rs | 2 +- lychee-lib/src/{fs_tree.rs => fs.rs} | 90 ++++++++++++++++++---------- lychee-lib/src/lib.rs | 6 +- lychee-lib/src/types/error.rs | 21 ++++++- lychee-lib/src/types/mod.rs | 2 + 8 files changed, 114 insertions(+), 63 deletions(-) rename lychee-lib/src/{fs_tree.rs => fs.rs} (67%) diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index c1175ba..a69167c 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -20,8 +20,7 @@ use typed_builder::TypedBuilder; use crate::{ filter::{Excludes, Filter, Includes}, quirks::Quirks, - uri::Uri, - ErrorKind, Request, Response, Result, Status, + ErrorKind, Request, Response, Result, Status, Uri, }; const DEFAULT_MAX_REDIRECTS: usize = 5; @@ -178,6 +177,8 @@ impl Client { let Request { uri, source } = Request::try_from(request)?; let status = if self.filter.is_excluded(&uri) { Status::Excluded + } else if uri.is_file() { + self.check_file(&uri).await } else if uri.is_mail() { self.check_mail(&uri).await } else { @@ -250,6 +251,15 @@ impl Client { } } + pub async fn check_file(&self, uri: &Uri) -> Status { + if let Ok(path) = uri.inner.to_file_path() { + if path.exists() { + return Status::Ok(StatusCode::OK); + } + } + ErrorKind::InvalidFileUri(uri.clone()).into() + } + pub async fn check_mail(&self, uri: &Uri) -> Status { let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]); let result = &(check_email(&input).await)[0]; diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 712dc09..b5e69d9 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,4 +1,4 @@ -use crate::{extract::extract_links, uri::Uri, Base, Input, Request, Result}; +use crate::{extract::extract_links, Base, Input, Request, Result, Uri}; use std::collections::HashSet; /// Collector keeps the state of link collection diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 2f65c26..65b6ee8 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -8,11 +8,12 @@ use linkify::LinkFinder; use log::info; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use pulldown_cmark::{Event as MDEvent, Parser, Tag}; +use reqwest::Url; use crate::{ - fs_tree, + fs, types::{FileType, InputContent}, - Base, Input, Request, Result, Uri, + Base, ErrorKind, Input, Request, Result, Uri, }; // Use LinkFinder here to offload the actual link searching in plaintext. @@ -113,27 +114,27 @@ pub(crate) fn extract_links( FileType::Plaintext => extract_links_from_plaintext(&input_content.content), }; - // Only keep legit URLs. This sorts out things like anchors. - // Silently ignore the parse failures for now. + // Only keep legit URLs. For example this filters out anchors. let mut requests: HashSet = HashSet::new(); for link in links { - if let Ok(uri) = Uri::try_from(link.as_str()) { - requests.insert(Request::new(uri, input_content.input.clone())); + let req = if let Ok(uri) = Uri::try_from(link.as_str()) { + Request::new(uri, input_content.input.clone()) } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) { - requests.insert(Request::new( - Uri { url: new_url }, - input_content.input.clone(), - )); + Request::new(Uri { inner: new_url }, input_content.input.clone()) } else if let Input::FsPath(root) = &input_content.input { - if let Ok(path) = fs_tree::find(&root, &PathBuf::from(&link), base) { - let input_content = Input::path_content(path)?; - requests.extend(extract_links(&input_content, base)?); - } else { - info!("Cannot find path to {} in filesystem", &link); - } + let link = fs::sanitize(link); + let path = fs::resolve(&root, &PathBuf::from(&link), base)?; + Request::new( + Uri { + inner: Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?, + }, + input_content.input.clone(), + ) } else { info!("Handling of {} not implemented yet", &link); - } + continue; + }; + requests.insert(req); } Ok(requests) } @@ -151,10 +152,7 @@ mod test { use pretty_assertions::assert_eq; use url::Url; - use super::{ - extract_links, extract_links_from_html, extract_links_from_markdown, - extract_links_from_plaintext, find_links, - }; + use super::*; use crate::{ test_utils::{mail, website}, Uri, diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index f9daac8..0726aa6 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr}; pub use excludes::Excludes; pub use includes::Includes; -use crate::uri::Uri; +use crate::Uri; /// Pre-defined exclusions for known false-positives static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; diff --git a/lychee-lib/src/fs_tree.rs b/lychee-lib/src/fs.rs similarity index 67% rename from lychee-lib/src/fs_tree.rs rename to lychee-lib/src/fs.rs index 344dd1a..98255b1 100644 --- a/lychee-lib/src/fs_tree.rs +++ b/lychee-lib/src/fs.rs @@ -1,35 +1,49 @@ use crate::{Base, ErrorKind, Result}; use std::path::{Path, PathBuf}; -pub(crate) fn find(src: &Path, dst: &Path, base: &Option) -> Result { - if dst.exists() { - return Ok(dst.to_path_buf()); - } - if dst.is_dir() { - return Err(ErrorKind::FileNotFound(dst.into())); +// Returns the base if it is a valid `PathBuf` +fn get_base_dir(base: &Option) -> Option { + base.as_ref().and_then(|b| b.dir()) +} + +pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result { + if dst.is_relative() { + // Find `dst` in the parent directory of `src` + if let Some(parent) = src.parent() { + let rel_path = parent.join(dst.to_path_buf()); + return Ok(rel_path); + } } if dst.is_absolute() { // Absolute local links (leading slash) require the base_url to // define the document root. - if let Some(base_dir) = base.as_ref().and_then(|b| b.dir()) { - let absolute = base_dir.join(dst.to_path_buf()); - if absolute.exists() { - return Ok(absolute); - } - } - } - if dst.is_relative() { - // Find `dst` in the `root` path - if let Some(parent) = src.parent() { - let relative = parent.join(dst.to_path_buf()); - if relative.exists() { - return Ok(relative); - } + if let Some(base_dir) = get_base_dir(base) { + let abs_path = join(base_dir, dst); + return Ok(abs_path); } } Err(ErrorKind::FileNotFound(dst.to_path_buf())) } +// A cumbersome way to concatenate paths without checking their +// existence on disk. See https://github.com/rust-lang/rust/issues/16507 +fn join(base: PathBuf, dst: &Path) -> PathBuf { + let mut abs = base.into_os_string(); + let target_str = dst.as_os_str(); + abs.push(target_str); + PathBuf::from(abs) +} + +/// A little helper function to remove the get parameters from a URL link. +/// The link is not a URL but a String as that link may not have a base domain. +pub(crate) fn sanitize(link: String) -> String { + let path = match link.split_once('?') { + Some((path, _params)) => path, + None => link.as_str(), + }; + path.to_string() +} + #[cfg(test)] mod test_fs_tree { use std::fs::File; @@ -37,6 +51,31 @@ mod test_fs_tree { use super::*; use crate::Result; + #[test] + fn test_sanitize() { + assert_eq!(sanitize("/".to_string()), "/".to_string()); + assert_eq!( + sanitize("index.html?foo=bar".to_string()), + "index.html".to_string() + ); + assert_eq!( + sanitize("/index.html?foo=bar".to_string()), + "/index.html".to_string() + ); + assert_eq!( + sanitize("/index.html?foo=bar&baz=zorx?bla=blub".to_string()), + "/index.html".to_string() + ); + assert_eq!( + sanitize("https://example.org/index.html?foo=bar".to_string()), + "https://example.org/index.html".to_string() + ); + assert_eq!( + sanitize("test.png?foo=bar".to_string()), + "test.png".to_string() + ); + } + // dummy root // /path/to/foo.html #[test] @@ -151,15 +190,4 @@ mod test_fs_tree { assert_eq!(find(&root, &dst, &None)?, dst); Ok(()) } - - // /path/to/index.html - // /other/path/to - #[test] - fn test_dst_is_dir() -> Result<()> { - let root = PathBuf::from("/path/to/"); - let dir = tempfile::tempdir()?; - File::create(&dir)?; - assert!(find(&root, &dir.into_path(), &None).is_err()); - Ok(()) - } } diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 3d57e5d..5af79c6 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -50,10 +50,9 @@ mod client; mod client_pool; /// A pool of clients, to handle concurrent checks pub mod collector; -mod fs_tree; +mod fs; mod quirks; mod types; -mod uri; /// Functionality to extract URIs from inputs pub mod extract; @@ -78,6 +77,5 @@ pub use crate::{ client_pool::ClientPool, collector::Collector, filter::{Excludes, Filter, Includes}, - types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status}, - uri::Uri, + types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri}, }; diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 9b187cc..60cafa0 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -10,15 +10,20 @@ use crate::Uri; #[derive(Debug)] #[non_exhaustive] pub enum ErrorKind { - // TODO: maybe need to be splitted; currently first slot is Some only for reading files + // TODO: maybe needs to be split; currently first element is `Some` only for + // reading files /// Any form of I/O error occurred while reading from a given path. IoError(Option, std::io::Error), /// Network error when trying to connect to an endpoint via reqwest. ReqwestError(reqwest::Error), /// Network error when trying to connect to an endpoint via hubcaps. HubcapsError(hubcaps::Error), - /// The given string can not be parsed into a valid URL or e-mail address + /// The given string can not be parsed into a valid URL, e-mail address, or file path UrlParseError(String, (url::ParseError, Option)), + /// The given URI cannot be converted to a file path + InvalidFileUri(Uri), + /// The given path cannot be converted to a URI + InvalidPath(PathBuf), /// The given mail address is unreachable UnreachableEmailAddress(Uri), /// The given header could not be parsed. @@ -70,10 +75,12 @@ impl Hash for ErrorKind { Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state), + Self::InvalidFileUri(u) => u.hash(state), + Self::InvalidPath(p) => p.hash(state), + Self::UnreachableEmailAddress(u) => u.hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), Self::MissingGitHubToken => std::mem::discriminant(self).hash(state), - ErrorKind::InvalidBase(base, e) => (base, e).hash(state), } } } @@ -101,6 +108,8 @@ impl Display for ErrorKind { Self::UrlParseError(s, (url_err, None)) => { write!(f, "Cannot parse {} as website url ({})", s, url_err) } + Self::InvalidFileUri(u) => write!(f, "Invalid file URI: {}", u), + Self::InvalidPath(p) => write!(f, "Invalid path: {}", p.display()), Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri), Self::InvalidHeader(e) => e.fmt(f), Self::InvalidGlobPattern(e) => e.fmt(f), @@ -157,6 +166,12 @@ impl From for ErrorKind { } } +impl From for ErrorKind { + fn from(e: url::ParseError) -> Self { + Self::UrlParseError("Cannot parse URL".to_string(), (e, None)) + } +} + impl From<(String, url::ParseError)> for ErrorKind { fn from(value: (String, url::ParseError)) -> Self { Self::UrlParseError(value.0, (value.1, None)) diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 63fec72..9453d5e 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -7,6 +7,7 @@ mod input; mod request; mod response; mod status; +mod uri; pub use base::Base; pub use error::ErrorKind; @@ -15,6 +16,7 @@ pub use input::{Input, InputContent}; pub use request::Request; pub use response::{Response, ResponseBody}; pub use status::Status; +pub use uri::Uri; /// The lychee `Result` type pub type Result = std::result::Result;