-impl<P: AsRef<Path>> From<P> for FileType {
- /// Detect if the given path points to a Markdown, HTML, or plaintext file.
- fn from(p: P) -> FileType {
- let path = p.as_ref();
- // Assume HTML in case of no extension.
- // Note: this is only reasonable for URLs; not paths on disk.
- // For example, `README` without an extension is more likely to be a plaintext file.
- // A better solution would be to also implement `From<Url> for FileType`.
- // Unfortunately that's not possible without refactoring, as
- // `AsRef<Path>` could be implemented for `Url` in the future, which is why
- // `From<Url> for FileType` is not allowed.
- match path.extension().and_then(std::ffi::OsStr::to_str) {
- Some("md" | "markdown") => FileType::Markdown,
- Some("htm" | "html") | None => FileType::Html,
- Some(_) => FileType::Plaintext,
- }
- }
-}
-
-// Use LinkFinder here to offload the actual link searching in plaintext.
-fn find_links(input: &str) -> Vec<linkify::Link> {
- let finder = LinkFinder::new();
- finder.links(input).collect()
-}
-
-/// Extract unparsed URL strings from a markdown string.
+/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec<String> {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
- MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
- vec![url.to_string()]
- }
+ MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => vec![url.to_string()],
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
@@ -68,15 +64,15 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
.collect()
}
-/// Extract unparsed URL strings from a HTML string.
+/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec<String> {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
- // we pass mutable urls reference to avoid extra allocations in each
- // recursive descent
+ // We pass mutable URL references here to avoid
+ // extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
@@ -101,7 +97,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
for attr in attrs.borrow().iter() {
let attr_value = attr.value.to_string();
- if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
+ if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr_value);
} else {
urls.append(&mut extract_links_from_plaintext(&attr_value));
@@ -113,56 +109,34 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
}
// recursively traverse the document's nodes -- this doesn't need any extra
- // exit conditions because the document is a tree
+ // exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
}
}
-/// Determine if element's attribute contains a link / URL.
-fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
- // See a comprehensive list of attributes that might contain URLs/URIs
- // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
- matches!(
- (attr_name, elem_name),
- ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
- )
-}
-
-/// Extract unparsed URL strings from a plaintext.
+/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
- find_links(input)
+ url::find_links(input)
.iter()
.map(|l| String::from(l.as_str()))
.collect()
}
-pub(crate) fn extract_links(
- input_content: &InputContent,
- base_url: &Option<Url>,
-) -> HashSet<Request> {
- let links = match input_content.file_type {
- FileType::Markdown => extract_links_from_markdown(&input_content.content),
- FileType::Html => extract_links_from_html(&input_content.content),
- FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
- };
-
- // Only keep legit URLs. This sorts out things like anchors.
- // Silently ignore the parse failures for now.
- let mut requests: HashSet<Request> = HashSet::new();
- for link in links {
- if let Ok(uri) = Uri::try_from(link.as_str()) {
- requests.insert(Request::new(uri, input_content.input.clone()));
- } else if !Path::new(&link).exists() {
- if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
- requests.insert(Request::new(
- Uri { url: new_url },
- input_content.input.clone(),
- ));
- }
- };
- }
- requests
+fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Url> {
+ let dst = url::remove_get_params_and_fragment(dst);
+ // Avoid double-encoding already encoded destination paths by removing any
+ // potential encoding (e.g. `web%20site` becomes `web site`).
+ // That's because Url::from_file_path will encode the full URL in the end.
+ // This behavior cannot be configured.
+ // See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
+ // TODO: This is not a perfect solution.
+ // Ideally, only `src` and `base` should be URL encoded (as is done by
+ // `from_file_path` at the moment) while `dst` is left untouched and simply
+ // appended to the end.
+ let decoded = percent_decode_str(dst).decode_utf8()?.to_string();
+ let path = path::resolve(src, &PathBuf::from(decoded), base)?;
+ Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidUrl(path))
}
#[cfg(test)]
@@ -176,17 +150,24 @@ mod test {
};
use pretty_assertions::assert_eq;
- use url::Url;
- use super::{
- extract_links, extract_links_from_html, extract_links_from_markdown,
- extract_links_from_plaintext, find_links, FileType,
- };
+ use super::*;
use crate::{
- collector::InputContent,
+ helpers::url::find_links,
test_utils::{mail, website},
Uri,
};
+ use crate::{
+ types::{FileType, InputContent},
+ Base,
+ };
+
+ #[test]
+ fn test_create_uri_from_path() {
+ let result =
+ create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap();
+ assert_eq!(result.as_str(), "file:///test+encoding");
+ }
fn load_fixture(filename: &str) -> String {
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
@@ -207,13 +188,13 @@ mod test {
}
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
- extract_links(
- &InputContent::from_string(input, file_type),
- &base_url.map(|u| Url::parse(u).unwrap()),
- )
- .into_iter()
- .map(|r| r.uri)
- .collect()
+ let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
+ extract_links(&InputContent::from_string(input, file_type), &base)
+ // unwrap is fine here as this helper function is only used in tests
+ .unwrap()
+ .into_iter()
+ .map(|r| r.uri)
+ .collect()
}
#[test]
diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs
index f9daac8..0726aa6 100644
--- a/lychee-lib/src/filter/mod.rs
+++ b/lychee-lib/src/filter/mod.rs
@@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr};
pub use excludes::Excludes;
pub use includes::Includes;
-use crate::uri::Uri;
+use crate::Uri;
/// Pre-defined exclusions for known false-positives
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
diff --git a/lychee-lib/src/helpers/mod.rs b/lychee-lib/src/helpers/mod.rs
new file mode 100644
index 0000000..94f2d21
--- /dev/null
+++ b/lychee-lib/src/helpers/mod.rs
@@ -0,0 +1,2 @@
+pub(crate) mod path;
+pub(crate) mod url;
diff --git a/lychee-lib/src/helpers/path.rs b/lychee-lib/src/helpers/path.rs
new file mode 100644
index 0000000..b31d522
--- /dev/null
+++ b/lychee-lib/src/helpers/path.rs
@@ -0,0 +1,141 @@
+use crate::{Base, ErrorKind, Result};
+use path_clean::PathClean;
+use std::env;
+use std::path::{Path, PathBuf};
+
+// Returns the base if it is a valid `PathBuf`
+fn get_base_dir(base: &Option<Base>) -> Option<PathBuf> {
+ base.as_ref().and_then(Base::dir)
+}
+
+// https://stackoverflow.com/a/54817755/270334
+pub(crate) fn absolute_path(path: impl AsRef<Path>) -> Result<PathBuf> {
+ let path = path.as_ref();
+
+ let absolute_path = if path.is_absolute() {
+ path.to_path_buf()
+ } else {
+ env::current_dir()?.join(path)
+ }
+ .clean();
+
+ Ok(absolute_path)
+}
+
+// Get the parent directory of a given `Path`.
+fn dirname(src: &Path) -> PathBuf {
+ if src.is_file() {
+ src.to_path_buf()
+ .parent()
+ .map_or(PathBuf::new(), Path::to_path_buf)
+ } else {
+ src.to_path_buf()
+ }
+}
+
+// Resolve `dst` that was linked to from within `src`
+pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
+ if dst.is_relative() {
+ // Find `dst` in the parent directory of `src`
+ if let Some(parent) = src.parent() {
+ let rel_path = parent.join(dst.to_path_buf());
+ return absolute_path(&rel_path);
+ }
+ }
+ if dst.is_absolute() {
+ // Absolute local links (leading slash) require the `base_url` to
+ // define the document root.
+ let base = get_base_dir(base).ok_or_else(|| {
+ ErrorKind::InvalidBase(
+ "".to_string(),
+ format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
+ )
+ })?;
+ let abs_path = join(dirname(&base), dst);
+ return absolute_path(&abs_path);
+ }
+ Err(ErrorKind::FileNotFound(dst.to_path_buf()))
+}
+
+// A cumbersome way to concatenate paths without checking their
+// existence on disk. See https://github.com/rust-lang/rust/issues/16507
+fn join(base: PathBuf, dst: &Path) -> PathBuf {
+ let mut abs = base.into_os_string();
+ let target_str = dst.as_os_str();
+ abs.push(target_str);
+ PathBuf::from(abs)
+}
+
+#[cfg(test)]
+mod test_path {
+ use super::*;
+ use crate::Result;
+
+ // index.html
+ // ./foo.html
+ #[test]
+ fn test_resolve_relative() -> Result<()> {
+ let dummy = PathBuf::from("index.html");
+ let abs_path = PathBuf::from("./foo.html");
+ assert_eq!(
+ resolve(&dummy, &abs_path, &None)?,
+ env::current_dir()?.join("foo.html")
+ );
+ Ok(())
+ }
+
+ // ./index.html
+ // ./foo.html
+ #[test]
+ fn test_resolve_relative_index() -> Result<()> {
+ let dummy = PathBuf::from("./index.html");
+ let abs_path = PathBuf::from("./foo.html");
+ assert_eq!(
+ resolve(&dummy, &abs_path, &None)?,
+ env::current_dir()?.join("foo.html")
+ );
+ Ok(())
+ }
+
+ // /path/to/index.html
+ // ./foo.html
+ #[test]
+ fn test_resolve_from_absolute() -> Result<()> {
+ let abs_index = PathBuf::from("/path/to/index.html");
+ let abs_path = PathBuf::from("./foo.html");
+ assert_eq!(
+ resolve(&abs_index, &abs_path, &None)?,
+ PathBuf::from("/path/to/foo.html")
+ );
+ Ok(())
+ }
+
+ // dummy
+ // foo.html
+ // valid base dir
+ #[test]
+ fn test_resolve_absolute_from_base_dir() -> Result<()> {
+ let dummy = PathBuf::new();
+ let abs_path = PathBuf::from("/foo.html");
+ let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
+ assert_eq!(
+ resolve(&dummy, &abs_path, &base)?,
+ PathBuf::from("/some/absolute/base/dir/foo.html")
+ );
+ Ok(())
+ }
+
+ // /path/to/index.html
+ // /other/path/to/foo.html
+ #[test]
+ fn test_resolve_absolute_from_absolute() -> Result<()> {
+ let abs_index = PathBuf::from("/path/to/index.html");
+ let abs_path = PathBuf::from("/other/path/to/foo.html");
+ let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
+ assert_eq!(
+ resolve(&abs_index, &abs_path, &base)?,
+ PathBuf::from("/some/absolute/base/dir/other/path/to/foo.html")
+ );
+ Ok(())
+ }
+}
diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs
new file mode 100644
index 0000000..712d6f8
--- /dev/null
+++ b/lychee-lib/src/helpers/url.rs
@@ -0,0 +1,93 @@
+use linkify::LinkFinder;
+
+/// Remove all GET parameters from a URL.
+/// The link is not a URL but a String as it may not have a base domain.
+pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
+ let path = match url.split_once('#') {
+ Some((path_without_fragment, _fragment)) => path_without_fragment,
+ None => url,
+ };
+ let path = match path.split_once('?') {
+ Some((path_without_params, _params)) => path_without_params,
+ None => path,
+ };
+ path
+}
+
+/// Determine if an element's attribute contains a link / URL.
+pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
+ // See a comprehensive list of attributes that might contain URLs/URIs
+ // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
+ matches!(
+ (attr_name, elem_name),
+ ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
+ )
+}
+
+// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
+pub(crate) fn is_anchor(url: &str) -> bool {
+ url.starts_with('#')
+}
+
+// Use `LinkFinder` to offload the raw link searching in plaintext
+pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
+ let finder = LinkFinder::new();
+ finder.links(input).collect()
+}
+
+#[cfg(test)]
+mod test_fs_tree {
+ use super::*;
+
+ #[test]
+ fn test_is_anchor() {
+ assert!(is_anchor("#anchor"));
+ assert!(!is_anchor("notan#anchor"));
+ }
+
+ #[test]
+ fn test_remove_get_params_and_fragment() {
+ assert_eq!(remove_get_params_and_fragment("/"), "/");
+ assert_eq!(
+ remove_get_params_and_fragment("index.html?foo=bar"),
+ "index.html"
+ );
+ assert_eq!(
+ remove_get_params_and_fragment("/index.html?foo=bar"),
+ "/index.html"
+ );
+ assert_eq!(
+ remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
+ "/index.html"
+ );
+ assert_eq!(
+ remove_get_params_and_fragment("https://example.org/index.html?foo=bar"),
+ "https://example.org/index.html"
+ );
+ assert_eq!(
+ remove_get_params_and_fragment("test.png?foo=bar"),
+ "test.png"
+ );
+
+ assert_eq!(
+ remove_get_params_and_fragment("https://example.org/index.html#anchor"),
+ "https://example.org/index.html"
+ );
+ assert_eq!(
+ remove_get_params_and_fragment("https://example.org/index.html?foo=bar#anchor"),
+ "https://example.org/index.html"
+ );
+ assert_eq!(
+ remove_get_params_and_fragment("test.png?foo=bar#anchor"),
+ "test.png"
+ );
+ assert_eq!(
+ remove_get_params_and_fragment("test.png#anchor?anchor!?"),
+ "test.png"
+ );
+ assert_eq!(
+ remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"),
+ "test.png"
+ );
+ }
+}
diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs
index 71ba9d6..22b76f8 100644
--- a/lychee-lib/src/lib.rs
+++ b/lychee-lib/src/lib.rs
@@ -41,18 +41,18 @@
)]
#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)]
#![deny(missing_docs)]
+#![allow(clippy::module_name_repetitions)]
#[cfg(doctest)]
doc_comment::doctest!("../../README.md");
mod client;
mod client_pool;
-mod quirks;
-mod types;
-mod uri;
-
/// A pool of clients, to handle concurrent checks
pub mod collector;
+mod helpers;
+mod quirks;
+mod types;
/// Functionality to extract URIs from inputs
pub mod extract;
@@ -75,8 +75,7 @@ use ring as _; // required for apple silicon
pub use crate::{
client::{check, ClientBuilder},
client_pool::ClientPool,
- collector::{Collector, Input},
+ collector::Collector,
filter::{Excludes, Filter, Includes},
- types::{ErrorKind, Request, Response, ResponseBody, Result, Status},
- uri::Uri,
+ types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
};
diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs
new file mode 100644
index 0000000..affeacc
--- /dev/null
+++ b/lychee-lib/src/types/base.rs
@@ -0,0 +1,83 @@
+use reqwest::Url;
+use serde::{Deserialize, Serialize};
+use std::{convert::TryFrom, path::PathBuf};
+
+use crate::ErrorKind;
+
+/// When encountering links without a full domain in a document,
+/// the base determines where this resource can be found.
+/// Both, local and remote targets are supported.
+#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
+#[allow(variant_size_differences)]
+pub enum Base {
+ /// Local file path pointing to root directory
+ Local(PathBuf),
+ /// Remote URL pointing to a website homepage
+ Remote(Url),
+}
+
+impl Base {
+ /// Join link with base url
+ #[must_use]
+ pub fn join(&self, link: &str) -> Option<Url> {
+ match self {
+ Self::Remote(url) => url.join(link).ok(),
+ Self::Local(_) => None,
+ }
+ }
+
+ /// Return the directory if the base is local
+ #[must_use]
+ pub fn dir(&self) -> Option<PathBuf> {
+ match self {
+ Self::Remote(_) => None,
+ Self::Local(d) => Some(d.clone()),
+ }
+ }
+}
+
+impl TryFrom<&str> for Base {
+ type Error = ErrorKind;
+
+ fn try_from(value: &str) -> Result<Self, Self::Error> {
+ if let Ok(url) = Url::parse(value) {
+ if url.cannot_be_a_base() {
+ return Err(ErrorKind::InvalidBase(
+ value.to_string(),
+ "The given URL cannot be a base".to_string(),
+ ));
+ }
+ return Ok(Self::Remote(url));
+ }
+ Ok(Self::Local(PathBuf::from(value)))
+ }
+}
+
+#[cfg(test)]
+mod test_base {
+ use crate::Result;
+
+ use super::*;
+
+ #[test]
+ fn test_valid_remote() -> Result<()> {
+ let base = Base::try_from("https://endler.dev")?;
+ assert_eq!(
+ base,
+ Base::Remote(Url::parse("https://endler.dev").unwrap())
+ );
+ Ok(())
+ }
+
+ #[test]
+ fn test_invalid_url() {
+ assert!(Base::try_from("data:text/plain,Hello?World#").is_err());
+ }
+
+ #[test]
+ fn test_valid_local() -> Result<()> {
+ let dir = tempfile::tempdir()?;
+ Base::try_from(dir.as_ref().to_str().unwrap())?;
+ Ok(())
+ }
+}
diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs
index 0710f5e..4a76141 100644
--- a/lychee-lib/src/types/error.rs
+++ b/lychee-lib/src/types/error.rs
@@ -10,21 +10,32 @@ use crate::Uri;
#[derive(Debug)]
#[non_exhaustive]
pub enum ErrorKind {
- // TODO: maybe need to be splitted; currently first slot is Some only for reading files
+ // TODO: maybe needs to be split; currently first element is `Some` only for
+ // reading files
/// Any form of I/O error occurred while reading from a given path.
IoError(Option<PathBuf>, std::io::Error),
+ /// Errors which can occur when attempting to interpret a sequence of u8 as a string
+ Utf8Error(std::str::Utf8Error),
/// Network error when trying to connect to an endpoint via reqwest.
ReqwestError(reqwest::Error),
/// Network error when trying to connect to an endpoint via hubcaps.
HubcapsError(hubcaps::Error),
- /// The given string can not be parsed into a valid URL or e-mail address
+ /// The given string can not be parsed into a valid URL, e-mail address, or file path
UrlParseError(String, (url::ParseError, Option<fast_chemail::ParseError>)),
+ /// The given URI cannot be converted to a file path
+ InvalidFilePath(Uri),
+ /// The given path cannot be converted to a URI
+ InvalidUrl(PathBuf),
/// The given mail address is unreachable
UnreachableEmailAddress(Uri),
/// The given header could not be parsed.
/// A possible error when converting a `HeaderValue` from a string or byte
/// slice.
InvalidHeader(InvalidHeaderValue),
+ /// The given string can not be parsed into a valid base URL or base directory
+ InvalidBase(String, String),
+ /// Cannot find local file
+ FileNotFound(PathBuf),
/// The given UNIX glob pattern is invalid
InvalidGlobPattern(glob::PatternError),
/// The Github API could not be called because of a missing Github token.
@@ -63,8 +74,14 @@ impl Hash for ErrorKind {
Self::IoError(p, e) => (p, e.kind()).hash(state),
Self::ReqwestError(e) => e.to_string().hash(state),
Self::HubcapsError(e) => e.to_string().hash(state),
+ Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
- Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
+ Self::InvalidUrl(p) => p.hash(state),
+ Self::Utf8Error(e) => e.to_string().hash(state),
+ Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
+ u.hash(state);
+ }
+ Self::InvalidBase(base, e) => (base, e).hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
@@ -84,6 +101,7 @@ impl Display for ErrorKind {
Self::IoError(None, e) => e.fmt(f),
Self::ReqwestError(e) => e.fmt(f),
Self::HubcapsError(e) => e.fmt(f),
+ Self::FileNotFound(e) => write!(f, "{}", e.to_string_lossy()),
Self::UrlParseError(s, (url_err, Some(mail_err))) => {
write!(
f,
@@ -94,6 +112,8 @@ impl Display for ErrorKind {
Self::UrlParseError(s, (url_err, None)) => {
write!(f, "Cannot parse {} as website url ({})", s, url_err)
}
+ Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u),
+ Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()),
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
Self::InvalidHeader(e) => e.fmt(f),
Self::InvalidGlobPattern(e) => e.fmt(f),
@@ -106,6 +126,8 @@ impl Display for ErrorKind {
"This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead",
uri
),
+ Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
+ Self::Utf8Error(e) => e.fmt(f),
}
}
}
@@ -125,6 +147,12 @@ impl From<(PathBuf, std::io::Error)> for ErrorKind {
}
}
+impl From<std::str::Utf8Error> for ErrorKind {
+ fn from(e: std::str::Utf8Error) -> Self {
+ Self::Utf8Error(e)
+ }
+}
+
impl From<std::io::Error> for ErrorKind {
fn from(e: std::io::Error) -> Self {
Self::IoError(None, e)
@@ -149,6 +177,12 @@ impl From for ErrorKind {
}
}
+impl From<url::ParseError> for ErrorKind {
+ fn from(e: url::ParseError) -> Self {
+ Self::UrlParseError("Cannot parse URL".to_string(), (e, None))
+ }
+}
+
impl From<(String, url::ParseError)> for ErrorKind {
fn from(value: (String, url::ParseError)) -> Self {
Self::UrlParseError(value.0, (value.1, None))
diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs
new file mode 100644
index 0000000..d0d9510
--- /dev/null
+++ b/lychee-lib/src/types/file.rs
@@ -0,0 +1,37 @@
+use std::path::Path;
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+/// `FileType` defines which file types lychee can handle
+pub enum FileType {
+ /// File in HTML format
+ Html,
+ /// File in Markdown format
+ Markdown,
+ /// Generic text file without syntax-specific parsing
+ Plaintext,
+}
+
+impl Default for FileType {
+ fn default() -> Self {
+ Self::Plaintext
+ }
+}
+
+impl<P: AsRef<Path>> From<P> for FileType {
+ /// Detect if the given path points to a Markdown, HTML, or plaintext file.
+ fn from(p: P) -> FileType {
+ let path = p.as_ref();
+ // Assume HTML in case of no extension.
+ // Note: this is only reasonable for URLs; not paths on disk.
+ // For example, `README` without an extension is more likely to be a plaintext file.
+ // A better solution would be to also implement `From<Url> for FileType`.
+ // Unfortunately that's not possible without refactoring, as
+ // `AsRef<Path>` could be implemented for `Url` in the future, which is why
+ // `From<Url> for FileType` is not allowed.
+ match path.extension().and_then(std::ffi::OsStr::to_str) {
+ Some("md" | "markdown") => FileType::Markdown,
+ Some("htm" | "html") | None => FileType::Html,
+ Some(_) => FileType::Plaintext,
+ }
+ }
+}
diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs
new file mode 100644
index 0000000..ad97355
--- /dev/null
+++ b/lychee-lib/src/types/input.rs
@@ -0,0 +1,214 @@
+use crate::types::FileType;
+use crate::Result;
+use glob::glob_with;
+use reqwest::Url;
+use serde::Serialize;
+use shellexpand::tilde;
+use std::path::{Path, PathBuf};
+use std::{fmt::Display, fs::read_to_string};
+use tokio::io::{stdin, AsyncReadExt};
+
+const STDIN: &str = "-";
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+#[non_exhaustive]
+/// An exhaustive list of input sources, which lychee accepts
+pub enum Input {
+ /// URL (of HTTP/HTTPS scheme).
+ RemoteUrl(Box<Url>),
+ /// Unix shell-style glob pattern.
+ FsGlob {
+ /// The glob pattern matching all input files
+ pattern: String,
+ /// Don't be case sensitive when matching files against a glob
+ ignore_case: bool,
+ },
+ /// File path.
+ FsPath(PathBuf),
+ /// Standard Input.
+ Stdin,
+ /// Raw string input.
+ String(String),
+}
+
+impl Serialize for Input {
+ fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+ where
+ S: serde::Serializer,
+ {
+ serializer.collect_str(self)
+ }
+}
+
+impl Display for Input {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.write_str(match self {
+ Input::RemoteUrl(url) => url.as_str(),
+ Input::FsGlob { pattern, .. } => pattern,
+ Input::FsPath(path) => path.to_str().unwrap_or_default(),
+ Input::Stdin => "stdin",
+ Input::String(_) => "raw input string",
+ })
+ }
+}
+
+#[derive(Debug)]
+/// Encapsulates the content for a given input
+pub struct InputContent {
+ /// Input source
+ pub input: Input,
+ /// File type of given input
+ pub file_type: FileType,
+ /// Raw UTF-8 string content
+ pub content: String,
+}
+
+impl InputContent {
+ #[must_use]
+ /// Create an instance of `InputContent` from an input string
+ pub fn from_string(s: &str, file_type: FileType) -> Self {
+ // TODO: consider using Cow (to avoid one .clone() for String types)
+ Self {
+ input: Input::String(s.to_owned()),
+ file_type,
+ content: s.to_owned(),
+ }
+ }
+}
+
+impl Input {
+ #[must_use]
+ /// Construct a new `Input` source. In case the input is a `glob` pattern,
+ /// `glob_ignore_case` decides whether matching files against the `glob` is
+ /// case-insensitive or not
+ pub fn new(value: &str, glob_ignore_case: bool) -> Self {
+ if value == STDIN {
+ Self::Stdin
+ } else if let Ok(url) = Url::parse(value) {
+ Self::RemoteUrl(Box::new(url))
+ } else {
+ // this seems to be the only way to determine if this is a glob pattern
+ let is_glob = glob::Pattern::escape(value) != value;
+
+ if is_glob {
+ Self::FsGlob {
+ pattern: value.to_owned(),
+ ignore_case: glob_ignore_case,
+ }
+ } else {
+ Self::FsPath(value.into())
+ }
+ }
+ }
+
+ #[allow(clippy::missing_panics_doc)]
+ /// Retrieve the contents from the input
+ ///
+ /// # Errors
+ ///
+ /// Returns an error if the contents can not be retrieved
+ /// because of an underlying I/O error (e.g. an error while making a
+ /// network request or retrieving the contents from the file system)
+ pub async fn get_contents(
+ &self,
+ file_type_hint: Option<FileType>,
+ skip_missing: bool,
+ ) -> Result<Vec<InputContent>> {
+ match *self {
+ // TODO: should skip_missing also affect URLs?
+ Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
+ Input::FsGlob {
+ ref pattern,
+ ignore_case,
+ } => Ok(Self::glob_contents(pattern, ignore_case).await?),
+ Input::FsPath(ref path) => {
+ let content = Self::path_content(path);
+ match content {
+ Ok(input_content) => Ok(vec![input_content]),
+ Err(_) if skip_missing => Ok(vec![]),
+ Err(e) => Err(e),
+ }
+ }
+ Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
+ Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
+ }
+ }
+
+ async fn url_contents(url: &Url) -> Result<InputContent> {
+ // Assume HTML for default paths
+ let file_type = if url.path().is_empty() || url.path() == "/" {
+ FileType::Html
+ } else {
+ FileType::from(url.as_str())
+ };
+
+ let res = reqwest::get(url.clone()).await?;
+ let input_content = InputContent {
+ input: Input::RemoteUrl(Box::new(url.clone())),
+ file_type,
+ content: res.text().await?,
+ };
+
+ Ok(input_content)
+ }
+
+ async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
+ let mut contents = vec![];
+ let glob_expanded = tilde(&path_glob);
+ let mut match_opts = glob::MatchOptions::new();
+
+ match_opts.case_sensitive = !ignore_case;
+
+ for entry in glob_with(&glob_expanded, match_opts)? {
+ match entry {
+ Ok(path) => {
+ if path.is_dir() {
+ // Directories can still have a suffix which looks like
+ // a file extension like `foo.html`. This can lead to
+ // unexpected behavior with glob patterns like
+ // `**/*.html`. Therefore filter these out.
+ // https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819
+ continue;
+ }
+ let content = Self::path_content(&path)?;
+ contents.push(content);
+ }
+ Err(e) => println!("{:?}", e),
+ }
+ }
+
+ Ok(contents)
+ }
+
+ /// Get the input content of a given path
+ /// # Errors
+ ///
+ /// Will return `Err` if file contents can't be read
+ pub fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
+ let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?;
+ let input_content = InputContent {
+ file_type: FileType::from(path.as_ref()),
+ content,
+ input: Input::FsPath(path.into()),
+ };
+
+ Ok(input_content)
+ }
+
+ async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
+ let mut content = String::new();
+ let mut stdin = stdin();
+ stdin.read_to_string(&mut content).await?;
+
+ let input_content = InputContent {
+ input: Input::Stdin,
+ file_type: file_type_hint.unwrap_or_default(),
+ content,
+ };
+
+ Ok(input_content)
+ }
+
+ fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
+ InputContent::from_string(s, file_type_hint.unwrap_or_default())
+ }
+}
diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs
index a48f7a9..9453d5e 100644
--- a/lychee-lib/src/types/mod.rs
+++ b/lychee-lib/src/types/mod.rs
@@ -1,14 +1,22 @@
#![allow(unreachable_pub)]
+mod base;
mod error;
+mod file;
+mod input;
mod request;
mod response;
mod status;
+mod uri;
+pub use base::Base;
pub use error::ErrorKind;
+pub use file::FileType;
+pub use input::{Input, InputContent};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
+pub use uri::Uri;
/// The lychee `Result` type
pub type Result<T> = std::result::Result<T, ErrorKind>;
diff --git a/lychee-lib/src/uri.rs b/lychee-lib/src/types/uri.rs
similarity index 97%
rename from lychee-lib/src/uri.rs
rename to lychee-lib/src/types/uri.rs
index a25aad3..6ad126c 100644
--- a/lychee-lib/src/uri.rs
+++ b/lychee-lib/src/types/uri.rs
@@ -82,9 +82,16 @@ impl Uri {
}
#[inline]
+ /// Check if the URI is a valid mail address
pub(crate) fn is_mail(&self) -> bool {
self.scheme() == "mailto"
}
+
+ #[inline]
+ /// Check if the URI is a file
+ pub(crate) fn is_file(&self) -> bool {
+ self.scheme() == "file"
+ }
}
impl AsRef for Uri {