mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-29 09:24:46 +00:00
Add support for local files
This commit is contained in:
parent
13d0b84389
commit
701fbc9ada
16 changed files with 462 additions and 275 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -1391,6 +1391,7 @@ dependencies = [
|
|||
"http",
|
||||
"hubcaps",
|
||||
"linkify",
|
||||
"log",
|
||||
"markup5ever_rcdom",
|
||||
"openssl-sys",
|
||||
"pretty_assertions",
|
||||
|
|
|
|||
1
fixtures/TEST_RELATIVE.html
Normal file
1
fixtures/TEST_RELATIVE.html
Normal file
|
|
@ -0,0 +1 @@
|
|||
<a href="./TEST_RELATIVE_2.html">Foo</a>
|
||||
1
fixtures/TEST_RELATIVE_2.html
Normal file
1
fixtures/TEST_RELATIVE_2.html
Normal file
|
|
@ -0,0 +1 @@
|
|||
<a href="./TEST_RELATIVE_3.html">Bar</a>
|
||||
1
fixtures/TEST_RELATIVE_3.html
Normal file
1
fixtures/TEST_RELATIVE_3.html
Normal file
|
|
@ -0,0 +1 @@
|
|||
<a href="https://example.org">Example link</a>
|
||||
|
|
@ -70,10 +70,7 @@ use anyhow::{anyhow, Context, Result};
|
|||
use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName};
|
||||
use http::StatusCode;
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use lychee_lib::{
|
||||
collector::{Collector, Input},
|
||||
ClientBuilder, ClientPool, Response,
|
||||
};
|
||||
use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Response};
|
||||
use openssl_sys as _; // required for vendored-openssl feature
|
||||
use regex::RegexSet;
|
||||
use ring as _; // required for apple silicon
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr};
|
|||
|
||||
use anyhow::{anyhow, Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use lychee_lib::collector::Input;
|
||||
use lychee_lib::Input;
|
||||
use reqwest::Url;
|
||||
use serde::Deserialize;
|
||||
use structopt::{clap::crate_version, StructOpt};
|
||||
|
|
|
|||
37
lychee-bin/tests/local_files.rs
Normal file
37
lychee-bin/tests/local_files.rs
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
#[cfg(test)]
mod cli {
    use std::{fs::File, io::Write, path::Path};

    use assert_cmd::Command;
    use lychee_lib::Result;
    use predicates::str::contains;

    /// Build a `Command` for the binary of this package (e.g. `lychee`).
    fn main_command() -> Command {
        let binary = env!("CARGO_PKG_NAME");
        Command::cargo_bin(binary).expect("Couldn't get cargo package name")
    }

    /// Create the file at `path` and write `markup` followed by a newline.
    fn write_fixture(path: &Path, markup: &str) -> Result<()> {
        let mut file = File::create(path)?;
        writeln!(file, "{}", markup)?;
        Ok(())
    }

    /// `index.html` links to the local `foo.html`, which in turn links to a
    /// remote site. Running the binary on `index.html` is expected to report
    /// exactly one link: the remote one found inside `foo.html`.
    #[tokio::test]
    async fn test_local_file() -> Result<()> {
        let dir = tempfile::tempdir()?;

        let index_path = dir.path().join("index.html");
        write_fixture(&index_path, r#"<a href="./foo.html">Foo</a>"#)?;

        let foo_path = dir.path().join("foo.html");
        write_fixture(&foo_path, r#"<a href="https://example.org">example</a>"#)?;

        let mut cmd = main_command();
        cmd.arg(index_path)
            .arg("--no-progress")
            .arg("--verbose")
            .env_clear();

        cmd.assert()
            .success()
            .stdout(contains("Total............1"))
            .stdout(contains("example.org"));

        Ok(())
    }
}
|
||||
|
|
@ -40,6 +40,7 @@ shellexpand = "2.1.0"
|
|||
tokio = { version = "1.6.0", features = ["full"] }
|
||||
typed-builder = "0.9.1"
|
||||
url = { version = "2.2.2", features = ["serde"] }
|
||||
log = "0.4.14"
|
||||
|
||||
[dev-dependencies]
|
||||
doc-comment = "0.3.3"
|
||||
|
|
|
|||
|
|
@ -1,218 +1,6 @@
|
|||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::Display,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use glob::glob_with;
|
||||
use crate::{extract::extract_links, uri::Uri, Input, Request, Result};
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use shellexpand::tilde;
|
||||
use tokio::{
|
||||
fs::read_to_string,
|
||||
io::{stdin, AsyncReadExt},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
extract::{extract_links, FileType},
|
||||
uri::Uri,
|
||||
Request, Result,
|
||||
};
|
||||
|
||||
const STDIN: &str = "-";
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
#[non_exhaustive]
|
||||
/// An exhaustive list of input sources, which lychee accepts
|
||||
pub enum Input {
|
||||
/// URL (of HTTP/HTTPS scheme).
|
||||
RemoteUrl(Box<Url>),
|
||||
/// Unix shell-style glob pattern.
|
||||
FsGlob {
|
||||
/// The glob pattern matching all input files
|
||||
pattern: String,
|
||||
/// Don't be case sensitive when matching files against a glob
|
||||
ignore_case: bool,
|
||||
},
|
||||
/// File path.
|
||||
FsPath(PathBuf),
|
||||
/// Standard Input.
|
||||
Stdin,
|
||||
/// Raw string input.
|
||||
String(String),
|
||||
}
|
||||
|
||||
impl Serialize for Input {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Input {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(match self {
|
||||
Input::RemoteUrl(url) => url.as_str(),
|
||||
Input::FsGlob { pattern, .. } => pattern,
|
||||
Input::FsPath(path) => path.to_str().unwrap_or_default(),
|
||||
Input::Stdin => "stdin",
|
||||
Input::String(_) => "raw input string",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
/// Encapsulates the content for a given input
|
||||
pub struct InputContent {
|
||||
/// Input source
|
||||
pub input: Input,
|
||||
/// File type of given input
|
||||
pub file_type: FileType,
|
||||
/// Raw UTF-8 string content
|
||||
pub content: String,
|
||||
}
|
||||
|
||||
impl InputContent {
|
||||
#[must_use]
|
||||
/// Create an instance of `InputContent` from an input string
|
||||
pub fn from_string(s: &str, file_type: FileType) -> Self {
|
||||
// TODO: consider using Cow (to avoid one .clone() for String types)
|
||||
Self {
|
||||
input: Input::String(s.to_owned()),
|
||||
file_type,
|
||||
content: s.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Input {
|
||||
#[must_use]
|
||||
/// Construct a new `Input` source. In case the input is a `glob` pattern,
|
||||
/// `glob_ignore_case` decides whether matching files against the `glob` is
|
||||
/// case-insensitive or not
|
||||
pub fn new(value: &str, glob_ignore_case: bool) -> Self {
|
||||
if value == STDIN {
|
||||
Self::Stdin
|
||||
} else if let Ok(url) = Url::parse(value) {
|
||||
Self::RemoteUrl(Box::new(url))
|
||||
} else {
|
||||
// this seems to be the only way to determine if this is a glob pattern
|
||||
let is_glob = glob::Pattern::escape(value) != value;
|
||||
|
||||
if is_glob {
|
||||
Self::FsGlob {
|
||||
pattern: value.to_owned(),
|
||||
ignore_case: glob_ignore_case,
|
||||
}
|
||||
} else {
|
||||
Self::FsPath(value.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::missing_panics_doc)]
|
||||
/// Retrieve the contents from the input
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if the contents can not be retrieved
|
||||
/// because of an underlying I/O error (e.g. an error while making a
|
||||
/// network request or retrieving the contents from the file system)
|
||||
pub async fn get_contents(
|
||||
&self,
|
||||
file_type_hint: Option<FileType>,
|
||||
skip_missing: bool,
|
||||
) -> Result<Vec<InputContent>> {
|
||||
match *self {
|
||||
// TODO: should skip_missing also affect URLs?
|
||||
Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
|
||||
Input::FsGlob {
|
||||
ref pattern,
|
||||
ignore_case,
|
||||
} => Ok(Self::glob_contents(pattern, ignore_case).await?),
|
||||
Input::FsPath(ref path) => {
|
||||
let content = Self::path_content(path).await;
|
||||
match content {
|
||||
Ok(input_content) => Ok(vec![input_content]),
|
||||
Err(_) if skip_missing => Ok(vec![]),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
|
||||
Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
|
||||
}
|
||||
}
|
||||
|
||||
async fn url_contents(url: &Url) -> Result<InputContent> {
|
||||
// Assume HTML for default paths
|
||||
let file_type = if url.path().is_empty() || url.path() == "/" {
|
||||
FileType::Html
|
||||
} else {
|
||||
FileType::from(url.as_str())
|
||||
};
|
||||
|
||||
let res = reqwest::get(url.clone()).await?;
|
||||
let input_content = InputContent {
|
||||
input: Input::RemoteUrl(Box::new(url.clone())),
|
||||
file_type,
|
||||
content: res.text().await?,
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
|
||||
let mut contents = vec![];
|
||||
let glob_expanded = tilde(&path_glob);
|
||||
let mut match_opts = glob::MatchOptions::new();
|
||||
|
||||
match_opts.case_sensitive = !ignore_case;
|
||||
|
||||
for entry in glob_with(&glob_expanded, match_opts)? {
|
||||
match entry {
|
||||
Ok(path) => {
|
||||
let content = Self::path_content(&path).await?;
|
||||
contents.push(content);
|
||||
}
|
||||
Err(e) => println!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(contents)
|
||||
}
|
||||
|
||||
async fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
|
||||
let content = read_to_string(&path)
|
||||
.await
|
||||
.map_err(|e| (path.clone().into(), e))?;
|
||||
let input_content = InputContent {
|
||||
file_type: FileType::from(path.as_ref()),
|
||||
content,
|
||||
input: Input::FsPath(path.into()),
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
|
||||
let mut content = String::new();
|
||||
let mut stdin = stdin();
|
||||
stdin.read_to_string(&mut content).await?;
|
||||
|
||||
let input_content = InputContent {
|
||||
input: Input::Stdin,
|
||||
file_type: file_type_hint.unwrap_or_default(),
|
||||
content,
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
|
||||
InputContent::from_string(s, file_type_hint.unwrap_or_default())
|
||||
}
|
||||
}
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Collector keeps the state of link collection
|
||||
#[derive(Debug, Clone)]
|
||||
|
|
@ -278,7 +66,7 @@ impl Collector {
|
|||
|
||||
for handle in extract_links_handles {
|
||||
let new_links = handle.await?;
|
||||
links.extend(new_links);
|
||||
links.extend(new_links?);
|
||||
}
|
||||
|
||||
// Filter out already cached links (duplicates)
|
||||
|
|
@ -304,9 +92,9 @@ mod test {
|
|||
|
||||
use super::*;
|
||||
use crate::{
|
||||
extract::FileType,
|
||||
mock_server,
|
||||
test_utils::{mail, website},
|
||||
types::{FileType, Input},
|
||||
Result, Uri,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -1,51 +1,19 @@
|
|||
use std::{collections::HashSet, convert::TryFrom, path::Path};
|
||||
use std::{collections::HashSet, convert::TryFrom, path::PathBuf};
|
||||
|
||||
use html5ever::{
|
||||
parse_document,
|
||||
tendril::{StrTendril, TendrilSink},
|
||||
};
|
||||
use linkify::LinkFinder;
|
||||
use log::info;
|
||||
use markup5ever_rcdom::{Handle, NodeData, RcDom};
|
||||
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
|
||||
use url::Url;
|
||||
|
||||
use crate::{collector::InputContent, Request, Uri};
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
/// `FileType` defines which file types lychee can handle
|
||||
pub enum FileType {
|
||||
/// File in HTML format
|
||||
Html,
|
||||
/// File in Markdown format
|
||||
Markdown,
|
||||
/// Generic text file without syntax-specific parsing
|
||||
Plaintext,
|
||||
}
|
||||
|
||||
impl Default for FileType {
|
||||
fn default() -> Self {
|
||||
Self::Plaintext
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: AsRef<Path>> From<P> for FileType {
|
||||
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
|
||||
fn from(p: P) -> FileType {
|
||||
let path = p.as_ref();
|
||||
// Assume HTML in case of no extension.
|
||||
// Note: this is only reasonable for URLs; not paths on disk.
|
||||
// For example, `README` without an extension is more likely to be a plaintext file.
|
||||
// A better solution would be to also implement `From<Url> for FileType`.
|
||||
// Unfortunately that's not possible without refactoring, as
|
||||
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
|
||||
// `From<Url> for FileType` is not allowed.
|
||||
match path.extension().and_then(std::ffi::OsStr::to_str) {
|
||||
Some("md" | "markdown") => FileType::Markdown,
|
||||
Some("htm" | "html") | None => FileType::Html,
|
||||
Some(_) => FileType::Plaintext,
|
||||
}
|
||||
}
|
||||
}
|
||||
use crate::{
|
||||
types::{FileType, InputContent},
|
||||
Input, Request, Result, Uri,
|
||||
};
|
||||
|
||||
// Use LinkFinder here to offload the actual link searching in plaintext.
|
||||
fn find_links(input: &str) -> Vec<linkify::Link> {
|
||||
|
|
@ -140,7 +108,7 @@ fn extract_links_from_plaintext(input: &str) -> Vec<String> {
|
|||
pub(crate) fn extract_links(
|
||||
input_content: &InputContent,
|
||||
base_url: &Option<Url>,
|
||||
) -> HashSet<Request> {
|
||||
) -> Result<HashSet<Request>> {
|
||||
let links = match input_content.file_type {
|
||||
FileType::Markdown => extract_links_from_markdown(&input_content.content),
|
||||
FileType::Html => extract_links_from_html(&input_content.content),
|
||||
|
|
@ -153,16 +121,23 @@ pub(crate) fn extract_links(
|
|||
for link in links {
|
||||
if let Ok(uri) = Uri::try_from(link.as_str()) {
|
||||
requests.insert(Request::new(uri, input_content.input.clone()));
|
||||
} else if !Path::new(&link).exists() {
|
||||
if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
|
||||
requests.insert(Request::new(
|
||||
Uri { url: new_url },
|
||||
input_content.input.clone(),
|
||||
));
|
||||
} else if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
|
||||
requests.insert(Request::new(
|
||||
Uri { url: new_url },
|
||||
input_content.input.clone(),
|
||||
));
|
||||
} else if let Input::FsPath(root) = &input_content.input {
|
||||
if let Ok(path) = crate::fs_tree::find(&root, &PathBuf::from(&link)) {
|
||||
let input_content = Input::path_content(path)?;
|
||||
requests.extend(extract_links(&input_content, base_url)?);
|
||||
} else {
|
||||
info!("Cannot find path to {} in filesystem", &link);
|
||||
}
|
||||
};
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &link);
|
||||
}
|
||||
}
|
||||
requests
|
||||
Ok(requests)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -180,10 +155,10 @@ mod test {
|
|||
|
||||
use super::{
|
||||
extract_links, extract_links_from_html, extract_links_from_markdown,
|
||||
extract_links_from_plaintext, find_links, FileType,
|
||||
extract_links_from_plaintext, find_links,
|
||||
};
|
||||
use crate::types::{FileType, InputContent};
|
||||
use crate::{
|
||||
collector::InputContent,
|
||||
test_utils::{mail, website},
|
||||
Uri,
|
||||
};
|
||||
|
|
@ -211,6 +186,8 @@ mod test {
|
|||
&InputContent::from_string(input, file_type),
|
||||
&base_url.map(|u| Url::parse(u).unwrap()),
|
||||
)
|
||||
// unwrap is fine here as this helper function is only used in tests
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|r| r.uri)
|
||||
.collect()
|
||||
|
|
|
|||
135
lychee-lib/src/fs_tree.rs
Normal file
135
lychee-lib/src/fs_tree.rs
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
use crate::{ErrorKind, Result};
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub(crate) fn find(root: &PathBuf, dst: &PathBuf) -> Result<PathBuf> {
|
||||
if dst.exists() {
|
||||
return Ok(dst.clone());
|
||||
}
|
||||
if dst.is_dir() {
|
||||
return Err(ErrorKind::FileNotFound(dst.clone()));
|
||||
}
|
||||
// Find `dst` in the `root` path
|
||||
if let Some(parent) = root.parent() {
|
||||
let rel = parent.join(dst);
|
||||
if rel.exists() {
|
||||
return Ok(rel);
|
||||
}
|
||||
}
|
||||
return Err(ErrorKind::FileNotFound(dst.clone()));
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test_fs_tree {
|
||||
use std::fs::File;
|
||||
|
||||
use super::*;
|
||||
use crate::Result;
|
||||
|
||||
// dummy root
|
||||
// /path/to/foo.html
|
||||
#[test]
|
||||
fn test_find_absolute() -> Result<()> {
|
||||
let dummy = PathBuf::new();
|
||||
let dir = tempfile::tempdir()?;
|
||||
let dst = dir.path().join("foo.html");
|
||||
File::create(&dst)?;
|
||||
assert_eq!(find(&dummy, &dst)?, dst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// index.html
|
||||
// ./foo.html
|
||||
#[test]
|
||||
fn test_find_relative() -> Result<()> {
|
||||
let root = PathBuf::from("index.html");
|
||||
let dir = tempfile::tempdir()?;
|
||||
let dst = dir.path().join("./foo.html");
|
||||
File::create(&dst)?;
|
||||
assert_eq!(find(&root, &dst)?, dst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ./index.html
|
||||
// ./foo.html
|
||||
#[test]
|
||||
fn test_find_relative_index() -> Result<()> {
|
||||
let root = PathBuf::from("./index.html");
|
||||
let dir = tempfile::tempdir()?;
|
||||
let dst = dir.path().join("./foo.html");
|
||||
File::create(&dst)?;
|
||||
assert_eq!(find(&root, &dst)?, dst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_relative_nonexistent() -> Result<()> {
|
||||
let root = PathBuf::from("index.html");
|
||||
// This file does not exist
|
||||
let dst = PathBuf::from("./foo.html");
|
||||
assert!(find(&root, &dst).is_err());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// /path/to/index.html
|
||||
// ./foo.html
|
||||
#[test]
|
||||
fn test_find_relative_from_absolute() -> Result<()> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let root = dir.path().join("index.html");
|
||||
// We create the absolute path to foo.html,
|
||||
// but we address it under its relative path
|
||||
let dst = PathBuf::from("./foo.html");
|
||||
let dst_absolute = dir.path().join("./foo.html");
|
||||
File::create(&dst_absolute)?;
|
||||
assert_eq!(find(&root, &dst)?, dst_absolute);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// /path/to/index.html
|
||||
// ./foo.html (non-existent)
|
||||
#[test]
|
||||
fn test_find_relative_from_absolute_nonexistent() -> Result<()> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let root = dir.path().join("index.html");
|
||||
// We create the absolute path to foo.html,
|
||||
// but we address it under its relative path
|
||||
let dst = PathBuf::from("./foo.html");
|
||||
assert!(find(&root, &dst).is_err());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// /path/to/index.html
|
||||
// /other/path/to/foo.html
|
||||
#[test]
|
||||
fn test_find_absolute_from_absolute() -> Result<()> {
|
||||
let root = PathBuf::from("/path/to/index.html");
|
||||
let dir = tempfile::tempdir()?;
|
||||
let dst = dir.path().join("foo.html");
|
||||
File::create(&dst)?;
|
||||
assert_eq!(find(&root, &dst)?, dst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// /path/to
|
||||
// /other/path/to/foo.html
|
||||
#[test]
|
||||
fn test_root_is_dir() -> Result<()> {
|
||||
let root = PathBuf::from("/path/to/");
|
||||
let dir = tempfile::tempdir()?;
|
||||
let dst = dir.path().join("foo.html");
|
||||
File::create(&dst)?;
|
||||
assert_eq!(find(&root, &dst)?, dst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// /path/to/index.html
|
||||
// /other/path/to
|
||||
#[test]
|
||||
fn test_dst_is_dir() -> Result<()> {
|
||||
let root = PathBuf::from("/path/to/");
|
||||
let dir = tempfile::tempdir()?;
|
||||
File::create(&dir)?;
|
||||
assert!(find(&root, &dir.into_path()).is_err());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -47,13 +47,13 @@ doc_comment::doctest!("../../README.md");
|
|||
|
||||
mod client;
|
||||
mod client_pool;
|
||||
/// A pool of clients, to handle concurrent checks
|
||||
pub mod collector;
|
||||
mod fs_tree;
|
||||
mod quirks;
|
||||
mod types;
|
||||
mod uri;
|
||||
|
||||
/// A pool of clients, to handle concurrent checks
|
||||
pub mod collector;
|
||||
|
||||
/// Functionality to extract URIs from inputs
|
||||
pub mod extract;
|
||||
|
||||
|
|
@ -75,8 +75,8 @@ use ring as _; // required for apple silicon
|
|||
pub use crate::{
|
||||
client::{check, ClientBuilder},
|
||||
client_pool::ClientPool,
|
||||
collector::{Collector, Input},
|
||||
collector::Collector,
|
||||
filter::{Excludes, Filter, Includes},
|
||||
types::{ErrorKind, Request, Response, ResponseBody, Result, Status},
|
||||
types::{ErrorKind, Input, Request, Response, ResponseBody, Result, Status},
|
||||
uri::Uri,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -25,6 +25,8 @@ pub enum ErrorKind {
|
|||
/// A possible error when converting a `HeaderValue` from a string or byte
|
||||
/// slice.
|
||||
InvalidHeader(InvalidHeaderValue),
|
||||
/// Cannot find local file
|
||||
FileNotFound(PathBuf),
|
||||
/// The given UNIX glob pattern is invalid
|
||||
InvalidGlobPattern(glob::PatternError),
|
||||
/// The Github API could not be called because of a missing Github token.
|
||||
|
|
@ -63,6 +65,7 @@ impl Hash for ErrorKind {
|
|||
Self::IoError(p, e) => (p, e.kind()).hash(state),
|
||||
Self::ReqwestError(e) => e.to_string().hash(state),
|
||||
Self::HubcapsError(e) => e.to_string().hash(state),
|
||||
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
|
||||
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
|
||||
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
|
||||
Self::InvalidHeader(e) => e.to_string().hash(state),
|
||||
|
|
@ -84,6 +87,7 @@ impl Display for ErrorKind {
|
|||
Self::IoError(None, e) => e.fmt(f),
|
||||
Self::ReqwestError(e) => e.fmt(f),
|
||||
Self::HubcapsError(e) => e.fmt(f),
|
||||
Self::FileNotFound(e) => write!(f, "{}", e.to_string_lossy()),
|
||||
Self::UrlParseError(s, (url_err, Some(mail_err))) => {
|
||||
write!(
|
||||
f,
|
||||
|
|
|
|||
37
lychee-lib/src/types/file.rs
Normal file
37
lychee-lib/src/types/file.rs
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
use std::path::Path;
|
||||
|
||||
/// `FileType` defines which file types lychee can handle
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum FileType {
    /// File in HTML format
    Html,
    /// File in Markdown format
    Markdown,
    /// Generic text file without syntax-specific parsing
    Plaintext,
}

impl Default for FileType {
    /// The default file type is `Plaintext`
    fn default() -> Self {
        Self::Plaintext
    }
}

impl<P: AsRef<Path>> From<P> for FileType {
    /// Detect if the given path points to a Markdown, HTML, or plaintext file.
    ///
    /// The decision is based on the file extension only, which is compared
    /// case-insensitively (`README.MD` is Markdown, `INDEX.HTML` is HTML).
    /// A missing extension, or one that is not valid UTF-8, yields `Html`;
    /// any other extension yields `Plaintext`.
    fn from(p: P) -> FileType {
        let path = p.as_ref();
        // Normalize the extension so that e.g. `.MD` and `.md` are equivalent
        let extension = path
            .extension()
            .and_then(std::ffi::OsStr::to_str)
            .map(str::to_ascii_lowercase);
        // Assume HTML in case of no extension.
        // Note: this is only reasonable for URLs; not paths on disk.
        // For example, `README` without an extension is more likely to be a plaintext file.
        // A better solution would be to also implement `From<Url> for FileType`.
        // Unfortunately that's not possible without refactoring, as
        // `AsRef<Path>` could be implemented for `Url` in the future, which is why
        // `From<Url> for FileType` is not allowed.
        match extension.as_deref() {
            Some("md") | Some("markdown") => FileType::Markdown,
            Some("htm") | Some("html") | None => FileType::Html,
            Some(_) => FileType::Plaintext,
        }
    }
}
|
||||
203
lychee-lib/src/types/input.rs
Normal file
203
lychee-lib/src/types/input.rs
Normal file
|
|
@ -0,0 +1,203 @@
|
|||
use crate::types::FileType;
|
||||
use crate::Result;
|
||||
use glob::glob_with;
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use shellexpand::tilde;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{fmt::Display, fs::read_to_string};
|
||||
use tokio::io::{stdin, AsyncReadExt};
|
||||
|
||||
const STDIN: &str = "-";
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
#[non_exhaustive]
|
||||
/// The input sources which lychee accepts
|
||||
pub enum Input {
|
||||
/// URL (of HTTP/HTTPS scheme).
|
||||
RemoteUrl(Box<Url>),
|
||||
/// Unix shell-style glob pattern.
|
||||
FsGlob {
|
||||
/// The glob pattern matching all input files
|
||||
pattern: String,
|
||||
/// Don't be case sensitive when matching files against a glob
|
||||
ignore_case: bool,
|
||||
},
|
||||
/// File path.
|
||||
FsPath(PathBuf),
|
||||
/// Standard Input.
|
||||
Stdin,
|
||||
/// Raw string input.
|
||||
String(String),
|
||||
}
|
||||
|
||||
impl Serialize for Input {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Input {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(match self {
|
||||
Input::RemoteUrl(url) => url.as_str(),
|
||||
Input::FsGlob { pattern, .. } => pattern,
|
||||
Input::FsPath(path) => path.to_str().unwrap_or_default(),
|
||||
Input::Stdin => "stdin",
|
||||
Input::String(_) => "raw input string",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
/// Encapsulates the content for a given input
|
||||
pub struct InputContent {
|
||||
/// Input source
|
||||
pub input: Input,
|
||||
/// File type of given input
|
||||
pub file_type: FileType,
|
||||
/// Raw UTF-8 string content
|
||||
pub content: String,
|
||||
}
|
||||
|
||||
impl InputContent {
|
||||
#[must_use]
|
||||
/// Create an instance of `InputContent` from an input string
|
||||
pub fn from_string(s: &str, file_type: FileType) -> Self {
|
||||
// TODO: consider using Cow (to avoid one .clone() for String types)
|
||||
Self {
|
||||
input: Input::String(s.to_owned()),
|
||||
file_type,
|
||||
content: s.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Input {
|
||||
#[must_use]
|
||||
/// Construct a new `Input` source. In case the input is a `glob` pattern,
|
||||
/// `glob_ignore_case` decides whether matching files against the `glob` is
|
||||
/// case-insensitive or not
|
||||
pub fn new(value: &str, glob_ignore_case: bool) -> Self {
|
||||
if value == STDIN {
|
||||
Self::Stdin
|
||||
} else if let Ok(url) = Url::parse(&value) {
|
||||
Self::RemoteUrl(Box::new(url))
|
||||
} else {
|
||||
// this seems to be the only way to determine if this is a glob pattern
|
||||
let is_glob = glob::Pattern::escape(value) != value;
|
||||
|
||||
if is_glob {
|
||||
Self::FsGlob {
|
||||
pattern: value.to_owned(),
|
||||
ignore_case: glob_ignore_case,
|
||||
}
|
||||
} else {
|
||||
Self::FsPath(value.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::missing_panics_doc)]
|
||||
/// Retrieve the contents from the input
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if the contents can not be retrieved
|
||||
/// because of an underlying I/O error (e.g. an error while making a
|
||||
/// network request or retrieving the contents from the file system)
|
||||
pub async fn get_contents(
|
||||
&self,
|
||||
file_type_hint: Option<FileType>,
|
||||
skip_missing: bool,
|
||||
) -> Result<Vec<InputContent>> {
|
||||
match *self {
|
||||
// TODO: should skip_missing also affect URLs?
|
||||
Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
|
||||
Input::FsGlob {
|
||||
ref pattern,
|
||||
ignore_case,
|
||||
} => Ok(Self::glob_contents(pattern, ignore_case).await?),
|
||||
Input::FsPath(ref path) => {
|
||||
let content = Self::path_content(path);
|
||||
match content {
|
||||
Ok(input_content) => Ok(vec![input_content]),
|
||||
Err(_) if skip_missing => Ok(vec![]),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
|
||||
Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
|
||||
}
|
||||
}
|
||||
|
||||
async fn url_contents(url: &Url) -> Result<InputContent> {
|
||||
// Assume HTML for default paths
|
||||
let file_type = if url.path().is_empty() || url.path() == "/" {
|
||||
FileType::Html
|
||||
} else {
|
||||
FileType::from(url.as_str())
|
||||
};
|
||||
|
||||
let res = reqwest::get(url.clone()).await?;
|
||||
let input_content = InputContent {
|
||||
input: Input::RemoteUrl(Box::new(url.clone())),
|
||||
file_type,
|
||||
content: res.text().await?,
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
|
||||
let mut contents = vec![];
|
||||
let glob_expanded = tilde(&path_glob);
|
||||
let mut match_opts = glob::MatchOptions::new();
|
||||
|
||||
match_opts.case_sensitive = !ignore_case;
|
||||
|
||||
for entry in glob_with(&glob_expanded, match_opts)? {
|
||||
match entry {
|
||||
Ok(path) => {
|
||||
let content = Self::path_content(&path)?;
|
||||
contents.push(content);
|
||||
}
|
||||
Err(e) => println!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(contents)
|
||||
}
|
||||
|
||||
/// Get the input content of a given path
|
||||
pub fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
|
||||
let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?;
|
||||
let input_content = InputContent {
|
||||
file_type: FileType::from(path.as_ref()),
|
||||
content,
|
||||
input: Input::FsPath(path.into()),
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
|
||||
let mut content = String::new();
|
||||
let mut stdin = stdin();
|
||||
stdin.read_to_string(&mut content).await?;
|
||||
|
||||
let input_content = InputContent {
|
||||
input: Input::Stdin,
|
||||
file_type: file_type_hint.unwrap_or_default(),
|
||||
content,
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
|
||||
InputContent::from_string(s, file_type_hint.unwrap_or_default())
|
||||
}
|
||||
}
|
||||
|
|
@ -1,11 +1,15 @@
|
|||
#![allow(unreachable_pub)]
|
||||
|
||||
mod error;
|
||||
mod file;
|
||||
mod input;
|
||||
mod request;
|
||||
mod response;
|
||||
mod status;
|
||||
|
||||
pub use error::ErrorKind;
|
||||
pub use file::FileType;
|
||||
pub use input::{Input, InputContent};
|
||||
pub use request::Request;
|
||||
pub use response::{Response, ResponseBody};
|
||||
pub use status::Status;
|
||||
|
|
|
|||
Loading…
Reference in a new issue