Add support for local files #262

This commit is contained in:
Matthias 2021-09-09 19:50:37 +02:00 committed by GitHub
commit 9b5fc399ed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 966 additions and 374 deletions

View file

@ -43,17 +43,17 @@ jobs:
fail-fast: false
steps:
- name: Install musl tools
if: contains(matrix.target, 'musl')
if: ${{ contains(matrix.target, 'musl') }}
run: sudo apt-get install -y musl-tools
- name: Install arm tools
if: contains(matrix.target, 'arm')
if: ${{ contains(matrix.target, 'arm') }}
run: |
echo "GNU_PREFIX=arm-linux-gnueabihf-" >> $GITHUB_ENV
sudo apt-get install -y binutils-arm-linux-gnueabihf
- name: Install aarch64 tools
if: contains(matrix.target, 'aarch64')
if: ${{ contains(matrix.target, 'aarch64') }}
run: |
echo "GNU_PREFIX=aarch64-linux-gnu-" >> $GITHUB_ENV
sudo apt-get install -y binutils-aarch64-linux-gnu

View file

@ -56,7 +56,7 @@ jobs:
- run: cargo-publish-all --dry-run
publish:
if: startsWith(github.ref, 'refs/tags/')
if: ${{ startsWith(github.ref, 'refs/tags/') }}
needs:
- test
- lint

9
Cargo.lock generated
View file

@ -1406,8 +1406,11 @@ dependencies = [
"http",
"hubcaps",
"linkify",
"log",
"markup5ever_rcdom",
"openssl-sys",
"path-clean",
"percent-encoding",
"pretty_assertions",
"pulldown-cmark",
"regex",
@ -1732,6 +1735,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "path-clean"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd"
[[package]]
name = "pem"
version = "0.8.3"

View file

@ -18,10 +18,14 @@ docker-run: ## Run Docker image
docker-push: ## Push image to Docker Hub
docker push $(IMAGE_NAME)
.PHONY: build-local
.PHONY: build
build: ## Build Rust code locally
cargo build
.PHONY: install
install: ## Install project locally
cargo install --path lychee-bin
.PHONY: run
run: ## Run Rust code locally
cargo run

View file

@ -161,11 +161,15 @@ lychee ~/projects/*/README.md
# check links in local files (lychee supports advanced globbing and ~ expansion):
lychee "~/projects/big_project/**/README.*"
# ignore case when globbing and check result for each link:
lychee --glob-ignore-case --verbose "~/projects/**/[r]eadme.*"
# check links from epub file (requires atool: https://www.nongnu.org/atool)
acat -F zip {file.epub} "*.xhtml" "*.html" | lychee -
# check links in directory; block network requests
lychee --offline path/to/directory
```
### GitHub token
@ -202,6 +206,7 @@ FLAGS:
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
-n, --no-progress Do not show progress bar.
This is recommended for non-interactive shells (e.g. for continuous integration)
--offline Only check local files and block network requests
--require-https When HTTPS is available, treat HTTP links as errors
--skip-missing Skip missing input files (default is to error if they don't exist)
-V, --version Prints version information
@ -209,7 +214,8 @@ FLAGS:
OPTIONS:
-a, --accept <accept> Comma-separated list of accepted status codes for valid links
-b, --base-url <base-url> Base URL to check relative URLs
-b, --base <base> Base URL or website root directory to check relative URLs e.g.
https://example.org or `/path/to/public`
--basic-auth <basic-auth> Basic authentication support. E.g. `username:password`
-c, --config <config-file> Configuration file to use [default: ./lychee.toml]
--exclude <exclude>... Exclude URLs from checking (supports regex)
@ -310,7 +316,8 @@ Try one of these links to get started:
- [good first issues](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
- [help wanted](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22)
Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started. Begin my making sure the following commands succeed without errors.
Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started.
Begin my making sure the following commands succeed without errors.
```bash
cargo test # runs tests

View file

@ -14,12 +14,12 @@ async fn main() -> Result<()> {
];
let links = Collector::new(
None, // base_url
None, // base
false, // don't skip missing inputs
10, // max concurrency
)
.collect_links(
inputs, // base_url
inputs, // base url or directory
)
.await?;

View file

@ -1,5 +1,5 @@
This link should be ignored as it is not a fully qualified URL.
![Logo](awesome.png)
Check file link
![Logo](../assets/banner.svg)
![Anchors should be ignored](#awesome)

View file

@ -1,3 +1,3 @@
slack://channel?id=123
file://foo/bar
file:///test_folder/test_file
https://example.org

View file

View file

@ -0,0 +1,21 @@
<html>
<head>
<title>About</title>
</head>
<body>
<h1>About</h1>
<p>
<ul>
<li>
<a href="https://example.org">example</a>
</li>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/post1">Post 1</a>
</li>
</ul>
</p>
</body>
</html>

View file

View file

@ -0,0 +1,21 @@
<html>
<head>
<title>Post 2</title>
</head>
<body>
<h1>Post 2 Title</h1>
<p>
<ul>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/post1">Post 1</a>
</li>
<li>
<a href="../about">Relative</a>
</li>
</ul>
</p>
</body>
</html>

View file

@ -0,0 +1,18 @@
<html>
<head>
<title>Post 1</title>
</head>
<body>
<h1>Post 1 Title</h1>
<p>
<ul>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/post2">Post 2</a>
</li>
</ul>
</p>
</body>
</html>

View file

@ -0,0 +1,27 @@
<html>
<head>
<title>Index</title>
</head>
<body>
<h1>Index Title</h1>
<p>
<ul>
<li>
<a href="/">home</a>
</li>
<li>
<a href="/about">About</a>
</li>
<li>
<a href="/about#fragment">About</a>
</li>
<li>
<a href="/another page">About</a>
</li>
<li>
<a href="/another%20page">About</a>
</li>
</ul>
</p>
</body>
</html>

View file

@ -70,10 +70,7 @@ use anyhow::{anyhow, Context, Result};
use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName};
use http::StatusCode;
use indicatif::{ProgressBar, ProgressStyle};
use lychee_lib::{
collector::{Collector, Input},
ClientBuilder, ClientPool, Response,
};
use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Response};
use openssl_sys as _; // required for vendored-openssl feature
use regex::RegexSet;
use ring as _; // required for apple silicon
@ -178,6 +175,13 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
let include = RegexSet::new(&cfg.include)?;
let exclude = RegexSet::new(&cfg.exclude)?;
// Offline mode overrides the scheme
let schemes = if cfg.offline {
vec!["file".to_string()]
} else {
cfg.scheme.clone()
};
let client = ClientBuilder::builder()
.includes(include)
.excludes(exclude)
@ -193,14 +197,14 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
.method(method)
.timeout(timeout)
.github_token(cfg.github_token.clone())
.schemes(HashSet::from_iter(cfg.scheme.clone()))
.schemes(HashSet::from_iter(schemes))
.accepted(accepted)
.require_https(cfg.require_https)
.build()
.client()
.map_err(|e| anyhow!(e))?;
let links = Collector::new(cfg.base_url.clone(), cfg.skip_missing, max_concurrency)
let links = Collector::new(cfg.base.clone(), cfg.skip_missing, max_concurrency)
.collect_links(&inputs)
.await
.map_err(|e| anyhow!(e))?;

View file

@ -1,9 +1,8 @@
use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr};
use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr};
use anyhow::{anyhow, Error, Result};
use lazy_static::lazy_static;
use lychee_lib::collector::Input;
use reqwest::Url;
use lychee_lib::{Base, Input};
use serde::Deserialize;
use structopt::{clap::crate_version, StructOpt};
@ -76,6 +75,10 @@ macro_rules! fold_in {
};
}
fn parse_base(src: &str) -> Result<Base, lychee_lib::ErrorKind> {
Base::try_from(src)
}
#[derive(Debug, StructOpt)]
#[structopt(
name = "lychee",
@ -161,6 +164,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) scheme: Vec<String>,
/// Only check local files and block network requests.
#[structopt(long)]
#[serde(default)]
pub(crate) offline: bool,
/// URLs to check (supports regex). Has preference over all excludes.
#[structopt(long)]
#[serde(default)]
@ -223,10 +231,11 @@ pub(crate) struct Config {
#[serde(default = "method")]
pub(crate) method: String,
/// Base URL to check relative URLs
#[structopt(short, long, parse(try_from_str))]
/// Base URL or website root directory to check relative URLs
/// e.g. https://example.org or `/path/to/public`
#[structopt(short, long, parse(try_from_str = parse_base))]
#[serde(default)]
pub(crate) base_url: Option<Url>,
pub(crate) base: Option<Base>,
/// Basic authentication support. E.g. `username:password`
#[structopt(long)]
@ -311,7 +320,7 @@ impl Config {
accept: None;
timeout: TIMEOUT;
method: METHOD;
base_url: None;
base: None;
basic_auth: None;
github_token: None;
skip_missing: false;

View file

@ -133,15 +133,38 @@ mod cli {
/// Test unsupported URI schemes
#[test]
fn test_unsupported_uri_schemes() -> Result<()> {
test_json_output!(
"TEST_SCHEMES.txt",
MockResponseStats {
total: 1,
successful: 1,
..MockResponseStats::default()
}
)
fn test_unsupported_uri_schemes() {
let mut cmd = main_command();
let test_schemes_path = fixtures_path().join("TEST_SCHEMES.txt");
// Exclude file link because it doesn't exist on the filesystem.
// (File URIs are absolute paths, which we don't have.)
// Nevertheless, the `file` scheme should be recognized.
cmd.arg(test_schemes_path)
.arg("--exclude")
.arg("file://")
.env_clear()
.assert()
.success()
.stdout(contains("Total............2"))
.stdout(contains("Successful.......1"))
.stdout(contains("Excluded.........1"));
}
#[test]
fn test_resolve_paths() {
let mut cmd = main_command();
let offline_dir = fixtures_path().join("offline");
cmd.arg("--offline")
.arg("--base")
.arg(&offline_dir)
.arg(&offline_dir.join("index.html"))
.env_clear()
.assert()
.success()
.stdout(contains("Total............3"))
.stdout(contains("Successful.......3"));
}
#[test]
@ -367,7 +390,7 @@ mod cli {
.assert()
.success();
let expected = r#"{"total":10,"successful":10,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
let expected = r#"{"total":11,"successful":11,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
let output = fs::read_to_string(&outfile)?;
assert_eq!(output.split_whitespace().collect::<String>(), expected);
fs::remove_file(outfile)?;
@ -385,7 +408,7 @@ mod cli {
.arg(".*")
.assert()
.success()
.stdout(contains("Excluded........10"));
.stdout(contains("Excluded........11"));
Ok(())
}

View file

@ -0,0 +1,36 @@
#[cfg(test)]
mod cli {
use std::{fs::File, io::Write};
use assert_cmd::Command;
use lychee_lib::Result;
use predicates::str::contains;
fn main_command() -> Command {
// this gets the "main" binary name (e.g. `lychee`)
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name")
}
#[tokio::test]
async fn test_local_file() -> Result<()> {
let dir = tempfile::tempdir()?;
let index_path = dir.path().join("index.html");
let mut index = File::create(&index_path)?;
writeln!(index, r#"<a href="./foo.html">Foo</a>"#)?;
let foo_path = dir.path().join("foo.html");
File::create(&foo_path)?;
let mut cmd = main_command();
cmd.arg(index_path)
.arg("--no-progress")
.arg("--verbose")
.env_clear()
.assert()
.success()
.stdout(contains("Total............1"))
.stdout(contains("foo.html"));
Ok(())
}
}

View file

@ -40,6 +40,9 @@ shellexpand = "2.1.0"
tokio = { version = "1.6.0", features = ["full"] }
typed-builder = "0.9.1"
url = { version = "2.2.2", features = ["serde"] }
log = "0.4.14"
path-clean = "0.1.0"
percent-encoding = "2.1.0"
[dev-dependencies]
doc-comment = "0.3.3"

View file

@ -20,8 +20,7 @@ use typed_builder::TypedBuilder;
use crate::{
filter::{Excludes, Filter, Includes},
quirks::Quirks,
uri::Uri,
ErrorKind, Request, Response, Result, Status,
ErrorKind, Request, Response, Result, Status, Uri,
};
const DEFAULT_MAX_REDIRECTS: usize = 5;
@ -178,6 +177,8 @@ impl Client {
let Request { uri, source } = Request::try_from(request)?;
let status = if self.filter.is_excluded(&uri) {
Status::Excluded
} else if uri.is_file() {
self.check_file(&uri).await
} else if uri.is_mail() {
self.check_mail(&uri).await
} else {
@ -255,6 +256,15 @@ impl Client {
}
}
pub async fn check_file(&self, uri: &Uri) -> Status {
if let Ok(path) = uri.url.to_file_path() {
if path.exists() {
return Status::Ok(StatusCode::OK);
}
}
ErrorKind::InvalidFilePath(uri.clone()).into()
}
pub async fn check_mail(&self, uri: &Uri) -> Status {
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
let result = &(check_email(&input).await)[0];
@ -284,11 +294,13 @@ where
mod test {
use std::{
convert::TryInto,
fs::File,
time::{Duration, Instant},
};
use http::{header::HeaderMap, StatusCode};
use reqwest::header;
use tempfile::tempdir;
use super::ClientBuilder;
use crate::{mock_server, test_utils::get_mock_client_response, Uri};
@ -373,6 +385,17 @@ mod test {
assert!(res.status().is_success());
}
#[tokio::test]
async fn test_file() {
let dir = tempdir().unwrap();
let file = dir.path().join("temp");
File::create(file).unwrap();
let uri = format!("file://{}", dir.path().join("temp").to_str().unwrap());
let res = get_mock_client_response(uri).await;
assert!(res.status().is_success());
}
#[tokio::test]
async fn test_custom_headers() {
// See https://github.com/rust-lang/crates.io/issues/788

View file

@ -1,223 +1,10 @@
use std::{
collections::HashSet,
fmt::Display,
path::{Path, PathBuf},
};
use glob::glob_with;
use reqwest::Url;
use serde::Serialize;
use shellexpand::tilde;
use tokio::{
fs::read_to_string,
io::{stdin, AsyncReadExt},
};
use crate::{
extract::{extract_links, FileType},
uri::Uri,
Request, Result,
};
const STDIN: &str = "-";
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
/// An exhaustive list of input sources, which lychee accepts
pub enum Input {
/// URL (of HTTP/HTTPS scheme).
RemoteUrl(Box<Url>),
/// Unix shell-style glob pattern.
FsGlob {
/// The glob pattern matching all input files
pattern: String,
/// Don't be case sensitive when matching files against a glob
ignore_case: bool,
},
/// File path.
FsPath(PathBuf),
/// Standard Input.
Stdin,
/// Raw string input.
String(String),
}
impl Serialize for Input {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(self)
}
}
impl Display for Input {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(match self {
Input::RemoteUrl(url) => url.as_str(),
Input::FsGlob { pattern, .. } => pattern,
Input::FsPath(path) => path.to_str().unwrap_or_default(),
Input::Stdin => "stdin",
Input::String(_) => "raw input string",
})
}
}
#[derive(Debug)]
/// Encapsulates the content for a given input
pub struct InputContent {
/// Input source
pub input: Input,
/// File type of given input
pub file_type: FileType,
/// Raw UTF-8 string content
pub content: String,
}
impl InputContent {
#[must_use]
/// Create an instance of `InputContent` from an input string
pub fn from_string(s: &str, file_type: FileType) -> Self {
// TODO: consider using Cow (to avoid one .clone() for String types)
Self {
input: Input::String(s.to_owned()),
file_type,
content: s.to_owned(),
}
}
}
impl Input {
#[must_use]
/// Construct a new `Input` source. In case the input is a `glob` pattern,
/// `glob_ignore_case` decides whether matching files against the `glob` is
/// case-insensitive or not
pub fn new(value: &str, glob_ignore_case: bool) -> Self {
if value == STDIN {
Self::Stdin
} else if let Ok(url) = Url::parse(value) {
Self::RemoteUrl(Box::new(url))
} else {
// this seems to be the only way to determine if this is a glob pattern
let is_glob = glob::Pattern::escape(value) != value;
if is_glob {
Self::FsGlob {
pattern: value.to_owned(),
ignore_case: glob_ignore_case,
}
} else {
Self::FsPath(value.into())
}
}
}
#[allow(clippy::missing_panics_doc)]
/// Retrieve the contents from the input
///
/// # Errors
///
/// Returns an error if the contents can not be retrieved
/// because of an underlying I/O error (e.g. an error while making a
/// network request or retrieving the contents from the file system)
pub async fn get_contents(
&self,
file_type_hint: Option<FileType>,
skip_missing: bool,
) -> Result<Vec<InputContent>> {
match *self {
// TODO: should skip_missing also affect URLs?
Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
Input::FsGlob {
ref pattern,
ignore_case,
} => Ok(Self::glob_contents(pattern, ignore_case).await?),
Input::FsPath(ref path) => {
let content = Self::path_content(path).await;
match content {
Ok(input_content) => Ok(vec![input_content]),
Err(_) if skip_missing => Ok(vec![]),
Err(e) => Err(e),
}
}
Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
}
}
async fn url_contents(url: &Url) -> Result<InputContent> {
// Assume HTML for default paths
let file_type = if url.path().is_empty() || url.path() == "/" {
FileType::Html
} else {
FileType::from(url.as_str())
};
let res = reqwest::get(url.clone()).await?;
let input_content = InputContent {
input: Input::RemoteUrl(Box::new(url.clone())),
file_type,
content: res.text().await?,
};
Ok(input_content)
}
async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
let mut contents = vec![];
let glob_expanded = tilde(&path_glob);
let mut match_opts = glob::MatchOptions::new();
match_opts.case_sensitive = !ignore_case;
for entry in glob_with(&glob_expanded, match_opts)? {
match entry {
Ok(path) => {
let content = Self::path_content(&path).await?;
contents.push(content);
}
Err(e) => println!("{:?}", e),
}
}
Ok(contents)
}
async fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
let content = read_to_string(&path)
.await
.map_err(|e| (path.clone().into(), e))?;
let input_content = InputContent {
file_type: FileType::from(path.as_ref()),
content,
input: Input::FsPath(path.into()),
};
Ok(input_content)
}
async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
let mut content = String::new();
let mut stdin = stdin();
stdin.read_to_string(&mut content).await?;
let input_content = InputContent {
input: Input::Stdin,
file_type: file_type_hint.unwrap_or_default(),
content,
};
Ok(input_content)
}
fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
InputContent::from_string(s, file_type_hint.unwrap_or_default())
}
}
use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
use std::collections::HashSet;
/// Collector keeps the state of link collection
#[derive(Debug, Clone)]
pub struct Collector {
base_url: Option<Url>,
base: Option<Base>,
skip_missing_inputs: bool,
max_concurrency: usize,
cache: HashSet<Uri>,
@ -226,9 +13,9 @@ pub struct Collector {
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub fn new(base_url: Option<Url>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
pub fn new(base: Option<Base>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
Collector {
base_url,
base,
skip_missing_inputs,
max_concurrency,
cache: HashSet::new(),
@ -236,7 +23,8 @@ impl Collector {
}
/// Fetch all unique links from a slice of inputs
/// All relative URLs get prefixed with `base_url` if given.
/// All relative URLs get prefixed with `base` if given.
/// (This can be a directory or a base URL)
///
/// # Errors
///
@ -263,9 +51,9 @@ impl Collector {
while let Some(result) = contents_rx.recv().await {
for input_content in result? {
let base_url = self.base_url.clone();
let base = self.base.clone();
let handle =
tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url));
tokio::task::spawn_blocking(move || extract_links(&input_content, &base));
extract_links_handles.push(handle);
}
}
@ -278,7 +66,7 @@ impl Collector {
for handle in extract_links_handles {
let new_links = handle.await?;
links.extend(new_links);
links.extend(new_links?);
}
// Filter out already cached links (duplicates)
@ -304,9 +92,9 @@ mod test {
use super::*;
use crate::{
extract::FileType,
mock_server,
test_utils::{mail, website},
types::{FileType, Input},
Result, Uri,
};

View file

@ -1,66 +1,62 @@
use std::{collections::HashSet, convert::TryFrom, path::Path};
use std::{collections::HashSet, convert::TryFrom, path::Path, path::PathBuf};
use html5ever::{
parse_document,
tendril::{StrTendril, TendrilSink},
};
use linkify::LinkFinder;
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use percent_encoding::percent_decode_str;
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use url::Url;
use reqwest::Url;
use crate::{collector::InputContent, Request, Uri};
use crate::{
helpers::{path, url},
types::{FileType, InputContent},
Base, ErrorKind, Input, Request, Result, Uri,
};
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// `FileType` defines which file types lychee can handle
pub enum FileType {
/// File in HTML format
Html,
/// File in Markdown format
Markdown,
/// Generic text file without syntax-specific parsing
Plaintext,
}
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract_links(
input_content: &InputContent,
base: &Option<Base>,
) -> Result<HashSet<Request>> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
impl Default for FileType {
fn default() -> Self {
Self::Plaintext
// Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
Request::new(uri, input_content.input.clone())
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
Request::new(Uri { url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
if url::is_anchor(&link) {
// Silently ignore anchor links for now
continue;
}
let url = create_uri_from_path(root, base, &link)?;
Request::new(Uri { url }, input_content.input.clone())
} else {
info!("Handling of {} not implemented yet", &link);
continue;
};
requests.insert(req);
}
Ok(requests)
}
impl<P: AsRef<Path>> From<P> for FileType {
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
fn from(p: P) -> FileType {
let path = p.as_ref();
// Assume HTML in case of no extension.
// Note: this is only reasonable for URLs; not paths on disk.
// For example, `README` without an extension is more likely to be a plaintext file.
// A better solution would be to also implement `From<Url> for FileType`.
// Unfortunately that's not possible without refactoring, as
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
// `From<Url> for FileType` is not allowed.
match path.extension().and_then(std::ffi::OsStr::to_str) {
Some("md" | "markdown") => FileType::Markdown,
Some("htm" | "html") | None => FileType::Html,
Some(_) => FileType::Plaintext,
}
}
}
// Use LinkFinder here to offload the actual link searching in plaintext.
fn find_links(input: &str) -> Vec<linkify::Link> {
let finder = LinkFinder::new();
finder.links(input).collect()
}
/// Extract unparsed URL strings from a markdown string.
/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec<String> {
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
vec![url.to_string()]
}
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => vec![url.to_string()],
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
_ => vec![],
@ -68,15 +64,15 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
.collect()
}
/// Extract unparsed URL strings from a HTML string.
/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec<String> {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
// we pass mutable urls reference to avoid extra allocations in each
// recursive descent
// We pass mutable URL references here to avoid
// extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
@ -101,7 +97,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
for attr in attrs.borrow().iter() {
let attr_value = attr.value.to_string();
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr_value);
} else {
urls.append(&mut extract_links_from_plaintext(&attr_value));
@ -113,56 +109,34 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions because the document is a tree
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
}
}
/// Determine if element's attribute contains a link / URL.
fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
// See a comprehensive list of attributes that might contain URLs/URIs
// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
matches!(
(attr_name, elem_name),
("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
)
}
/// Extract unparsed URL strings from a plaintext.
/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
find_links(input)
url::find_links(input)
.iter()
.map(|l| String::from(l.as_str()))
.collect()
}
pub(crate) fn extract_links(
input_content: &InputContent,
base_url: &Option<Url>,
) -> HashSet<Request> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
// Only keep legit URLs. This sorts out things like anchors.
// Silently ignore the parse failures for now.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
if let Ok(uri) = Uri::try_from(link.as_str()) {
requests.insert(Request::new(uri, input_content.input.clone()));
} else if !Path::new(&link).exists() {
if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
requests.insert(Request::new(
Uri { url: new_url },
input_content.input.clone(),
));
}
};
}
requests
fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Url> {
let dst = url::remove_get_params_and_fragment(dst);
// Avoid double-encoding already encoded destination paths by removing any
// potential encoding (e.g. `web%20site` becomes `web site`).
// That's because Url::from_file_path will encode the full URL in the end.
// This behavior cannot be configured.
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
// TODO: This is not a perfect solution.
// Ideally, only `src` and `base` should be URL encoded (as is done by
// `from_file_path` at the moment) while `dst` is left untouched and simply
// appended to the end.
let decoded = percent_decode_str(dst).decode_utf8()?.to_string();
let path = path::resolve(src, &PathBuf::from(decoded), base)?;
Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidUrl(path))
}
#[cfg(test)]
@ -176,17 +150,24 @@ mod test {
};
use pretty_assertions::assert_eq;
use url::Url;
use super::{
extract_links, extract_links_from_html, extract_links_from_markdown,
extract_links_from_plaintext, find_links, FileType,
};
use super::*;
use crate::{
collector::InputContent,
helpers::url::find_links,
test_utils::{mail, website},
Uri,
};
use crate::{
types::{FileType, InputContent},
Base,
};
#[test]
fn test_create_uri_from_path() {
let result =
create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap();
assert_eq!(result.as_str(), "file:///test+encoding");
}
fn load_fixture(filename: &str) -> String {
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
@ -207,13 +188,13 @@ mod test {
}
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
extract_links(
&InputContent::from_string(input, file_type),
&base_url.map(|u| Url::parse(u).unwrap()),
)
.into_iter()
.map(|r| r.uri)
.collect()
let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
extract_links(&InputContent::from_string(input, file_type), &base)
// unwrap is fine here as this helper function is only used in tests
.unwrap()
.into_iter()
.map(|r| r.uri)
.collect()
}
#[test]

View file

@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr};
pub use excludes::Excludes;
pub use includes::Includes;
use crate::uri::Uri;
use crate::Uri;
/// Pre-defined exclusions for known false-positives
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];

View file

@ -0,0 +1,2 @@
pub(crate) mod path;
pub(crate) mod url;

View file

@ -0,0 +1,141 @@
use crate::{Base, ErrorKind, Result};
use path_clean::PathClean;
use std::env;
use std::path::{Path, PathBuf};
// Returns the base if it is a valid `PathBuf`
fn get_base_dir(base: &Option<Base>) -> Option<PathBuf> {
base.as_ref().and_then(Base::dir)
}
// https://stackoverflow.com/a/54817755/270334
pub(crate) fn absolute_path(path: impl AsRef<Path>) -> Result<PathBuf> {
let path = path.as_ref();
let absolute_path = if path.is_absolute() {
path.to_path_buf()
} else {
env::current_dir()?.join(path)
}
.clean();
Ok(absolute_path)
}
// Get the parent directory of a given `Path`.
fn dirname(src: &Path) -> PathBuf {
if src.is_file() {
src.to_path_buf()
.parent()
.map_or(PathBuf::new(), Path::to_path_buf)
} else {
src.to_path_buf()
}
}
// Resolve `dst` that was linked to from within `src`
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
if dst.is_relative() {
// Find `dst` in the parent directory of `src`
if let Some(parent) = src.parent() {
let rel_path = parent.join(dst.to_path_buf());
return absolute_path(&rel_path);
}
}
if dst.is_absolute() {
// Absolute local links (leading slash) require the `base_url` to
// define the document root.
let base = get_base_dir(base).ok_or_else(|| {
ErrorKind::InvalidBase(
"<empty>".to_string(),
format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
)
})?;
let abs_path = join(dirname(&base), dst);
return absolute_path(&abs_path);
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
}
// A cumbersome way to concatenate paths without checking their
// existence on disk. See https://github.com/rust-lang/rust/issues/16507
fn join(base: PathBuf, dst: &Path) -> PathBuf {
let mut abs = base.into_os_string();
let target_str = dst.as_os_str();
abs.push(target_str);
PathBuf::from(abs)
}
// Tests for resolving links found in local documents to absolute paths.
#[cfg(test)]
mod test_path {
    use super::*;
    use crate::Result;
    // Relative link `./foo.html` inside `index.html` (located in the
    // current working directory) resolves relative to the CWD.
    #[test]
    fn test_resolve_relative() -> Result<()> {
        let dummy = PathBuf::from("index.html");
        let abs_path = PathBuf::from("./foo.html");
        assert_eq!(
            resolve(&dummy, &abs_path, &None)?,
            env::current_dir()?.join("foo.html")
        );
        Ok(())
    }
    // Same as above, but the source document itself is given with an
    // explicit `./` prefix: `./index.html` linking to `./foo.html`.
    #[test]
    fn test_resolve_relative_index() -> Result<()> {
        let dummy = PathBuf::from("./index.html");
        let abs_path = PathBuf::from("./foo.html");
        assert_eq!(
            resolve(&dummy, &abs_path, &None)?,
            env::current_dir()?.join("foo.html")
        );
        Ok(())
    }
    // Relative link `./foo.html` inside an absolute source document
    // `/path/to/index.html` resolves next to the source.
    #[test]
    fn test_resolve_from_absolute() -> Result<()> {
        let abs_index = PathBuf::from("/path/to/index.html");
        let abs_path = PathBuf::from("./foo.html");
        assert_eq!(
            resolve(&abs_index, &abs_path, &None)?,
            PathBuf::from("/path/to/foo.html")
        );
        Ok(())
    }
    // Absolute link `/foo.html` with a valid base dir resolves under the
    // base directory; the (empty) source document is irrelevant here.
    #[test]
    fn test_resolve_absolute_from_base_dir() -> Result<()> {
        let dummy = PathBuf::new();
        let abs_path = PathBuf::from("/foo.html");
        let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
        assert_eq!(
            resolve(&dummy, &abs_path, &base)?,
            PathBuf::from("/some/absolute/base/dir/foo.html")
        );
        Ok(())
    }
    // Absolute link `/other/path/to/foo.html` inside an absolute source
    // document still resolves under the base directory, not the source.
    #[test]
    fn test_resolve_absolute_from_absolute() -> Result<()> {
        let abs_index = PathBuf::from("/path/to/index.html");
        let abs_path = PathBuf::from("/other/path/to/foo.html");
        let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
        assert_eq!(
            resolve(&abs_index, &abs_path, &base)?,
            PathBuf::from("/some/absolute/base/dir/other/path/to/foo.html")
        );
        Ok(())
    }
}

View file

@ -0,0 +1,93 @@
use linkify::LinkFinder;
/// Remove all GET parameters from a URL.
/// The link is not a URL but a String as it may not have a base domain.
///
/// The fragment is stripped first, then the query string, so inputs like
/// `a?b#c` and `a#b?c` both reduce to `a`.
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
    let without_fragment = url.split_once('#').map_or(url, |(before, _)| before);
    without_fragment
        .split_once('?')
        .map_or(without_fragment, |(before, _)| before)
}
/// Determine if an element's attribute contains a link / URL.
pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
    // See a comprehensive list of attributes that might contain URLs/URIs
    // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
    let always_link = matches!(attr_name, "href" | "src" | "srcset" | "cite");
    always_link
        || (attr_name == "data" && elem_name == "object")
        || (attr_name == "onhashchange" && elem_name == "body")
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
pub(crate) fn is_anchor(url: &str) -> bool {
    // `#` is ASCII, so inspecting the first byte is equivalent to
    // `starts_with('#')` (no UTF-8 continuation byte can equal 0x23).
    matches!(url.as_bytes().first(), Some(b'#'))
}
// Use `LinkFinder` to offload the raw link searching in plaintext
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
    LinkFinder::new().links(input).collect()
}
// Tests for the plaintext/URL helper functions above.
#[cfg(test)]
mod test_fs_tree {
    use super::*;
    // Only a leading `#` marks an in-page anchor.
    #[test]
    fn test_is_anchor() {
        assert!(is_anchor("#anchor"));
        assert!(!is_anchor("notan#anchor"));
    }
    // Query strings and fragments are stripped in all combinations and
    // orderings; bare paths are returned unchanged.
    #[test]
    fn test_remove_get_params_and_fragment() {
        assert_eq!(remove_get_params_and_fragment("/"), "/");
        assert_eq!(
            remove_get_params_and_fragment("index.html?foo=bar"),
            "index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("/index.html?foo=bar"),
            "/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
            "/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html?foo=bar"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar"),
            "test.png"
        );
        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html#anchor"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html?foo=bar#anchor"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar#anchor"),
            "test.png"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png#anchor?anchor!?"),
            "test.png"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"),
            "test.png"
        );
    }
}

View file

@ -41,18 +41,18 @@
)]
#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)]
#![deny(missing_docs)]
#![allow(clippy::module_name_repetitions)]
#[cfg(doctest)]
doc_comment::doctest!("../../README.md");
mod client;
mod client_pool;
mod quirks;
mod types;
mod uri;
/// A pool of clients, to handle concurrent checks
pub mod collector;
mod helpers;
mod quirks;
mod types;
/// Functionality to extract URIs from inputs
pub mod extract;
@ -75,8 +75,7 @@ use ring as _; // required for apple silicon
pub use crate::{
client::{check, ClientBuilder},
client_pool::ClientPool,
collector::{Collector, Input},
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{ErrorKind, Request, Response, ResponseBody, Result, Status},
uri::Uri,
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
};

View file

@ -0,0 +1,83 @@
use reqwest::Url;
use serde::{Deserialize, Serialize};
use std::{convert::TryFrom, path::PathBuf};
use crate::ErrorKind;
/// When encountering links without a full domain in a document,
/// the base determines where this resource can be found.
/// Both, local and remote targets are supported.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
// A `PathBuf` and a `Url` differ in size; the lint is acknowledged on purpose.
#[allow(variant_size_differences)]
pub enum Base {
    /// Local file path pointing to the root directory used to resolve
    /// absolute local links
    Local(PathBuf),
    /// Remote URL pointing to a website homepage, used to resolve
    /// relative remote links
    Remote(Url),
}
impl Base {
    /// Join the given link with the base URL.
    ///
    /// Returns `None` for a local base or if the join fails.
    #[must_use]
    pub fn join(&self, link: &str) -> Option<Url> {
        if let Self::Remote(url) = self {
            url.join(link).ok()
        } else {
            None
        }
    }

    /// Return the root directory if the base is local, `None` otherwise.
    #[must_use]
    pub fn dir(&self) -> Option<PathBuf> {
        if let Self::Local(path) = self {
            Some(path.clone())
        } else {
            None
        }
    }
}
impl TryFrom<&str> for Base {
    type Error = ErrorKind;

    /// Parse the value as a remote URL first; anything that is not a valid
    /// URL is treated as a local directory path.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        match Url::parse(value) {
            // URLs like `data:` or `mailto:` cannot serve as a base.
            Ok(url) if url.cannot_be_a_base() => Err(ErrorKind::InvalidBase(
                value.to_string(),
                "The given URL cannot be a base".to_string(),
            )),
            Ok(url) => Ok(Self::Remote(url)),
            Err(_) => Ok(Self::Local(PathBuf::from(value))),
        }
    }
}
// Tests for converting strings into a `Base`.
#[cfg(test)]
mod test_base {
    use crate::Result;
    use super::*;
    // A parseable http(s) URL becomes a remote base.
    #[test]
    fn test_valid_remote() -> Result<()> {
        let base = Base::try_from("https://endler.dev")?;
        assert_eq!(
            base,
            Base::Remote(Url::parse("https://endler.dev").unwrap())
        );
        Ok(())
    }
    // `data:` URLs parse as URLs but cannot be a base, so conversion fails.
    #[test]
    fn test_invalid_url() {
        assert!(Base::try_from("data:text/plain,Hello?World#").is_err());
    }
    // An existing directory path becomes a local base.
    #[test]
    fn test_valid_local() -> Result<()> {
        let dir = tempfile::tempdir()?;
        Base::try_from(dir.as_ref().to_str().unwrap())?;
        Ok(())
    }
}

View file

@ -10,21 +10,32 @@ use crate::Uri;
#[derive(Debug)]
#[non_exhaustive]
pub enum ErrorKind {
// TODO: maybe need to be splitted; currently first slot is Some only for reading files
// TODO: maybe needs to be split; currently first element is `Some` only for
// reading files
/// Any form of I/O error occurred while reading from a given path.
IoError(Option<PathBuf>, std::io::Error),
/// Errors which can occur when attempting to interpret a sequence of u8 as a string
Utf8Error(std::str::Utf8Error),
/// Network error when trying to connect to an endpoint via reqwest.
ReqwestError(reqwest::Error),
/// Network error when trying to connect to an endpoint via hubcaps.
HubcapsError(hubcaps::Error),
/// The given string can not be parsed into a valid URL or e-mail address
/// The given string can not be parsed into a valid URL, e-mail address, or file path
UrlParseError(String, (url::ParseError, Option<fast_chemail::ParseError>)),
/// The given URI cannot be converted to a file path
InvalidFilePath(Uri),
/// The given path cannot be converted to a URI
InvalidUrl(PathBuf),
/// The given mail address is unreachable
UnreachableEmailAddress(Uri),
/// The given header could not be parsed.
/// A possible error when converting a `HeaderValue` from a string or byte
/// slice.
InvalidHeader(InvalidHeaderValue),
/// The given string can not be parsed into a valid base URL or base directory
InvalidBase(String, String),
/// Cannot find local file
FileNotFound(PathBuf),
/// The given UNIX glob pattern is invalid
InvalidGlobPattern(glob::PatternError),
/// The Github API could not be called because of a missing Github token.
@ -63,8 +74,14 @@ impl Hash for ErrorKind {
Self::IoError(p, e) => (p, e.kind()).hash(state),
Self::ReqwestError(e) => e.to_string().hash(state),
Self::HubcapsError(e) => e.to_string().hash(state),
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
Self::InvalidUrl(p) => p.hash(state),
Self::Utf8Error(e) => e.to_string().hash(state),
Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
u.hash(state);
}
Self::InvalidBase(base, e) => (base, e).hash(state),
Self::InvalidHeader(e) => e.to_string().hash(state),
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
@ -84,6 +101,7 @@ impl Display for ErrorKind {
Self::IoError(None, e) => e.fmt(f),
Self::ReqwestError(e) => e.fmt(f),
Self::HubcapsError(e) => e.fmt(f),
Self::FileNotFound(e) => write!(f, "{}", e.to_string_lossy()),
Self::UrlParseError(s, (url_err, Some(mail_err))) => {
write!(
f,
@ -94,6 +112,8 @@ impl Display for ErrorKind {
Self::UrlParseError(s, (url_err, None)) => {
write!(f, "Cannot parse {} as website url ({})", s, url_err)
}
Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u),
Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()),
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
Self::InvalidHeader(e) => e.fmt(f),
Self::InvalidGlobPattern(e) => e.fmt(f),
@ -106,6 +126,8 @@ impl Display for ErrorKind {
"This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead",
uri
),
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
Self::Utf8Error(e) => e.fmt(f),
}
}
}
@ -125,6 +147,12 @@ impl From<(PathBuf, std::io::Error)> for ErrorKind {
}
}
impl From<std::str::Utf8Error> for ErrorKind {
fn from(e: std::str::Utf8Error) -> Self {
Self::Utf8Error(e)
}
}
impl From<std::io::Error> for ErrorKind {
fn from(e: std::io::Error) -> Self {
Self::IoError(None, e)
@ -149,6 +177,12 @@ impl From<hubcaps::errors::Error> for ErrorKind {
}
}
impl From<url::ParseError> for ErrorKind {
fn from(e: url::ParseError) -> Self {
Self::UrlParseError("Cannot parse URL".to_string(), (e, None))
}
}
impl From<(String, url::ParseError)> for ErrorKind {
fn from(value: (String, url::ParseError)) -> Self {
Self::UrlParseError(value.0, (value.1, None))

View file

@ -0,0 +1,37 @@
use std::path::Path;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// `FileType` defines which file types lychee can handle
pub enum FileType {
/// File in HTML format
Html,
/// File in Markdown format
Markdown,
/// Generic text file without syntax-specific parsing
Plaintext,
}
impl Default for FileType {
fn default() -> Self {
Self::Plaintext
}
}
impl<P: AsRef<Path>> From<P> for FileType {
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
fn from(p: P) -> FileType {
let path = p.as_ref();
// Assume HTML in case of no extension.
// Note: this is only reasonable for URLs; not paths on disk.
// For example, `README` without an extension is more likely to be a plaintext file.
// A better solution would be to also implement `From<Url> for FileType`.
// Unfortunately that's not possible without refactoring, as
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
// `From<Url> for FileType` is not allowed.
match path.extension().and_then(std::ffi::OsStr::to_str) {
Some("md" | "markdown") => FileType::Markdown,
Some("htm" | "html") | None => FileType::Html,
Some(_) => FileType::Plaintext,
}
}
}

View file

@ -0,0 +1,214 @@
use crate::types::FileType;
use crate::Result;
use glob::glob_with;
use reqwest::Url;
use serde::Serialize;
use shellexpand::tilde;
use std::path::{Path, PathBuf};
use std::{fmt::Display, fs::read_to_string};
use tokio::io::{stdin, AsyncReadExt};
// Sentinel value used on the command line to request reading from stdin.
const STDIN: &str = "-";
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
/// An exhaustive list of input sources, which lychee accepts
pub enum Input {
    /// URL (of HTTP/HTTPS scheme).
    RemoteUrl(Box<Url>),
    /// Unix shell-style glob pattern.
    FsGlob {
        /// The glob pattern matching all input files
        pattern: String,
        /// Don't be case sensitive when matching files against a glob
        ignore_case: bool,
    },
    /// File path.
    FsPath(PathBuf),
    /// Standard Input.
    Stdin,
    /// Raw string input.
    String(String),
}
impl Serialize for Input {
    // Serialize the input as its `Display` string (URL, glob pattern, path,
    // "stdin", or "raw input string") rather than as a structured enum.
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.collect_str(self)
    }
}
impl Display for Input {
    /// Render a short, human-readable label for the input source.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Input::RemoteUrl(url) => url.as_str(),
            Input::FsGlob { pattern, .. } => pattern,
            // A path that is not valid UTF-8 is rendered as an empty string.
            Input::FsPath(path) => path.to_str().unwrap_or_default(),
            Input::Stdin => "stdin",
            Input::String(_) => "raw input string",
        };
        f.write_str(label)
    }
}
#[derive(Debug)]
/// Encapsulates the content for a given input
pub struct InputContent {
    /// Input source the content was read from
    pub input: Input,
    /// File type of given input, used to pick the right link extractor
    pub file_type: FileType,
    /// Raw UTF-8 string content
    pub content: String,
}
impl InputContent {
    #[must_use]
    /// Create an instance of `InputContent` from an input string
    pub fn from_string(s: &str, file_type: FileType) -> Self {
        // TODO: consider using Cow (to avoid one .clone() for String types)
        let content = s.to_owned();
        Self {
            input: Input::String(content.clone()),
            file_type,
            content,
        }
    }
}
impl Input {
    #[must_use]
    /// Construct a new `Input` source. In case the input is a `glob` pattern,
    /// `glob_ignore_case` decides whether matching files against the `glob` is
    /// case-insensitive or not
    pub fn new(value: &str, glob_ignore_case: bool) -> Self {
        if value == STDIN {
            Self::Stdin
        } else if let Ok(url) = Url::parse(value) {
            Self::RemoteUrl(Box::new(url))
        } else {
            // this seems to be the only way to determine if this is a glob pattern:
            // escaping is a no-op exactly when no glob metacharacters are present
            let is_glob = glob::Pattern::escape(value) != value;
            if is_glob {
                Self::FsGlob {
                    pattern: value.to_owned(),
                    ignore_case: glob_ignore_case,
                }
            } else {
                // Plain filesystem path; existence is only checked later,
                // when the contents are read.
                Self::FsPath(value.into())
            }
        }
    }
    #[allow(clippy::missing_panics_doc)]
    /// Retrieve the contents from the input
    ///
    /// # Errors
    ///
    /// Returns an error if the contents can not be retrieved
    /// because of an underlying I/O error (e.g. an error while making a
    /// network request or retrieving the contents from the file system)
    pub async fn get_contents(
        &self,
        file_type_hint: Option<FileType>,
        skip_missing: bool,
    ) -> Result<Vec<InputContent>> {
        match *self {
            // TODO: should skip_missing also affect URLs?
            Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
            Input::FsGlob {
                ref pattern,
                ignore_case,
            } => Ok(Self::glob_contents(pattern, ignore_case).await?),
            Input::FsPath(ref path) => {
                let content = Self::path_content(path);
                match content {
                    Ok(input_content) => Ok(vec![input_content]),
                    // With `skip_missing`, unreadable or missing files are
                    // silently skipped instead of aborting the whole run.
                    Err(_) if skip_missing => Ok(vec![]),
                    Err(e) => Err(e),
                }
            }
            Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
            Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
        }
    }
    // Fetch a remote URL and wrap the response body as `InputContent`.
    async fn url_contents(url: &Url) -> Result<InputContent> {
        // Assume HTML for default paths
        let file_type = if url.path().is_empty() || url.path() == "/" {
            FileType::Html
        } else {
            FileType::from(url.as_str())
        };
        let res = reqwest::get(url.clone()).await?;
        let input_content = InputContent {
            input: Input::RemoteUrl(Box::new(url.clone())),
            file_type,
            content: res.text().await?,
        };
        Ok(input_content)
    }
    // Expand the glob pattern (including `~`) and read every matching file.
    async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
        let mut contents = vec![];
        let glob_expanded = tilde(&path_glob);
        let mut match_opts = glob::MatchOptions::new();
        // `ignore_case` inverts glob's default case-sensitive matching.
        match_opts.case_sensitive = !ignore_case;
        for entry in glob_with(&glob_expanded, match_opts)? {
            match entry {
                Ok(path) => {
                    if path.is_dir() {
                        // Directories can still have a suffix which looks like
                        // a file extension like `foo.html`. This can lead to
                        // unexpected behavior with glob patterns like
                        // `**/*.html`. Therefore filter these out.
                        // https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819
                        continue;
                    }
                    let content = Self::path_content(&path)?;
                    contents.push(content);
                }
                // TODO(review): glob iteration errors are printed to stdout
                // instead of being propagated or logged — consider `log`/`Err`.
                Err(e) => println!("{:?}", e),
            }
        }
        Ok(contents)
    }
    /// Get the input content of a given path
    /// # Errors
    ///
    /// Will return `Err` if file contents can't be read
    pub fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
        // Attach the offending path to any I/O error for better diagnostics.
        let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?;
        let input_content = InputContent {
            file_type: FileType::from(path.as_ref()),
            content,
            input: Input::FsPath(path.into()),
        };
        Ok(input_content)
    }
    // Read all of stdin; the file type falls back to plaintext if no hint given.
    async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
        let mut content = String::new();
        let mut stdin = stdin();
        stdin.read_to_string(&mut content).await?;
        let input_content = InputContent {
            input: Input::Stdin,
            file_type: file_type_hint.unwrap_or_default(),
            content,
        };
        Ok(input_content)
    }
    // Wrap a raw string; the file type falls back to plaintext if no hint given.
    fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
        InputContent::from_string(s, file_type_hint.unwrap_or_default())
    }
}

View file

@ -1,14 +1,22 @@
#![allow(unreachable_pub)]
mod base;
mod error;
mod file;
mod input;
mod request;
mod response;
mod status;
mod uri;
pub use base::Base;
pub use error::ErrorKind;
pub use file::FileType;
pub use input::{Input, InputContent};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
pub use uri::Uri;
/// The lychee `Result` type
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;

View file

@ -82,9 +82,16 @@ impl Uri {
}
#[inline]
/// Check if the URI uses the `mailto` scheme.
///
/// Note: this only inspects the scheme; it does not validate the address.
pub(crate) fn is_mail(&self) -> bool {
    self.scheme() == "mailto"
}
#[inline]
/// Check if the URI uses the `file` scheme, i.e. points to a local file.
pub(crate) fn is_file(&self) -> bool {
    self.scheme() == "file"
}
}
impl AsRef<str> for Uri {