mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-05 05:51:00 +00:00
Add support for local files #262
This commit is contained in:
commit
9b5fc399ed
33 changed files with 966 additions and 374 deletions
6
.github/workflows/release.yml
vendored
6
.github/workflows/release.yml
vendored
|
|
@ -43,17 +43,17 @@ jobs:
|
|||
fail-fast: false
|
||||
steps:
|
||||
- name: Install musl tools
|
||||
if: contains(matrix.target, 'musl')
|
||||
if: ${{ contains(matrix.target, 'musl') }}
|
||||
run: sudo apt-get install -y musl-tools
|
||||
|
||||
- name: Install arm tools
|
||||
if: contains(matrix.target, 'arm')
|
||||
if: ${{ contains(matrix.target, 'arm') }}
|
||||
run: |
|
||||
echo "GNU_PREFIX=arm-linux-gnueabihf-" >> $GITHUB_ENV
|
||||
sudo apt-get install -y binutils-arm-linux-gnueabihf
|
||||
|
||||
- name: Install aarch64 tools
|
||||
if: contains(matrix.target, 'aarch64')
|
||||
if: ${{ contains(matrix.target, 'aarch64') }}
|
||||
run: |
|
||||
echo "GNU_PREFIX=aarch64-linux-gnu-" >> $GITHUB_ENV
|
||||
sudo apt-get install -y binutils-aarch64-linux-gnu
|
||||
|
|
|
|||
2
.github/workflows/rust.yml
vendored
2
.github/workflows/rust.yml
vendored
|
|
@ -56,7 +56,7 @@ jobs:
|
|||
- run: cargo-publish-all --dry-run
|
||||
|
||||
publish:
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
if: ${{ startsWith(github.ref, 'refs/tags/') }}
|
||||
needs:
|
||||
- test
|
||||
- lint
|
||||
|
|
|
|||
9
Cargo.lock
generated
9
Cargo.lock
generated
|
|
@ -1406,8 +1406,11 @@ dependencies = [
|
|||
"http",
|
||||
"hubcaps",
|
||||
"linkify",
|
||||
"log",
|
||||
"markup5ever_rcdom",
|
||||
"openssl-sys",
|
||||
"path-clean",
|
||||
"percent-encoding",
|
||||
"pretty_assertions",
|
||||
"pulldown-cmark",
|
||||
"regex",
|
||||
|
|
@ -1732,6 +1735,12 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "path-clean"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ecba01bf2678719532c5e3059e0b5f0811273d94b397088b82e3bd0a78c78fdd"
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "0.8.3"
|
||||
|
|
|
|||
6
Makefile
6
Makefile
|
|
@ -18,10 +18,14 @@ docker-run: ## Run Docker image
|
|||
docker-push: ## Push image to Docker Hub
|
||||
docker push $(IMAGE_NAME)
|
||||
|
||||
.PHONY: build-local
|
||||
.PHONY: build
|
||||
build: ## Build Rust code locally
|
||||
cargo build
|
||||
|
||||
.PHONY: install
|
||||
install: ## Install project locally
|
||||
cargo install --path lychee-bin
|
||||
|
||||
.PHONY: run
|
||||
run: ## Run Rust code locally
|
||||
cargo run
|
||||
|
|
|
|||
11
README.md
11
README.md
|
|
@ -161,11 +161,15 @@ lychee ~/projects/*/README.md
|
|||
|
||||
# check links in local files (lychee supports advanced globbing and ~ expansion):
|
||||
lychee "~/projects/big_project/**/README.*"
|
||||
|
||||
# ignore case when globbing and check result for each link:
|
||||
lychee --glob-ignore-case --verbose "~/projects/**/[r]eadme.*"
|
||||
|
||||
# check links from epub file (requires atool: https://www.nongnu.org/atool)
|
||||
acat -F zip {file.epub} "*.xhtml" "*.html" | lychee -
|
||||
|
||||
# check links in directory; block network requests
|
||||
lychee --offline path/to/directory
|
||||
```
|
||||
|
||||
### GitHub token
|
||||
|
|
@ -202,6 +206,7 @@ FLAGS:
|
|||
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
|
||||
-n, --no-progress Do not show progress bar.
|
||||
This is recommended for non-interactive shells (e.g. for continuous integration)
|
||||
--offline Only check local files and block network requests
|
||||
--require-https When HTTPS is available, treat HTTP links as errors
|
||||
--skip-missing Skip missing input files (default is to error if they don't exist)
|
||||
-V, --version Prints version information
|
||||
|
|
@ -209,7 +214,8 @@ FLAGS:
|
|||
|
||||
OPTIONS:
|
||||
-a, --accept <accept> Comma-separated list of accepted status codes for valid links
|
||||
-b, --base-url <base-url> Base URL to check relative URLs
|
||||
-b, --base <base> Base URL or website root directory to check relative URLs e.g.
|
||||
https://example.org or `/path/to/public`
|
||||
--basic-auth <basic-auth> Basic authentication support. E.g. `username:password`
|
||||
-c, --config <config-file> Configuration file to use [default: ./lychee.toml]
|
||||
--exclude <exclude>... Exclude URLs from checking (supports regex)
|
||||
|
|
@ -310,7 +316,8 @@ Try one of these links to get started:
|
|||
- [good first issues](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
|
||||
- [help wanted](https://github.com/lycheeverse/lychee/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22)
|
||||
|
||||
Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started. Begin my making sure the following commands succeed without errors.
|
||||
Lychee is written in Rust. Install [rust-up](https://rustup.rs/) to get started.
|
||||
Begin my making sure the following commands succeed without errors.
|
||||
|
||||
```bash
|
||||
cargo test # runs tests
|
||||
|
|
|
|||
|
|
@ -14,12 +14,12 @@ async fn main() -> Result<()> {
|
|||
];
|
||||
|
||||
let links = Collector::new(
|
||||
None, // base_url
|
||||
None, // base
|
||||
false, // don't skip missing inputs
|
||||
10, // max concurrency
|
||||
)
|
||||
.collect_links(
|
||||
inputs, // base_url
|
||||
inputs, // base url or directory
|
||||
)
|
||||
.await?;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
This link should be ignored as it is not a fully qualified URL.
|
||||

|
||||
Check file link
|
||||

|
||||
|
||||

|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,3 @@
|
|||
slack://channel?id=123
|
||||
file://foo/bar
|
||||
file:///test_folder/test_file
|
||||
https://example.org
|
||||
|
|
|
|||
0
fixtures/offline/404.html/.gitkeep
Normal file
0
fixtures/offline/404.html/.gitkeep
Normal file
21
fixtures/offline/about/index.html
Normal file
21
fixtures/offline/about/index.html
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>About</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>About</h1>
|
||||
<p>
|
||||
<ul>
|
||||
<li>
|
||||
<a href="https://example.org">example</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/">home</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/post1">Post 1</a>
|
||||
</li>
|
||||
</ul>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
0
fixtures/offline/another page/.gitkeep
Normal file
0
fixtures/offline/another page/.gitkeep
Normal file
21
fixtures/offline/blog/post1/index.html
Normal file
21
fixtures/offline/blog/post1/index.html
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>Post 2</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Post 2 Title</h1>
|
||||
<p>
|
||||
<ul>
|
||||
<li>
|
||||
<a href="/">home</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/post1">Post 1</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="../about">Relative</a>
|
||||
</li>
|
||||
</ul>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
18
fixtures/offline/blog/post2/index.html
Normal file
18
fixtures/offline/blog/post2/index.html
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>Post 1</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Post 1 Title</h1>
|
||||
<p>
|
||||
<ul>
|
||||
<li>
|
||||
<a href="/">home</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/post2">Post 2</a>
|
||||
</li>
|
||||
</ul>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
27
fixtures/offline/index.html
Normal file
27
fixtures/offline/index.html
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>Index</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Index Title</h1>
|
||||
<p>
|
||||
<ul>
|
||||
<li>
|
||||
<a href="/">home</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/about">About</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/about#fragment">About</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/another page">About</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="/another%20page">About</a>
|
||||
</li>
|
||||
</ul>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -70,10 +70,7 @@ use anyhow::{anyhow, Context, Result};
|
|||
use headers::{authorization::Basic, Authorization, HeaderMap, HeaderMapExt, HeaderName};
|
||||
use http::StatusCode;
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use lychee_lib::{
|
||||
collector::{Collector, Input},
|
||||
ClientBuilder, ClientPool, Response,
|
||||
};
|
||||
use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Response};
|
||||
use openssl_sys as _; // required for vendored-openssl feature
|
||||
use regex::RegexSet;
|
||||
use ring as _; // required for apple silicon
|
||||
|
|
@ -178,6 +175,13 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
|
|||
let include = RegexSet::new(&cfg.include)?;
|
||||
let exclude = RegexSet::new(&cfg.exclude)?;
|
||||
|
||||
// Offline mode overrides the scheme
|
||||
let schemes = if cfg.offline {
|
||||
vec!["file".to_string()]
|
||||
} else {
|
||||
cfg.scheme.clone()
|
||||
};
|
||||
|
||||
let client = ClientBuilder::builder()
|
||||
.includes(include)
|
||||
.excludes(exclude)
|
||||
|
|
@ -193,14 +197,14 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
|
|||
.method(method)
|
||||
.timeout(timeout)
|
||||
.github_token(cfg.github_token.clone())
|
||||
.schemes(HashSet::from_iter(cfg.scheme.clone()))
|
||||
.schemes(HashSet::from_iter(schemes))
|
||||
.accepted(accepted)
|
||||
.require_https(cfg.require_https)
|
||||
.build()
|
||||
.client()
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
|
||||
let links = Collector::new(cfg.base_url.clone(), cfg.skip_missing, max_concurrency)
|
||||
let links = Collector::new(cfg.base.clone(), cfg.skip_missing, max_concurrency)
|
||||
.collect_links(&inputs)
|
||||
.await
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,8 @@
|
|||
use std::{fs, io::ErrorKind, path::PathBuf, str::FromStr};
|
||||
use std::{convert::TryFrom, fs, io::ErrorKind, path::PathBuf, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use lychee_lib::collector::Input;
|
||||
use reqwest::Url;
|
||||
use lychee_lib::{Base, Input};
|
||||
use serde::Deserialize;
|
||||
use structopt::{clap::crate_version, StructOpt};
|
||||
|
||||
|
|
@ -76,6 +75,10 @@ macro_rules! fold_in {
|
|||
};
|
||||
}
|
||||
|
||||
fn parse_base(src: &str) -> Result<Base, lychee_lib::ErrorKind> {
|
||||
Base::try_from(src)
|
||||
}
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
#[structopt(
|
||||
name = "lychee",
|
||||
|
|
@ -161,6 +164,11 @@ pub(crate) struct Config {
|
|||
#[serde(default)]
|
||||
pub(crate) scheme: Vec<String>,
|
||||
|
||||
/// Only check local files and block network requests.
|
||||
#[structopt(long)]
|
||||
#[serde(default)]
|
||||
pub(crate) offline: bool,
|
||||
|
||||
/// URLs to check (supports regex). Has preference over all excludes.
|
||||
#[structopt(long)]
|
||||
#[serde(default)]
|
||||
|
|
@ -223,10 +231,11 @@ pub(crate) struct Config {
|
|||
#[serde(default = "method")]
|
||||
pub(crate) method: String,
|
||||
|
||||
/// Base URL to check relative URLs
|
||||
#[structopt(short, long, parse(try_from_str))]
|
||||
/// Base URL or website root directory to check relative URLs
|
||||
/// e.g. https://example.org or `/path/to/public`
|
||||
#[structopt(short, long, parse(try_from_str = parse_base))]
|
||||
#[serde(default)]
|
||||
pub(crate) base_url: Option<Url>,
|
||||
pub(crate) base: Option<Base>,
|
||||
|
||||
/// Basic authentication support. E.g. `username:password`
|
||||
#[structopt(long)]
|
||||
|
|
@ -311,7 +320,7 @@ impl Config {
|
|||
accept: None;
|
||||
timeout: TIMEOUT;
|
||||
method: METHOD;
|
||||
base_url: None;
|
||||
base: None;
|
||||
basic_auth: None;
|
||||
github_token: None;
|
||||
skip_missing: false;
|
||||
|
|
|
|||
|
|
@ -133,15 +133,38 @@ mod cli {
|
|||
|
||||
/// Test unsupported URI schemes
|
||||
#[test]
|
||||
fn test_unsupported_uri_schemes() -> Result<()> {
|
||||
test_json_output!(
|
||||
"TEST_SCHEMES.txt",
|
||||
MockResponseStats {
|
||||
total: 1,
|
||||
successful: 1,
|
||||
..MockResponseStats::default()
|
||||
}
|
||||
)
|
||||
fn test_unsupported_uri_schemes() {
|
||||
let mut cmd = main_command();
|
||||
let test_schemes_path = fixtures_path().join("TEST_SCHEMES.txt");
|
||||
|
||||
// Exclude file link because it doesn't exist on the filesystem.
|
||||
// (File URIs are absolute paths, which we don't have.)
|
||||
// Nevertheless, the `file` scheme should be recognized.
|
||||
cmd.arg(test_schemes_path)
|
||||
.arg("--exclude")
|
||||
.arg("file://")
|
||||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("Total............2"))
|
||||
.stdout(contains("Successful.......1"))
|
||||
.stdout(contains("Excluded.........1"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_paths() {
|
||||
let mut cmd = main_command();
|
||||
let offline_dir = fixtures_path().join("offline");
|
||||
|
||||
cmd.arg("--offline")
|
||||
.arg("--base")
|
||||
.arg(&offline_dir)
|
||||
.arg(&offline_dir.join("index.html"))
|
||||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("Total............3"))
|
||||
.stdout(contains("Successful.......3"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -367,7 +390,7 @@ mod cli {
|
|||
.assert()
|
||||
.success();
|
||||
|
||||
let expected = r#"{"total":10,"successful":10,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
|
||||
let expected = r#"{"total":11,"successful":11,"failures":0,"unknown":0,"timeouts":0,"redirects":0,"excludes":0,"errors":0,"fail_map":{}}"#;
|
||||
let output = fs::read_to_string(&outfile)?;
|
||||
assert_eq!(output.split_whitespace().collect::<String>(), expected);
|
||||
fs::remove_file(outfile)?;
|
||||
|
|
@ -385,7 +408,7 @@ mod cli {
|
|||
.arg(".*")
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("Excluded........10"));
|
||||
.stdout(contains("Excluded........11"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
36
lychee-bin/tests/local_files.rs
Normal file
36
lychee-bin/tests/local_files.rs
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
#[cfg(test)]
|
||||
mod cli {
|
||||
use std::{fs::File, io::Write};
|
||||
|
||||
use assert_cmd::Command;
|
||||
use lychee_lib::Result;
|
||||
use predicates::str::contains;
|
||||
|
||||
fn main_command() -> Command {
|
||||
// this gets the "main" binary name (e.g. `lychee`)
|
||||
Command::cargo_bin(env!("CARGO_PKG_NAME")).expect("Couldn't get cargo package name")
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_local_file() -> Result<()> {
|
||||
let dir = tempfile::tempdir()?;
|
||||
let index_path = dir.path().join("index.html");
|
||||
let mut index = File::create(&index_path)?;
|
||||
writeln!(index, r#"<a href="./foo.html">Foo</a>"#)?;
|
||||
|
||||
let foo_path = dir.path().join("foo.html");
|
||||
File::create(&foo_path)?;
|
||||
|
||||
let mut cmd = main_command();
|
||||
cmd.arg(index_path)
|
||||
.arg("--no-progress")
|
||||
.arg("--verbose")
|
||||
.env_clear()
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(contains("Total............1"))
|
||||
.stdout(contains("foo.html"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -40,6 +40,9 @@ shellexpand = "2.1.0"
|
|||
tokio = { version = "1.6.0", features = ["full"] }
|
||||
typed-builder = "0.9.1"
|
||||
url = { version = "2.2.2", features = ["serde"] }
|
||||
log = "0.4.14"
|
||||
path-clean = "0.1.0"
|
||||
percent-encoding = "2.1.0"
|
||||
|
||||
[dev-dependencies]
|
||||
doc-comment = "0.3.3"
|
||||
|
|
|
|||
|
|
@ -20,8 +20,7 @@ use typed_builder::TypedBuilder;
|
|||
use crate::{
|
||||
filter::{Excludes, Filter, Includes},
|
||||
quirks::Quirks,
|
||||
uri::Uri,
|
||||
ErrorKind, Request, Response, Result, Status,
|
||||
ErrorKind, Request, Response, Result, Status, Uri,
|
||||
};
|
||||
|
||||
const DEFAULT_MAX_REDIRECTS: usize = 5;
|
||||
|
|
@ -178,6 +177,8 @@ impl Client {
|
|||
let Request { uri, source } = Request::try_from(request)?;
|
||||
let status = if self.filter.is_excluded(&uri) {
|
||||
Status::Excluded
|
||||
} else if uri.is_file() {
|
||||
self.check_file(&uri).await
|
||||
} else if uri.is_mail() {
|
||||
self.check_mail(&uri).await
|
||||
} else {
|
||||
|
|
@ -255,6 +256,15 @@ impl Client {
|
|||
}
|
||||
}
|
||||
|
||||
pub async fn check_file(&self, uri: &Uri) -> Status {
|
||||
if let Ok(path) = uri.url.to_file_path() {
|
||||
if path.exists() {
|
||||
return Status::Ok(StatusCode::OK);
|
||||
}
|
||||
}
|
||||
ErrorKind::InvalidFilePath(uri.clone()).into()
|
||||
}
|
||||
|
||||
pub async fn check_mail(&self, uri: &Uri) -> Status {
|
||||
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
|
||||
let result = &(check_email(&input).await)[0];
|
||||
|
|
@ -284,11 +294,13 @@ where
|
|||
mod test {
|
||||
use std::{
|
||||
convert::TryInto,
|
||||
fs::File,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use http::{header::HeaderMap, StatusCode};
|
||||
use reqwest::header;
|
||||
use tempfile::tempdir;
|
||||
|
||||
use super::ClientBuilder;
|
||||
use crate::{mock_server, test_utils::get_mock_client_response, Uri};
|
||||
|
|
@ -373,6 +385,17 @@ mod test {
|
|||
assert!(res.status().is_success());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_file() {
|
||||
let dir = tempdir().unwrap();
|
||||
let file = dir.path().join("temp");
|
||||
File::create(file).unwrap();
|
||||
let uri = format!("file://{}", dir.path().join("temp").to_str().unwrap());
|
||||
|
||||
let res = get_mock_client_response(uri).await;
|
||||
assert!(res.status().is_success());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_custom_headers() {
|
||||
// See https://github.com/rust-lang/crates.io/issues/788
|
||||
|
|
|
|||
|
|
@ -1,223 +1,10 @@
|
|||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::Display,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use glob::glob_with;
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use shellexpand::tilde;
|
||||
use tokio::{
|
||||
fs::read_to_string,
|
||||
io::{stdin, AsyncReadExt},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
extract::{extract_links, FileType},
|
||||
uri::Uri,
|
||||
Request, Result,
|
||||
};
|
||||
|
||||
const STDIN: &str = "-";
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
#[non_exhaustive]
|
||||
/// An exhaustive list of input sources, which lychee accepts
|
||||
pub enum Input {
|
||||
/// URL (of HTTP/HTTPS scheme).
|
||||
RemoteUrl(Box<Url>),
|
||||
/// Unix shell-style glob pattern.
|
||||
FsGlob {
|
||||
/// The glob pattern matching all input files
|
||||
pattern: String,
|
||||
/// Don't be case sensitive when matching files against a glob
|
||||
ignore_case: bool,
|
||||
},
|
||||
/// File path.
|
||||
FsPath(PathBuf),
|
||||
/// Standard Input.
|
||||
Stdin,
|
||||
/// Raw string input.
|
||||
String(String),
|
||||
}
|
||||
|
||||
impl Serialize for Input {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Input {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(match self {
|
||||
Input::RemoteUrl(url) => url.as_str(),
|
||||
Input::FsGlob { pattern, .. } => pattern,
|
||||
Input::FsPath(path) => path.to_str().unwrap_or_default(),
|
||||
Input::Stdin => "stdin",
|
||||
Input::String(_) => "raw input string",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
/// Encapsulates the content for a given input
|
||||
pub struct InputContent {
|
||||
/// Input source
|
||||
pub input: Input,
|
||||
/// File type of given input
|
||||
pub file_type: FileType,
|
||||
/// Raw UTF-8 string content
|
||||
pub content: String,
|
||||
}
|
||||
|
||||
impl InputContent {
|
||||
#[must_use]
|
||||
/// Create an instance of `InputContent` from an input string
|
||||
pub fn from_string(s: &str, file_type: FileType) -> Self {
|
||||
// TODO: consider using Cow (to avoid one .clone() for String types)
|
||||
Self {
|
||||
input: Input::String(s.to_owned()),
|
||||
file_type,
|
||||
content: s.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Input {
|
||||
#[must_use]
|
||||
/// Construct a new `Input` source. In case the input is a `glob` pattern,
|
||||
/// `glob_ignore_case` decides whether matching files against the `glob` is
|
||||
/// case-insensitive or not
|
||||
pub fn new(value: &str, glob_ignore_case: bool) -> Self {
|
||||
if value == STDIN {
|
||||
Self::Stdin
|
||||
} else if let Ok(url) = Url::parse(value) {
|
||||
Self::RemoteUrl(Box::new(url))
|
||||
} else {
|
||||
// this seems to be the only way to determine if this is a glob pattern
|
||||
let is_glob = glob::Pattern::escape(value) != value;
|
||||
|
||||
if is_glob {
|
||||
Self::FsGlob {
|
||||
pattern: value.to_owned(),
|
||||
ignore_case: glob_ignore_case,
|
||||
}
|
||||
} else {
|
||||
Self::FsPath(value.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::missing_panics_doc)]
|
||||
/// Retrieve the contents from the input
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns an error if the contents can not be retrieved
|
||||
/// because of an underlying I/O error (e.g. an error while making a
|
||||
/// network request or retrieving the contents from the file system)
|
||||
pub async fn get_contents(
|
||||
&self,
|
||||
file_type_hint: Option<FileType>,
|
||||
skip_missing: bool,
|
||||
) -> Result<Vec<InputContent>> {
|
||||
match *self {
|
||||
// TODO: should skip_missing also affect URLs?
|
||||
Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
|
||||
Input::FsGlob {
|
||||
ref pattern,
|
||||
ignore_case,
|
||||
} => Ok(Self::glob_contents(pattern, ignore_case).await?),
|
||||
Input::FsPath(ref path) => {
|
||||
let content = Self::path_content(path).await;
|
||||
match content {
|
||||
Ok(input_content) => Ok(vec![input_content]),
|
||||
Err(_) if skip_missing => Ok(vec![]),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
|
||||
Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
|
||||
}
|
||||
}
|
||||
|
||||
async fn url_contents(url: &Url) -> Result<InputContent> {
|
||||
// Assume HTML for default paths
|
||||
let file_type = if url.path().is_empty() || url.path() == "/" {
|
||||
FileType::Html
|
||||
} else {
|
||||
FileType::from(url.as_str())
|
||||
};
|
||||
|
||||
let res = reqwest::get(url.clone()).await?;
|
||||
let input_content = InputContent {
|
||||
input: Input::RemoteUrl(Box::new(url.clone())),
|
||||
file_type,
|
||||
content: res.text().await?,
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
|
||||
let mut contents = vec![];
|
||||
let glob_expanded = tilde(&path_glob);
|
||||
let mut match_opts = glob::MatchOptions::new();
|
||||
|
||||
match_opts.case_sensitive = !ignore_case;
|
||||
|
||||
for entry in glob_with(&glob_expanded, match_opts)? {
|
||||
match entry {
|
||||
Ok(path) => {
|
||||
let content = Self::path_content(&path).await?;
|
||||
contents.push(content);
|
||||
}
|
||||
Err(e) => println!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(contents)
|
||||
}
|
||||
|
||||
async fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
|
||||
let content = read_to_string(&path)
|
||||
.await
|
||||
.map_err(|e| (path.clone().into(), e))?;
|
||||
let input_content = InputContent {
|
||||
file_type: FileType::from(path.as_ref()),
|
||||
content,
|
||||
input: Input::FsPath(path.into()),
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
|
||||
let mut content = String::new();
|
||||
let mut stdin = stdin();
|
||||
stdin.read_to_string(&mut content).await?;
|
||||
|
||||
let input_content = InputContent {
|
||||
input: Input::Stdin,
|
||||
file_type: file_type_hint.unwrap_or_default(),
|
||||
content,
|
||||
};
|
||||
|
||||
Ok(input_content)
|
||||
}
|
||||
|
||||
fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
|
||||
InputContent::from_string(s, file_type_hint.unwrap_or_default())
|
||||
}
|
||||
}
|
||||
use crate::{extract::extract_links, Base, Input, Request, Result, Uri};
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Collector keeps the state of link collection
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Collector {
|
||||
base_url: Option<Url>,
|
||||
base: Option<Base>,
|
||||
skip_missing_inputs: bool,
|
||||
max_concurrency: usize,
|
||||
cache: HashSet<Uri>,
|
||||
|
|
@ -226,9 +13,9 @@ pub struct Collector {
|
|||
impl Collector {
|
||||
/// Create a new collector with an empty cache
|
||||
#[must_use]
|
||||
pub fn new(base_url: Option<Url>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
|
||||
pub fn new(base: Option<Base>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
|
||||
Collector {
|
||||
base_url,
|
||||
base,
|
||||
skip_missing_inputs,
|
||||
max_concurrency,
|
||||
cache: HashSet::new(),
|
||||
|
|
@ -236,7 +23,8 @@ impl Collector {
|
|||
}
|
||||
|
||||
/// Fetch all unique links from a slice of inputs
|
||||
/// All relative URLs get prefixed with `base_url` if given.
|
||||
/// All relative URLs get prefixed with `base` if given.
|
||||
/// (This can be a directory or a base URL)
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
|
|
@ -263,9 +51,9 @@ impl Collector {
|
|||
|
||||
while let Some(result) = contents_rx.recv().await {
|
||||
for input_content in result? {
|
||||
let base_url = self.base_url.clone();
|
||||
let base = self.base.clone();
|
||||
let handle =
|
||||
tokio::task::spawn_blocking(move || extract_links(&input_content, &base_url));
|
||||
tokio::task::spawn_blocking(move || extract_links(&input_content, &base));
|
||||
extract_links_handles.push(handle);
|
||||
}
|
||||
}
|
||||
|
|
@ -278,7 +66,7 @@ impl Collector {
|
|||
|
||||
for handle in extract_links_handles {
|
||||
let new_links = handle.await?;
|
||||
links.extend(new_links);
|
||||
links.extend(new_links?);
|
||||
}
|
||||
|
||||
// Filter out already cached links (duplicates)
|
||||
|
|
@ -304,9 +92,9 @@ mod test {
|
|||
|
||||
use super::*;
|
||||
use crate::{
|
||||
extract::FileType,
|
||||
mock_server,
|
||||
test_utils::{mail, website},
|
||||
types::{FileType, Input},
|
||||
Result, Uri,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -1,66 +1,62 @@
|
|||
use std::{collections::HashSet, convert::TryFrom, path::Path};
|
||||
use std::{collections::HashSet, convert::TryFrom, path::Path, path::PathBuf};
|
||||
|
||||
use html5ever::{
|
||||
parse_document,
|
||||
tendril::{StrTendril, TendrilSink},
|
||||
};
|
||||
use linkify::LinkFinder;
|
||||
use log::info;
|
||||
use markup5ever_rcdom::{Handle, NodeData, RcDom};
|
||||
use percent_encoding::percent_decode_str;
|
||||
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
|
||||
use url::Url;
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::{collector::InputContent, Request, Uri};
|
||||
use crate::{
|
||||
helpers::{path, url},
|
||||
types::{FileType, InputContent},
|
||||
Base, ErrorKind, Input, Request, Result, Uri,
|
||||
};
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
/// `FileType` defines which file types lychee can handle
|
||||
pub enum FileType {
|
||||
/// File in HTML format
|
||||
Html,
|
||||
/// File in Markdown format
|
||||
Markdown,
|
||||
/// Generic text file without syntax-specific parsing
|
||||
Plaintext,
|
||||
}
|
||||
/// Main entrypoint for extracting links from various sources
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
pub(crate) fn extract_links(
|
||||
input_content: &InputContent,
|
||||
base: &Option<Base>,
|
||||
) -> Result<HashSet<Request>> {
|
||||
let links = match input_content.file_type {
|
||||
FileType::Markdown => extract_links_from_markdown(&input_content.content),
|
||||
FileType::Html => extract_links_from_html(&input_content.content),
|
||||
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
|
||||
};
|
||||
|
||||
impl Default for FileType {
|
||||
fn default() -> Self {
|
||||
Self::Plaintext
|
||||
// Only keep legit URLs. For example this filters out anchors.
|
||||
let mut requests: HashSet<Request> = HashSet::new();
|
||||
for link in links {
|
||||
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
|
||||
Request::new(uri, input_content.input.clone())
|
||||
} else if let Some(url) = base.as_ref().and_then(|u| u.join(&link)) {
|
||||
Request::new(Uri { url }, input_content.input.clone())
|
||||
} else if let Input::FsPath(root) = &input_content.input {
|
||||
if url::is_anchor(&link) {
|
||||
// Silently ignore anchor links for now
|
||||
continue;
|
||||
}
|
||||
let url = create_uri_from_path(root, base, &link)?;
|
||||
Request::new(Uri { url }, input_content.input.clone())
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &link);
|
||||
continue;
|
||||
};
|
||||
requests.insert(req);
|
||||
}
|
||||
Ok(requests)
|
||||
}
|
||||
|
||||
impl<P: AsRef<Path>> From<P> for FileType {
|
||||
/// Detect if the given path points to a Markdown, HTML, or plaintext file.
|
||||
fn from(p: P) -> FileType {
|
||||
let path = p.as_ref();
|
||||
// Assume HTML in case of no extension.
|
||||
// Note: this is only reasonable for URLs; not paths on disk.
|
||||
// For example, `README` without an extension is more likely to be a plaintext file.
|
||||
// A better solution would be to also implement `From<Url> for FileType`.
|
||||
// Unfortunately that's not possible without refactoring, as
|
||||
// `AsRef<Path>` could be implemented for `Url` in the future, which is why
|
||||
// `From<Url> for FileType` is not allowed.
|
||||
match path.extension().and_then(std::ffi::OsStr::to_str) {
|
||||
Some("md" | "markdown") => FileType::Markdown,
|
||||
Some("htm" | "html") | None => FileType::Html,
|
||||
Some(_) => FileType::Plaintext,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use LinkFinder here to offload the actual link searching in plaintext.
|
||||
fn find_links(input: &str) -> Vec<linkify::Link> {
|
||||
let finder = LinkFinder::new();
|
||||
finder.links(input).collect()
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a markdown string.
|
||||
/// Extract unparsed URL strings from a Markdown string.
|
||||
fn extract_links_from_markdown(input: &str) -> Vec<String> {
|
||||
let parser = Parser::new(input);
|
||||
parser
|
||||
.flat_map(|event| match event {
|
||||
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => {
|
||||
vec![url.to_string()]
|
||||
}
|
||||
MDEvent::Start(Tag::Link(_, url, _) | Tag::Image(_, url, _)) => vec![url.to_string()],
|
||||
MDEvent::Text(txt) => extract_links_from_plaintext(&txt.to_string()),
|
||||
MDEvent::Html(html) => extract_links_from_html(&html.to_string()),
|
||||
_ => vec![],
|
||||
|
|
@ -68,15 +64,15 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
|
|||
.collect()
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a HTML string.
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
fn extract_links_from_html(input: &str) -> Vec<String> {
|
||||
let tendril = StrTendril::from(input);
|
||||
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
|
||||
|
||||
let mut urls = Vec::new();
|
||||
|
||||
// we pass mutable urls reference to avoid extra allocations in each
|
||||
// recursive descent
|
||||
// We pass mutable URL references here to avoid
|
||||
// extra allocations in each recursive descent
|
||||
walk_html_links(&mut urls, &rc_dom.document);
|
||||
|
||||
urls
|
||||
|
|
@ -101,7 +97,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
|
|||
for attr in attrs.borrow().iter() {
|
||||
let attr_value = attr.value.to_string();
|
||||
|
||||
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
|
||||
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
|
||||
urls.push(attr_value);
|
||||
} else {
|
||||
urls.append(&mut extract_links_from_plaintext(&attr_value));
|
||||
|
|
@ -113,56 +109,34 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
|
|||
}
|
||||
|
||||
// recursively traverse the document's nodes -- this doesn't need any extra
|
||||
// exit conditions because the document is a tree
|
||||
// exit conditions, because the document is a tree
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_html_links(&mut urls, child);
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine if element's attribute contains a link / URL.
|
||||
fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
|
||||
// See a comprehensive list of attributes that might contain URLs/URIs
|
||||
// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
|
||||
matches!(
|
||||
(attr_name, elem_name),
|
||||
("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
|
||||
)
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a plaintext.
|
||||
/// Extract unparsed URL strings from plaintext
|
||||
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
|
||||
find_links(input)
|
||||
url::find_links(input)
|
||||
.iter()
|
||||
.map(|l| String::from(l.as_str()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub(crate) fn extract_links(
|
||||
input_content: &InputContent,
|
||||
base_url: &Option<Url>,
|
||||
) -> HashSet<Request> {
|
||||
let links = match input_content.file_type {
|
||||
FileType::Markdown => extract_links_from_markdown(&input_content.content),
|
||||
FileType::Html => extract_links_from_html(&input_content.content),
|
||||
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
|
||||
};
|
||||
|
||||
// Only keep legit URLs. This sorts out things like anchors.
|
||||
// Silently ignore the parse failures for now.
|
||||
let mut requests: HashSet<Request> = HashSet::new();
|
||||
for link in links {
|
||||
if let Ok(uri) = Uri::try_from(link.as_str()) {
|
||||
requests.insert(Request::new(uri, input_content.input.clone()));
|
||||
} else if !Path::new(&link).exists() {
|
||||
if let Some(new_url) = base_url.as_ref().and_then(|u| u.join(&link).ok()) {
|
||||
requests.insert(Request::new(
|
||||
Uri { url: new_url },
|
||||
input_content.input.clone(),
|
||||
));
|
||||
}
|
||||
};
|
||||
}
|
||||
requests
|
||||
fn create_uri_from_path(src: &Path, base: &Option<Base>, dst: &str) -> Result<Url> {
|
||||
let dst = url::remove_get_params_and_fragment(dst);
|
||||
// Avoid double-encoding already encoded destination paths by removing any
|
||||
// potential encoding (e.g. `web%20site` becomes `web site`).
|
||||
// That's because Url::from_file_path will encode the full URL in the end.
|
||||
// This behavior cannot be configured.
|
||||
// See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411
|
||||
// TODO: This is not a perfect solution.
|
||||
// Ideally, only `src` and `base` should be URL encoded (as is done by
|
||||
// `from_file_path` at the moment) while `dst` is left untouched and simply
|
||||
// appended to the end.
|
||||
let decoded = percent_decode_str(dst).decode_utf8()?.to_string();
|
||||
let path = path::resolve(src, &PathBuf::from(decoded), base)?;
|
||||
Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidUrl(path))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -176,17 +150,24 @@ mod test {
|
|||
};
|
||||
|
||||
use pretty_assertions::assert_eq;
|
||||
use url::Url;
|
||||
|
||||
use super::{
|
||||
extract_links, extract_links_from_html, extract_links_from_markdown,
|
||||
extract_links_from_plaintext, find_links, FileType,
|
||||
};
|
||||
use super::*;
|
||||
use crate::{
|
||||
collector::InputContent,
|
||||
helpers::url::find_links,
|
||||
test_utils::{mail, website},
|
||||
Uri,
|
||||
};
|
||||
use crate::{
|
||||
types::{FileType, InputContent},
|
||||
Base,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_create_uri_from_path() {
|
||||
let result =
|
||||
create_uri_from_path(&PathBuf::from("/README.md"), &None, "test+encoding").unwrap();
|
||||
assert_eq!(result.as_str(), "file:///test+encoding");
|
||||
}
|
||||
|
||||
fn load_fixture(filename: &str) -> String {
|
||||
let fixture_path = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
|
|
@ -207,13 +188,13 @@ mod test {
|
|||
}
|
||||
|
||||
fn extract_uris(input: &str, file_type: FileType, base_url: Option<&str>) -> HashSet<Uri> {
|
||||
extract_links(
|
||||
&InputContent::from_string(input, file_type),
|
||||
&base_url.map(|u| Url::parse(u).unwrap()),
|
||||
)
|
||||
.into_iter()
|
||||
.map(|r| r.uri)
|
||||
.collect()
|
||||
let base = base_url.map(|url| Base::Remote(Url::parse(url).unwrap()));
|
||||
extract_links(&InputContent::from_string(input, file_type), &base)
|
||||
// unwrap is fine here as this helper function is only used in tests
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|r| r.uri)
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ use std::{collections::HashSet, net::IpAddr};
|
|||
pub use excludes::Excludes;
|
||||
pub use includes::Includes;
|
||||
|
||||
use crate::uri::Uri;
|
||||
use crate::Uri;
|
||||
|
||||
/// Pre-defined exclusions for known false-positives
|
||||
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
|
||||
|
|
|
|||
2
lychee-lib/src/helpers/mod.rs
Normal file
2
lychee-lib/src/helpers/mod.rs
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
// Internal helper modules (path resolution and URL scanning); crate-private,
// not part of the public API.
pub(crate) mod path;
pub(crate) mod url;
|
||||
141
lychee-lib/src/helpers/path.rs
Normal file
141
lychee-lib/src/helpers/path.rs
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
use crate::{Base, ErrorKind, Result};
|
||||
use path_clean::PathClean;
|
||||
use std::env;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
// Returns the base if it is a valid `PathBuf`
|
||||
fn get_base_dir(base: &Option<Base>) -> Option<PathBuf> {
|
||||
base.as_ref().and_then(Base::dir)
|
||||
}
|
||||
|
||||
// https://stackoverflow.com/a/54817755/270334
|
||||
pub(crate) fn absolute_path(path: impl AsRef<Path>) -> Result<PathBuf> {
|
||||
let path = path.as_ref();
|
||||
|
||||
let absolute_path = if path.is_absolute() {
|
||||
path.to_path_buf()
|
||||
} else {
|
||||
env::current_dir()?.join(path)
|
||||
}
|
||||
.clean();
|
||||
|
||||
Ok(absolute_path)
|
||||
}
|
||||
|
||||
// Get the parent directory of a given `Path`.
//
// If `src` is an existing file, return the directory containing it (or an
// empty `PathBuf` if it has no parent). Otherwise (a directory, or a path
// that does not exist on disk) return `src` unchanged.
fn dirname(src: &Path) -> PathBuf {
    if src.is_file() {
        // `Path::parent` borrows; no need to clone into a `PathBuf` first
        // as the original did. `map_or_else` keeps the empty-PathBuf
        // fallback lazy.
        src.parent().map_or_else(PathBuf::new, Path::to_path_buf)
    } else {
        src.to_path_buf()
    }
}
|
||||
|
||||
// Resolve `dst` that was linked to from within `src`
//
// Relative `dst` is looked up next to `src` (i.e. in `src`'s parent
// directory). Absolute `dst` (leading slash) is resolved against the
// configured base directory, which acts as the document root.
// Note the fall-through: a relative `dst` whose `src` has no parent, and
// any other case, ends in `FileNotFound`.
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
    if dst.is_relative() {
        // Find `dst` in the parent directory of `src`
        if let Some(parent) = src.parent() {
            let rel_path = parent.join(dst.to_path_buf());
            return absolute_path(&rel_path);
        }
    }
    if dst.is_absolute() {
        // Absolute local links (leading slash) require the `base_url` to
        // define the document root.
        let base = get_base_dir(base).ok_or_else(|| {
            ErrorKind::InvalidBase(
                "<empty>".to_string(),
                format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
            )
        })?;
        // Concatenate base dir and `dst` lexically, then normalize.
        let abs_path = join(dirname(&base), dst);
        return absolute_path(&abs_path);
    }
    Err(ErrorKind::FileNotFound(dst.to_path_buf()))
}
|
||||
|
||||
// A cumbersome way to concatenate paths without checking their
// existence on disk. See https://github.com/rust-lang/rust/issues/16507
//
// Unlike `PathBuf::push`, pushing onto an `OsString` never discards the
// base when `dst` is absolute — the two are glued together verbatim.
fn join(base: PathBuf, dst: &Path) -> PathBuf {
    let mut combined = base.into_os_string();
    combined.push(dst.as_os_str());
    PathBuf::from(combined)
}
|
||||
|
||||
#[cfg(test)]
mod test_path {
    use super::*;
    use crate::Result;

    // Relative source, relative destination:
    // index.html
    // ./foo.html
    #[test]
    fn test_resolve_relative() -> Result<()> {
        let dummy = PathBuf::from("index.html");
        let abs_path = PathBuf::from("./foo.html");
        // Resolved next to the source, anchored at the current directory.
        assert_eq!(
            resolve(&dummy, &abs_path, &None)?,
            env::current_dir()?.join("foo.html")
        );
        Ok(())
    }

    // Explicitly relative source (leading `./`):
    // ./index.html
    // ./foo.html
    #[test]
    fn test_resolve_relative_index() -> Result<()> {
        let dummy = PathBuf::from("./index.html");
        let abs_path = PathBuf::from("./foo.html");
        assert_eq!(
            resolve(&dummy, &abs_path, &None)?,
            env::current_dir()?.join("foo.html")
        );
        Ok(())
    }

    // Absolute source, relative destination:
    // /path/to/index.html
    // ./foo.html
    #[test]
    fn test_resolve_from_absolute() -> Result<()> {
        let abs_index = PathBuf::from("/path/to/index.html");
        let abs_path = PathBuf::from("./foo.html");
        // Resolved inside the source's directory.
        assert_eq!(
            resolve(&abs_index, &abs_path, &None)?,
            PathBuf::from("/path/to/foo.html")
        );
        Ok(())
    }

    // Absolute destination requires a base directory:
    // dummy
    // foo.html
    // valid base dir
    #[test]
    fn test_resolve_absolute_from_base_dir() -> Result<()> {
        let dummy = PathBuf::new();
        let abs_path = PathBuf::from("/foo.html");
        let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
        assert_eq!(
            resolve(&dummy, &abs_path, &base)?,
            PathBuf::from("/some/absolute/base/dir/foo.html")
        );
        Ok(())
    }

    // Absolute destination is appended verbatim below the base dir:
    // /path/to/index.html
    // /other/path/to/foo.html
    #[test]
    fn test_resolve_absolute_from_absolute() -> Result<()> {
        let abs_index = PathBuf::from("/path/to/index.html");
        let abs_path = PathBuf::from("/other/path/to/foo.html");
        let base = Some(Base::Local(PathBuf::from("/some/absolute/base/dir")));
        assert_eq!(
            resolve(&abs_index, &abs_path, &base)?,
            PathBuf::from("/some/absolute/base/dir/other/path/to/foo.html")
        );
        Ok(())
    }
}
|
||||
93
lychee-lib/src/helpers/url.rs
Normal file
93
lychee-lib/src/helpers/url.rs
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
use linkify::LinkFinder;
|
||||
|
||||
/// Remove all GET parameters and the fragment from a URL.
/// The link is not a URL but a String as it may not have a base domain.
pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
    // Strip everything from the first `#` (fragment) …
    let without_fragment = url.split_once('#').map_or(url, |(before, _)| before);
    // … then everything from the first `?` (query parameters).
    without_fragment
        .split_once('?')
        .map_or(without_fragment, |(before, _)| before)
}
|
||||
|
||||
/// Determine if an element's attribute contains a link / URL.
pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
    // See a comprehensive list of attributes that might contain URLs/URIs
    // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
    matches!(attr_name, "href" | "src" | "srcset" | "cite")
        || (attr_name == "data" && elem_name == "object")
        || (attr_name == "onhashchange" && elem_name == "body")
}
|
||||
|
||||
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
//
// An in-page anchor link begins with `#`.
pub(crate) fn is_anchor(url: &str) -> bool {
    url.chars().next() == Some('#')
}
|
||||
|
||||
// Use `LinkFinder` to offload the raw link searching in plaintext
|
||||
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
|
||||
let finder = LinkFinder::new();
|
||||
finder.links(input).collect()
|
||||
}
|
||||
|
||||
// Tests for the URL helper functions in this module.
// Renamed from `test_fs_tree` — that name was a copy-paste leftover and
// unrelated to url.rs; the sibling module path.rs uses `test_path`.
#[cfg(test)]
mod test_url {
    use super::*;

    #[test]
    fn test_is_anchor() {
        assert!(is_anchor("#anchor"));
        assert!(!is_anchor("notan#anchor"));
    }

    #[test]
    fn test_remove_get_params_and_fragment() {
        assert_eq!(remove_get_params_and_fragment("/"), "/");
        assert_eq!(
            remove_get_params_and_fragment("index.html?foo=bar"),
            "index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("/index.html?foo=bar"),
            "/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"),
            "/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html?foo=bar"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar"),
            "test.png"
        );

        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html#anchor"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("https://example.org/index.html?foo=bar#anchor"),
            "https://example.org/index.html"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar#anchor"),
            "test.png"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png#anchor?anchor!?"),
            "test.png"
        );
        assert_eq!(
            remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"),
            "test.png"
        );
    }
}
|
||||
|
|
@ -41,18 +41,18 @@
|
|||
)]
|
||||
#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)]
|
||||
#![deny(missing_docs)]
|
||||
#![allow(clippy::module_name_repetitions)]
|
||||
|
||||
#[cfg(doctest)]
|
||||
doc_comment::doctest!("../../README.md");
|
||||
|
||||
mod client;
|
||||
mod client_pool;
|
||||
mod quirks;
|
||||
mod types;
|
||||
mod uri;
|
||||
|
||||
/// A pool of clients, to handle concurrent checks
|
||||
pub mod collector;
|
||||
mod helpers;
|
||||
mod quirks;
|
||||
mod types;
|
||||
|
||||
/// Functionality to extract URIs from inputs
|
||||
pub mod extract;
|
||||
|
|
@ -75,8 +75,7 @@ use ring as _; // required for apple silicon
|
|||
pub use crate::{
|
||||
client::{check, ClientBuilder},
|
||||
client_pool::ClientPool,
|
||||
collector::{Collector, Input},
|
||||
collector::Collector,
|
||||
filter::{Excludes, Filter, Includes},
|
||||
types::{ErrorKind, Request, Response, ResponseBody, Result, Status},
|
||||
uri::Uri,
|
||||
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
|
||||
};
|
||||
|
|
|
|||
83
lychee-lib/src/types/base.rs
Normal file
83
lychee-lib/src/types/base.rs
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{convert::TryFrom, path::PathBuf};
|
||||
|
||||
use crate::ErrorKind;
|
||||
|
||||
/// When encountering links without a full domain in a document,
/// the base determines where this resource can be found.
/// Both local and remote targets are supported.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)]
#[allow(variant_size_differences)]
pub enum Base {
    /// Local file path pointing to root directory
    Local(PathBuf),
    /// Remote URL pointing to a website homepage
    Remote(Url),
}
|
||||
|
||||
impl Base {
|
||||
/// Join link with base url
|
||||
#[must_use]
|
||||
pub fn join(&self, link: &str) -> Option<Url> {
|
||||
match self {
|
||||
Self::Remote(url) => url.join(link).ok(),
|
||||
Self::Local(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the directory if the base is local
|
||||
#[must_use]
|
||||
pub fn dir(&self) -> Option<PathBuf> {
|
||||
match self {
|
||||
Self::Remote(_) => None,
|
||||
Self::Local(d) => Some(d.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&str> for Base {
|
||||
type Error = ErrorKind;
|
||||
|
||||
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
||||
if let Ok(url) = Url::parse(value) {
|
||||
if url.cannot_be_a_base() {
|
||||
return Err(ErrorKind::InvalidBase(
|
||||
value.to_string(),
|
||||
"The given URL cannot be a base".to_string(),
|
||||
));
|
||||
}
|
||||
return Ok(Self::Remote(url));
|
||||
}
|
||||
Ok(Self::Local(PathBuf::from(value)))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test_base {
    use crate::Result;

    use super::*;

    // A parseable absolute URL becomes a `Base::Remote`.
    #[test]
    fn test_valid_remote() -> Result<()> {
        let base = Base::try_from("https://endler.dev")?;
        assert_eq!(
            base,
            Base::Remote(Url::parse("https://endler.dev").unwrap())
        );
        Ok(())
    }

    // `data:` URLs parse but cannot serve as a base, so conversion fails.
    #[test]
    fn test_invalid_url() {
        assert!(Base::try_from("data:text/plain,Hello?World#").is_err());
    }

    // A path that is not a URL is accepted as a local base directory.
    #[test]
    fn test_valid_local() -> Result<()> {
        let dir = tempfile::tempdir()?;
        Base::try_from(dir.as_ref().to_str().unwrap())?;
        Ok(())
    }
}
|
||||
|
|
@ -10,21 +10,32 @@ use crate::Uri;
|
|||
#[derive(Debug)]
|
||||
#[non_exhaustive]
|
||||
pub enum ErrorKind {
|
||||
// TODO: maybe need to be splitted; currently first slot is Some only for reading files
|
||||
// TODO: maybe needs to be split; currently first element is `Some` only for
|
||||
// reading files
|
||||
/// Any form of I/O error occurred while reading from a given path.
|
||||
IoError(Option<PathBuf>, std::io::Error),
|
||||
/// Errors which can occur when attempting to interpret a sequence of u8 as a string
|
||||
Utf8Error(std::str::Utf8Error),
|
||||
/// Network error when trying to connect to an endpoint via reqwest.
|
||||
ReqwestError(reqwest::Error),
|
||||
/// Network error when trying to connect to an endpoint via hubcaps.
|
||||
HubcapsError(hubcaps::Error),
|
||||
/// The given string can not be parsed into a valid URL or e-mail address
|
||||
/// The given string can not be parsed into a valid URL, e-mail address, or file path
|
||||
UrlParseError(String, (url::ParseError, Option<fast_chemail::ParseError>)),
|
||||
/// The given URI cannot be converted to a file path
|
||||
InvalidFilePath(Uri),
|
||||
/// The given path cannot be converted to a URI
|
||||
InvalidUrl(PathBuf),
|
||||
/// The given mail address is unreachable
|
||||
UnreachableEmailAddress(Uri),
|
||||
/// The given header could not be parsed.
|
||||
/// A possible error when converting a `HeaderValue` from a string or byte
|
||||
/// slice.
|
||||
InvalidHeader(InvalidHeaderValue),
|
||||
/// The given string can not be parsed into a valid base URL or base directory
|
||||
InvalidBase(String, String),
|
||||
/// Cannot find local file
|
||||
FileNotFound(PathBuf),
|
||||
/// The given UNIX glob pattern is invalid
|
||||
InvalidGlobPattern(glob::PatternError),
|
||||
/// The Github API could not be called because of a missing Github token.
|
||||
|
|
@ -63,8 +74,14 @@ impl Hash for ErrorKind {
|
|||
Self::IoError(p, e) => (p, e.kind()).hash(state),
|
||||
Self::ReqwestError(e) => e.to_string().hash(state),
|
||||
Self::HubcapsError(e) => e.to_string().hash(state),
|
||||
Self::FileNotFound(e) => e.to_string_lossy().hash(state),
|
||||
Self::UrlParseError(s, e) => (s, e.type_id()).hash(state),
|
||||
Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => u.hash(state),
|
||||
Self::InvalidUrl(p) => p.hash(state),
|
||||
Self::Utf8Error(e) => e.to_string().hash(state),
|
||||
Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => {
|
||||
u.hash(state);
|
||||
}
|
||||
Self::InvalidBase(base, e) => (base, e).hash(state),
|
||||
Self::InvalidHeader(e) => e.to_string().hash(state),
|
||||
Self::InvalidGlobPattern(e) => e.to_string().hash(state),
|
||||
Self::MissingGitHubToken => std::mem::discriminant(self).hash(state),
|
||||
|
|
@ -84,6 +101,7 @@ impl Display for ErrorKind {
|
|||
Self::IoError(None, e) => e.fmt(f),
|
||||
Self::ReqwestError(e) => e.fmt(f),
|
||||
Self::HubcapsError(e) => e.fmt(f),
|
||||
Self::FileNotFound(e) => write!(f, "{}", e.to_string_lossy()),
|
||||
Self::UrlParseError(s, (url_err, Some(mail_err))) => {
|
||||
write!(
|
||||
f,
|
||||
|
|
@ -94,6 +112,8 @@ impl Display for ErrorKind {
|
|||
Self::UrlParseError(s, (url_err, None)) => {
|
||||
write!(f, "Cannot parse {} as website url ({})", s, url_err)
|
||||
}
|
||||
Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u),
|
||||
Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()),
|
||||
Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri),
|
||||
Self::InvalidHeader(e) => e.fmt(f),
|
||||
Self::InvalidGlobPattern(e) => e.fmt(f),
|
||||
|
|
@ -106,6 +126,8 @@ impl Display for ErrorKind {
|
|||
"This URL is available in HTTPS protocol, but HTTP is provided, use '{}' instead",
|
||||
uri
|
||||
),
|
||||
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
|
||||
Self::Utf8Error(e) => e.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -125,6 +147,12 @@ impl From<(PathBuf, std::io::Error)> for ErrorKind {
|
|||
}
|
||||
}
|
||||
|
||||
impl From<std::str::Utf8Error> for ErrorKind {
|
||||
fn from(e: std::str::Utf8Error) -> Self {
|
||||
Self::Utf8Error(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for ErrorKind {
|
||||
fn from(e: std::io::Error) -> Self {
|
||||
Self::IoError(None, e)
|
||||
|
|
@ -149,6 +177,12 @@ impl From<hubcaps::errors::Error> for ErrorKind {
|
|||
}
|
||||
}
|
||||
|
||||
impl From<url::ParseError> for ErrorKind {
|
||||
fn from(e: url::ParseError) -> Self {
|
||||
Self::UrlParseError("Cannot parse URL".to_string(), (e, None))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<(String, url::ParseError)> for ErrorKind {
|
||||
fn from(value: (String, url::ParseError)) -> Self {
|
||||
Self::UrlParseError(value.0, (value.1, None))
|
||||
|
|
|
|||
37
lychee-lib/src/types/file.rs
Normal file
37
lychee-lib/src/types/file.rs
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
use std::path::Path;
|
||||
|
||||
/// `FileType` defines which file types lychee can handle
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum FileType {
    /// File in HTML format
    Html,
    /// File in Markdown format
    Markdown,
    /// Generic text file without syntax-specific parsing
    Plaintext,
}

impl Default for FileType {
    // Without any other information, fall back to plaintext parsing.
    fn default() -> Self {
        FileType::Plaintext
    }
}

impl<P: AsRef<Path>> From<P> for FileType {
    /// Detect if the given path points to a Markdown, HTML, or plaintext file.
    ///
    /// Assume HTML in case of no extension.
    /// Note: this is only reasonable for URLs; not paths on disk.
    /// For example, `README` without an extension is more likely to be a
    /// plaintext file.
    /// A better solution would be to also implement `From<Url> for FileType`.
    /// Unfortunately that's not possible without refactoring, as
    /// `AsRef<Path>` could be implemented for `Url` in the future, which is why
    /// `From<Url> for FileType` is not allowed.
    fn from(p: P) -> FileType {
        let extension = p.as_ref().extension().and_then(std::ffi::OsStr::to_str);
        match extension {
            Some("markdown" | "md") => FileType::Markdown,
            None | Some("html" | "htm") => FileType::Html,
            Some(_) => FileType::Plaintext,
        }
    }
}
|
||||
214
lychee-lib/src/types/input.rs
Normal file
214
lychee-lib/src/types/input.rs
Normal file
|
|
@ -0,0 +1,214 @@
|
|||
use crate::types::FileType;
|
||||
use crate::Result;
|
||||
use glob::glob_with;
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use shellexpand::tilde;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{fmt::Display, fs::read_to_string};
|
||||
use tokio::io::{stdin, AsyncReadExt};
|
||||
|
||||
// Sentinel value on the command line meaning "read from standard input".
const STDIN: &str = "-";

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
/// An exhaustive list of input sources, which lychee accepts
pub enum Input {
    /// URL (of HTTP/HTTPS scheme).
    RemoteUrl(Box<Url>),
    /// Unix shell-style glob pattern.
    FsGlob {
        /// The glob pattern matching all input files
        pattern: String,
        /// Don't be case sensitive when matching files against a glob
        ignore_case: bool,
    },
    /// File path.
    FsPath(PathBuf),
    /// Standard Input.
    Stdin,
    /// Raw string input.
    String(String),
}
|
||||
|
||||
impl Serialize for Input {
    // Serialize using the human-readable `Display` form, so inputs appear
    // as plain strings (not enum structures) in serialized output.
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.collect_str(self)
    }
}
||||
|
||||
impl Display for Input {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(match self {
|
||||
Input::RemoteUrl(url) => url.as_str(),
|
||||
Input::FsGlob { pattern, .. } => pattern,
|
||||
Input::FsPath(path) => path.to_str().unwrap_or_default(),
|
||||
Input::Stdin => "stdin",
|
||||
Input::String(_) => "raw input string",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
/// Encapsulates the content for a given input
pub struct InputContent {
    /// Input source
    pub input: Input,
    /// File type of given input
    pub file_type: FileType,
    /// Raw UTF-8 string content
    pub content: String,
}
|
||||
|
||||
impl InputContent {
|
||||
#[must_use]
|
||||
/// Create an instance of `InputContent` from an input string
|
||||
pub fn from_string(s: &str, file_type: FileType) -> Self {
|
||||
// TODO: consider using Cow (to avoid one .clone() for String types)
|
||||
Self {
|
||||
input: Input::String(s.to_owned()),
|
||||
file_type,
|
||||
content: s.to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Input {
    #[must_use]
    /// Construct a new `Input` source. In case the input is a `glob` pattern,
    /// `glob_ignore_case` decides whether matching files against the `glob` is
    /// case-insensitive or not
    pub fn new(value: &str, glob_ignore_case: bool) -> Self {
        // Classification order: stdin sentinel, then URL, then glob, then
        // plain file path. Note that `Url::parse` succeeding decides "remote"
        // before any filesystem check happens.
        if value == STDIN {
            Self::Stdin
        } else if let Ok(url) = Url::parse(value) {
            Self::RemoteUrl(Box::new(url))
        } else {
            // this seems to be the only way to determine if this is a glob pattern
            let is_glob = glob::Pattern::escape(value) != value;

            if is_glob {
                Self::FsGlob {
                    pattern: value.to_owned(),
                    ignore_case: glob_ignore_case,
                }
            } else {
                Self::FsPath(value.into())
            }
        }
    }

    #[allow(clippy::missing_panics_doc)]
    /// Retrieve the contents from the input
    ///
    /// `file_type_hint` overrides file-type detection for stdin and raw
    /// string inputs; `skip_missing` turns an unreadable file path into an
    /// empty result instead of an error.
    ///
    /// # Errors
    ///
    /// Returns an error if the contents can not be retrieved
    /// because of an underlying I/O error (e.g. an error while making a
    /// network request or retrieving the contents from the file system)
    pub async fn get_contents(
        &self,
        file_type_hint: Option<FileType>,
        skip_missing: bool,
    ) -> Result<Vec<InputContent>> {
        match *self {
            // TODO: should skip_missing also affect URLs?
            Input::RemoteUrl(ref url) => Ok(vec![Self::url_contents(url).await?]),
            Input::FsGlob {
                ref pattern,
                ignore_case,
            } => Ok(Self::glob_contents(pattern, ignore_case).await?),
            Input::FsPath(ref path) => {
                let content = Self::path_content(path);
                match content {
                    Ok(input_content) => Ok(vec![input_content]),
                    // `skip_missing` swallows any read error, not only
                    // "file not found".
                    Err(_) if skip_missing => Ok(vec![]),
                    Err(e) => Err(e),
                }
            }
            Input::Stdin => Ok(vec![Self::stdin_content(file_type_hint).await?]),
            Input::String(ref s) => Ok(vec![Self::string_content(s, file_type_hint)]),
        }
    }

    // Fetch a remote URL and wrap the response body as `InputContent`.
    async fn url_contents(url: &Url) -> Result<InputContent> {
        // Assume HTML for default paths
        let file_type = if url.path().is_empty() || url.path() == "/" {
            FileType::Html
        } else {
            FileType::from(url.as_str())
        };

        let res = reqwest::get(url.clone()).await?;
        let input_content = InputContent {
            input: Input::RemoteUrl(Box::new(url.clone())),
            file_type,
            content: res.text().await?,
        };

        Ok(input_content)
    }

    // Expand `~`, evaluate the glob pattern and read every matching file.
    async fn glob_contents(path_glob: &str, ignore_case: bool) -> Result<Vec<InputContent>> {
        let mut contents = vec![];
        let glob_expanded = tilde(&path_glob);
        let mut match_opts = glob::MatchOptions::new();

        match_opts.case_sensitive = !ignore_case;

        for entry in glob_with(&glob_expanded, match_opts)? {
            match entry {
                Ok(path) => {
                    if path.is_dir() {
                        // Directories can still have a suffix which looks like
                        // a file extension like `foo.html`. This can lead to
                        // unexpected behavior with glob patterns like
                        // `**/*.html`. Therefore filter these out.
                        // https://github.com/lycheeverse/lychee/pull/262#issuecomment-913226819
                        continue;
                    }
                    let content = Self::path_content(&path)?;
                    contents.push(content);
                }
                // NOTE(review): unreadable glob entries are only printed to
                // stdout, not reported to the caller — presumably a
                // deliberate best-effort choice; confirm before changing.
                Err(e) => println!("{:?}", e),
            }
        }

        Ok(contents)
    }

    /// Get the input content of a given path
    /// # Errors
    ///
    /// Will return `Err` if file contents can't be read
    pub fn path_content<P: Into<PathBuf> + AsRef<Path> + Clone>(path: P) -> Result<InputContent> {
        // Attach the path to any I/O error via the `(PathBuf, io::Error)`
        // conversion.
        let content = read_to_string(&path).map_err(|e| (path.clone().into(), e))?;
        let input_content = InputContent {
            file_type: FileType::from(path.as_ref()),
            content,
            input: Input::FsPath(path.into()),
        };

        Ok(input_content)
    }

    // Read all of stdin; the file type comes from the hint (default:
    // plaintext), since there is no path to detect it from.
    async fn stdin_content(file_type_hint: Option<FileType>) -> Result<InputContent> {
        let mut content = String::new();
        let mut stdin = stdin();
        stdin.read_to_string(&mut content).await?;

        let input_content = InputContent {
            input: Input::Stdin,
            file_type: file_type_hint.unwrap_or_default(),
            content,
        };

        Ok(input_content)
    }

    // Wrap a raw string; file type comes from the hint (default: plaintext).
    fn string_content(s: &str, file_type_hint: Option<FileType>) -> InputContent {
        InputContent::from_string(s, file_type_hint.unwrap_or_default())
    }
}
|
||||
|
|
@ -1,14 +1,22 @@
|
|||
// Core data types of the crate, re-exported flat at this level.
#![allow(unreachable_pub)]

mod base;
mod error;
mod file;
mod input;
mod request;
mod response;
mod status;
mod uri;

pub use base::Base;
pub use error::ErrorKind;
pub use file::FileType;
pub use input::{Input, InputContent};
pub use request::Request;
pub use response::{Response, ResponseBody};
pub use status::Status;
pub use uri::Uri;

/// The lychee `Result` type
pub type Result<T> = std::result::Result<T, crate::ErrorKind>;
|
||||
|
|
|
|||
|
|
@ -82,9 +82,16 @@ impl Uri {
|
|||
}
|
||||
|
||||
    #[inline]
    /// Check if the URI is a valid mail address
    /// (i.e. uses the `mailto` scheme).
    pub(crate) fn is_mail(&self) -> bool {
        self.scheme() == "mailto"
    }
||||
|
||||
    #[inline]
    /// Check if the URI is a file
    /// (i.e. uses the `file` scheme).
    pub(crate) fn is_file(&self) -> bool {
        self.scheme() == "file"
    }
|
||||
}
|
||||
|
||||
impl AsRef<str> for Uri {
|
||||
Loading…
Reference in a new issue