mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-18 04:10:57 +00:00
wip
This commit is contained in:
parent
5a2e10799f
commit
dd3205a87c
8 changed files with 154 additions and 94 deletions
|
|
@ -175,6 +175,13 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
|
|||
let include = RegexSet::new(&cfg.include)?;
|
||||
let exclude = RegexSet::new(&cfg.exclude)?;
|
||||
|
||||
// Offline mode overrides the scheme
|
||||
let schemes = if cfg.offline {
|
||||
vec!["file".to_string()]
|
||||
} else {
|
||||
cfg.scheme.clone()
|
||||
};
|
||||
|
||||
let client = ClientBuilder::builder()
|
||||
.includes(include)
|
||||
.excludes(exclude)
|
||||
|
|
@ -190,7 +197,7 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
|
|||
.method(method)
|
||||
.timeout(timeout)
|
||||
.github_token(cfg.github_token.clone())
|
||||
.schemes(HashSet::from_iter(cfg.scheme.clone()))
|
||||
.schemes(HashSet::from_iter(schemes))
|
||||
.accepted(accepted)
|
||||
.require_https(cfg.require_https)
|
||||
.build()
|
||||
|
|
|
|||
|
|
@ -158,6 +158,11 @@ pub(crate) struct Config {
|
|||
#[serde(default)]
|
||||
pub(crate) scheme: Vec<String>,
|
||||
|
||||
/// Only check local files and block network requests.
|
||||
#[structopt(long)]
|
||||
#[serde(default)]
|
||||
pub(crate) offline: bool,
|
||||
|
||||
/// URLs to check (supports regex). Has preference over all excludes.
|
||||
#[structopt(long)]
|
||||
#[serde(default)]
|
||||
|
|
|
|||
|
|
@ -4,25 +4,53 @@ use html5ever::{
|
|||
parse_document,
|
||||
tendril::{StrTendril, TendrilSink},
|
||||
};
|
||||
use linkify::LinkFinder;
|
||||
use log::info;
|
||||
use markup5ever_rcdom::{Handle, NodeData, RcDom};
|
||||
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::{
|
||||
fs,
|
||||
helpers::{path, url},
|
||||
types::{FileType, InputContent},
|
||||
Base, ErrorKind, Input, Request, Result, Uri,
|
||||
};
|
||||
|
||||
// Use LinkFinder here to offload the actual link searching in plaintext.
|
||||
fn find_links(input: &str) -> Vec<linkify::Link> {
|
||||
let finder = LinkFinder::new();
|
||||
finder.links(input).collect()
|
||||
/// Main entrypoint for extracting links from various sources
|
||||
/// (Markdown, HTML, and plaintext)
|
||||
pub(crate) fn extract_links(
|
||||
input_content: &InputContent,
|
||||
base: &Option<Base>,
|
||||
) -> Result<HashSet<Request>> {
|
||||
let links = match input_content.file_type {
|
||||
FileType::Markdown => extract_links_from_markdown(&input_content.content),
|
||||
FileType::Html => extract_links_from_html(&input_content.content),
|
||||
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
|
||||
};
|
||||
|
||||
// Only keep legit URLs. For example this filters out anchors.
|
||||
let mut requests: HashSet<Request> = HashSet::new();
|
||||
for link in links {
|
||||
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
|
||||
Request::new(uri, input_content.input.clone())
|
||||
} else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
|
||||
Request::new(Uri { inner: new_url }, input_content.input.clone())
|
||||
} else if let Input::FsPath(root) = &input_content.input {
|
||||
if url::is_anchor(&link) {
|
||||
// Silently ignore anchor links for now
|
||||
continue;
|
||||
}
|
||||
let uri = create_uri(root, base, &link)?;
|
||||
Request::new(Uri { inner: uri }, input_content.input.clone())
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &link);
|
||||
continue;
|
||||
};
|
||||
requests.insert(req);
|
||||
}
|
||||
Ok(requests)
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a markdown string.
|
||||
/// Extract unparsed URL strings from a Markdown string.
|
||||
fn extract_links_from_markdown(input: &str) -> Vec<String> {
|
||||
let parser = Parser::new(input);
|
||||
parser
|
||||
|
|
@ -35,15 +63,15 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
|
|||
.collect()
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a HTML string.
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
fn extract_links_from_html(input: &str) -> Vec<String> {
|
||||
let tendril = StrTendril::from(input);
|
||||
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
|
||||
|
||||
let mut urls = Vec::new();
|
||||
|
||||
// we pass mutable urls reference to avoid extra allocations in each
|
||||
// recursive descent
|
||||
// We pass mutable URL references here to avoid
|
||||
// extra allocations in each recursive descent
|
||||
walk_html_links(&mut urls, &rc_dom.document);
|
||||
|
||||
urls
|
||||
|
|
@ -68,7 +96,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
|
|||
for attr in attrs.borrow().iter() {
|
||||
let attr_value = attr.value.to_string();
|
||||
|
||||
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
|
||||
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
|
||||
urls.push(attr_value);
|
||||
} else {
|
||||
urls.append(&mut extract_links_from_plaintext(&attr_value));
|
||||
|
|
@ -80,63 +108,24 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
|
|||
}
|
||||
|
||||
// recursively traverse the document's nodes -- this doesn't need any extra
|
||||
// exit conditions because the document is a tree
|
||||
// exit conditions, because the document is a tree
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_html_links(&mut urls, child);
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine if element's attribute contains a link / URL.
|
||||
fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
|
||||
// See a comprehensive list of attributes that might contain URLs/URIs
|
||||
// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
|
||||
matches!(
|
||||
(attr_name, elem_name),
|
||||
("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
|
||||
)
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from a plaintext.
|
||||
/// Extract unparsed URL strings from plaintext
|
||||
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
|
||||
find_links(input)
|
||||
url::find_links(input)
|
||||
.iter()
|
||||
.map(|l| String::from(l.as_str()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub(crate) fn extract_links(
|
||||
input_content: &InputContent,
|
||||
base: &Option<Base>,
|
||||
) -> Result<HashSet<Request>> {
|
||||
let links = match input_content.file_type {
|
||||
FileType::Markdown => extract_links_from_markdown(&input_content.content),
|
||||
FileType::Html => extract_links_from_html(&input_content.content),
|
||||
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
|
||||
};
|
||||
|
||||
// Only keep legit URLs. For example this filters out anchors.
|
||||
let mut requests: HashSet<Request> = HashSet::new();
|
||||
for link in links {
|
||||
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
|
||||
Request::new(uri, input_content.input.clone())
|
||||
} else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
|
||||
Request::new(Uri { inner: new_url }, input_content.input.clone())
|
||||
} else if let Input::FsPath(root) = &input_content.input {
|
||||
let link = fs::sanitize(&link);
|
||||
if link.starts_with('#') {
|
||||
// Silently ignore anchors for now.
|
||||
continue;
|
||||
}
|
||||
let path = fs::resolve(&root, &PathBuf::from(&link), base)?;
|
||||
let uri = Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?;
|
||||
Request::new(Uri { inner: uri }, input_content.input.clone())
|
||||
} else {
|
||||
info!("Handling of {} not implemented yet", &link);
|
||||
continue;
|
||||
};
|
||||
requests.insert(req);
|
||||
}
|
||||
Ok(requests)
|
||||
fn create_uri(root: &PathBuf, base: &Option<Base>, link: &str) -> Result<Url> {
|
||||
let link = url::remove_get_params(&link);
|
||||
let path = path::resolve(root, &PathBuf::from(&link), base)?;
|
||||
Ok(Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
@ -150,10 +139,10 @@ mod test {
|
|||
};
|
||||
|
||||
use pretty_assertions::assert_eq;
|
||||
use url::Url;
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
helpers::url::find_links,
|
||||
test_utils::{mail, website},
|
||||
Uri,
|
||||
};
|
||||
|
|
|
|||
2
lychee-lib/src/helpers/mod.rs
Normal file
2
lychee-lib/src/helpers/mod.rs
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
pub(crate) mod path;
|
||||
pub(crate) mod url;
|
||||
|
|
@ -42,6 +42,18 @@ pub(crate) fn normalize(path: &Path) -> PathBuf {
|
|||
ret
|
||||
}
|
||||
|
||||
// Get the parent directory of a given `Path`.
|
||||
fn dirname(src: &Path) -> PathBuf {
|
||||
if src.is_file() {
|
||||
src.to_path_buf()
|
||||
.parent()
|
||||
.map_or(PathBuf::new(), Path::to_path_buf)
|
||||
} else {
|
||||
src.to_path_buf()
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve `dst` that was linked to from within `src`
|
||||
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
|
||||
if dst.is_relative() {
|
||||
// Find `dst` in the parent directory of `src`
|
||||
|
|
@ -51,14 +63,16 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Pat
|
|||
}
|
||||
}
|
||||
if dst.is_absolute() {
|
||||
// Absolute local links (leading slash) require the base_url to
|
||||
// Absolute local links (leading slash) require the `base_url` to
|
||||
// define the document root.
|
||||
let base_dir = get_base_dir(base).unwrap_or_else(|| {
|
||||
src.to_path_buf()
|
||||
.parent()
|
||||
.map_or(PathBuf::new(), Path::to_path_buf)
|
||||
});
|
||||
let abs_path = join(base_dir, dst);
|
||||
let base = get_base_dir(base).ok_or_else(|| {
|
||||
ErrorKind::InvalidBase(
|
||||
"<empty>".to_string(),
|
||||
format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
|
||||
.to_string(),
|
||||
)
|
||||
})?;
|
||||
let abs_path = join(dirname(&base), dst);
|
||||
return Ok(normalize(&abs_path));
|
||||
}
|
||||
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
|
||||
|
|
@ -73,37 +87,11 @@ fn join(base: PathBuf, dst: &Path) -> PathBuf {
|
|||
PathBuf::from(abs)
|
||||
}
|
||||
|
||||
/// A little helper function to remove the get parameters from a URL link.
|
||||
/// The link is not a URL but a String as that link may not have a base domain.
|
||||
pub(crate) fn sanitize(link: &str) -> String {
|
||||
let path = match link.split_once('?') {
|
||||
Some((path, _params)) => path,
|
||||
None => link,
|
||||
};
|
||||
path.to_string()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test_fs_tree {
|
||||
mod test_path {
|
||||
use super::*;
|
||||
use crate::Result;
|
||||
|
||||
#[test]
|
||||
fn test_sanitize() {
|
||||
assert_eq!(sanitize("/"), "/".to_string());
|
||||
assert_eq!(sanitize("index.html?foo=bar"), "index.html".to_string());
|
||||
assert_eq!(sanitize("/index.html?foo=bar"), "/index.html".to_string());
|
||||
assert_eq!(
|
||||
sanitize("/index.html?foo=bar&baz=zorx?bla=blub"),
|
||||
"/index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
sanitize("https://example.org/index.html?foo=bar"),
|
||||
"https://example.org/index.html".to_string()
|
||||
);
|
||||
assert_eq!(sanitize("test.png?foo=bar"), "test.png".to_string());
|
||||
}
|
||||
|
||||
// dummy root
|
||||
// /path/to/foo.html
|
||||
#[test]
|
||||
68
lychee-lib/src/helpers/url.rs
Normal file
68
lychee-lib/src/helpers/url.rs
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
use linkify::LinkFinder;
|
||||
|
||||
/// Remove all GET parameters from a URL.
|
||||
/// The link is not a URL but a String as it may not have a base domain.
|
||||
pub(crate) fn remove_get_params(url: &str) -> String {
|
||||
let path = match url.split_once('?') {
|
||||
Some((path, _params)) => path,
|
||||
None => url,
|
||||
};
|
||||
path.to_string()
|
||||
}
|
||||
|
||||
/// Determine if an element's attribute contains a link / URL.
|
||||
pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
|
||||
// See a comprehensive list of attributes that might contain URLs/URIs
|
||||
// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
|
||||
matches!(
|
||||
(attr_name, elem_name),
|
||||
("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
|
||||
)
|
||||
}
|
||||
|
||||
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
|
||||
pub(crate) fn is_anchor(url: &str) -> bool {
|
||||
url.starts_with('#')
|
||||
}
|
||||
|
||||
// Use `LinkFinder` to offload the raw link searching in plaintext
|
||||
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
|
||||
let finder = LinkFinder::new();
|
||||
finder.links(input).collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test_fs_tree {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_is_anchor() {
|
||||
assert!(is_anchor("#anchor"));
|
||||
assert!(!is_anchor("notan#anchor"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_get_params() {
|
||||
assert_eq!(remove_get_params("/"), "/".to_string());
|
||||
assert_eq!(
|
||||
remove_get_params("index.html?foo=bar"),
|
||||
"index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params("/index.html?foo=bar"),
|
||||
"/index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params("/index.html?foo=bar&baz=zorx?bla=blub"),
|
||||
"/index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params("https://example.org/index.html?foo=bar"),
|
||||
"https://example.org/index.html".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
remove_get_params("test.png?foo=bar"),
|
||||
"test.png".to_string()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -50,7 +50,7 @@ mod client;
|
|||
mod client_pool;
|
||||
/// A pool of clients, to handle concurrent checks
|
||||
pub mod collector;
|
||||
mod fs;
|
||||
mod helpers;
|
||||
mod quirks;
|
||||
mod types;
|
||||
|
||||
|
|
|
|||
|
|
@ -124,6 +124,7 @@ impl Display for ErrorKind {
|
|||
uri
|
||||
),
|
||||
Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e),
|
||||
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue