This commit is contained in:
Matthias 2021-09-02 23:10:46 +02:00
parent 5a2e10799f
commit dd3205a87c
8 changed files with 154 additions and 94 deletions

View file

@ -175,6 +175,13 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
let include = RegexSet::new(&cfg.include)?;
let exclude = RegexSet::new(&cfg.exclude)?;
// Offline mode overrides the scheme
let schemes = if cfg.offline {
vec!["file".to_string()]
} else {
cfg.scheme.clone()
};
let client = ClientBuilder::builder()
.includes(include)
.excludes(exclude)
@ -190,7 +197,7 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
.method(method)
.timeout(timeout)
.github_token(cfg.github_token.clone())
.schemes(HashSet::from_iter(cfg.scheme.clone()))
.schemes(HashSet::from_iter(schemes))
.accepted(accepted)
.require_https(cfg.require_https)
.build()

View file

@ -158,6 +158,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) scheme: Vec<String>,
/// Only check local files and block network requests.
#[structopt(long)]
#[serde(default)]
pub(crate) offline: bool,
/// URLs to check (supports regex). Has preference over all excludes.
#[structopt(long)]
#[serde(default)]

View file

@ -4,25 +4,53 @@ use html5ever::{
parse_document,
tendril::{StrTendril, TendrilSink},
};
use linkify::LinkFinder;
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use reqwest::Url;
use crate::{
fs,
helpers::{path, url},
types::{FileType, InputContent},
Base, ErrorKind, Input, Request, Result, Uri,
};
// Use LinkFinder here to offload the actual link searching in plaintext.
fn find_links(input: &str) -> Vec<linkify::Link> {
let finder = LinkFinder::new();
finder.links(input).collect()
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract_links(
    input_content: &InputContent,
    base: &Option<Base>,
) -> Result<HashSet<Request>> {
    // Dispatch on the detected file type to collect raw, unparsed link
    // strings from the document body.
    let links = match input_content.file_type {
        FileType::Markdown => extract_links_from_markdown(&input_content.content),
        FileType::Html => extract_links_from_html(&input_content.content),
        FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
    };

    // Only keep legit URLs. For example this filters out anchors.
    // Inserting into a `HashSet` also deduplicates identical links.
    let mut requests: HashSet<Request> = HashSet::new();
    for link in links {
        // Resolution order matters: (1) the link is already a valid
        // absolute URI, (2) it can be joined onto the provided base URL,
        // (3) it is a path relative to a local filesystem input.
        let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
            Request::new(uri, input_content.input.clone())
        } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
            Request::new(Uri { inner: new_url }, input_content.input.clone())
        } else if let Input::FsPath(root) = &input_content.input {
            if url::is_anchor(&link) {
                // Silently ignore anchor links for now
                continue;
            }
            let uri = create_uri(root, base, &link)?;
            Request::new(Uri { inner: uri }, input_content.input.clone())
        } else {
            // Relative link from a non-filesystem input that has no base
            // URL set — presumably stdin or raw-string inputs; confirm
            // against the `Input` enum variants.
            info!("Handling of {} not implemented yet", &link);
            continue;
        };
        requests.insert(req);
    }

    Ok(requests)
}
/// Extract unparsed URL strings from a markdown string.
/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec<String> {
let parser = Parser::new(input);
parser
@ -35,15 +63,15 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
.collect()
}
/// Extract unparsed URL strings from a HTML string.
/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec<String> {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
// we pass mutable urls reference to avoid extra allocations in each
// recursive descent
// We pass mutable URL references here to avoid
// extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
@ -68,7 +96,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
for attr in attrs.borrow().iter() {
let attr_value = attr.value.to_string();
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr_value);
} else {
urls.append(&mut extract_links_from_plaintext(&attr_value));
@ -80,63 +108,24 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions because the document is a tree
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
}
}
/// Determine if element's attribute contains a link / URL.
///
/// See a comprehensive list of attributes that might contain URLs/URIs
/// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
    match attr_name {
        // These attributes hold URLs regardless of the element they're on.
        "href" | "src" | "srcset" | "cite" => true,
        // These only hold URLs on one specific element.
        "data" => elem_name == "object",
        "onhashchange" => elem_name == "body",
        _ => false,
    }
}
/// Extract unparsed URL strings from a plaintext.
/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
find_links(input)
url::find_links(input)
.iter()
.map(|l| String::from(l.as_str()))
.collect()
}
pub(crate) fn extract_links(
input_content: &InputContent,
base: &Option<Base>,
) -> Result<HashSet<Request>> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
// Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
Request::new(uri, input_content.input.clone())
} else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
Request::new(Uri { inner: new_url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
let link = fs::sanitize(&link);
if link.starts_with('#') {
// Silently ignore anchors for now.
continue;
}
let path = fs::resolve(&root, &PathBuf::from(&link), base)?;
let uri = Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?;
Request::new(Uri { inner: uri }, input_content.input.clone())
} else {
info!("Handling of {} not implemented yet", &link);
continue;
};
requests.insert(req);
}
Ok(requests)
/// Create a `file://` URL for a local `link`, resolved against the
/// document `root` (and the optional `base` directory).
///
/// GET parameters are stripped first, since they carry no meaning for
/// files on disk.
///
/// # Errors
///
/// Returns an error if the path cannot be resolved, or `InvalidPath`
/// if the resolved path cannot be turned into a URL.
fn create_uri(root: &PathBuf, base: &Option<Base>, link: &str) -> Result<Url> {
    // `link` is already a `&str`; no extra borrow needed.
    let link = url::remove_get_params(link);
    let path = path::resolve(root, &PathBuf::from(&link), base)?;
    // `map_err` + `into` instead of the redundant `Ok(expr?)` pattern
    // (clippy: needless_question_mark).
    Url::from_file_path(&path).map_err(|_| ErrorKind::InvalidPath(path).into())
}
#[cfg(test)]
@ -150,10 +139,10 @@ mod test {
};
use pretty_assertions::assert_eq;
use url::Url;
use super::*;
use crate::{
helpers::url::find_links,
test_utils::{mail, website},
Uri,
};

View file

@ -0,0 +1,2 @@
/// Filesystem path helpers (normalization, resolution, dirname).
pub(crate) mod path;
/// URL/link helpers (anchors, GET-parameter stripping, link finding,
/// link-bearing HTML attributes).
pub(crate) mod url;

View file

@ -42,6 +42,18 @@ pub(crate) fn normalize(path: &Path) -> PathBuf {
ret
}
/// Get the parent directory of a given `Path`.
///
/// If `src` is an existing file, its containing directory is returned
/// (or an empty path when it has no parent). Otherwise `src` itself is
/// returned unchanged.
///
/// NOTE(review): a path that does not exist on disk is not `is_file()`,
/// so it is returned as-is — confirm this is intended for dangling links.
fn dirname(src: &Path) -> PathBuf {
    if src.is_file() {
        // No need for the intermediate `to_path_buf()` allocation the
        // previous version performed before calling `parent()`.
        src.parent().map_or_else(PathBuf::new, Path::to_path_buf)
    } else {
        src.to_path_buf()
    }
}
// Resolve `dst` that was linked to from within `src`
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
if dst.is_relative() {
// Find `dst` in the parent directory of `src`
@ -51,14 +63,16 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Pat
}
}
if dst.is_absolute() {
// Absolute local links (leading slash) require the base_url to
// Absolute local links (leading slash) require the `base_url` to
// define the document root.
let base_dir = get_base_dir(base).unwrap_or_else(|| {
src.to_path_buf()
.parent()
.map_or(PathBuf::new(), Path::to_path_buf)
});
let abs_path = join(base_dir, dst);
let base = get_base_dir(base).ok_or_else(|| {
ErrorKind::InvalidBase(
"<empty>".to_string(),
format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
.to_string(),
)
})?;
let abs_path = join(dirname(&base), dst);
return Ok(normalize(&abs_path));
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
@ -73,37 +87,11 @@ fn join(base: PathBuf, dst: &Path) -> PathBuf {
PathBuf::from(abs)
}
/// A little helper function to remove the get parameters from a URL link.
/// The link is not a URL but a String as that link may not have a base domain.
pub(crate) fn sanitize(link: &str) -> String {
    // Keep everything up to (but excluding) the first `?`;
    // return the link unchanged when there is none.
    match link.find('?') {
        Some(idx) => link[..idx].to_string(),
        None => link.to_string(),
    }
}
#[cfg(test)]
mod test_fs_tree {
mod test_path {
use super::*;
use crate::Result;
#[test]
fn test_sanitize() {
    // (input, expected) pairs covering plain paths, single and chained
    // GET parameters, and full URLs.
    let cases = [
        ("/", "/"),
        ("index.html?foo=bar", "index.html"),
        ("/index.html?foo=bar", "/index.html"),
        ("/index.html?foo=bar&baz=zorx?bla=blub", "/index.html"),
        (
            "https://example.org/index.html?foo=bar",
            "https://example.org/index.html",
        ),
        ("test.png?foo=bar", "test.png"),
    ];
    for (input, expected) in &cases {
        assert_eq!(sanitize(input), expected.to_string());
    }
}
// dummy root
// /path/to/foo.html
#[test]

View file

@ -0,0 +1,68 @@
use linkify::LinkFinder;
/// Remove all GET parameters from a URL.
/// The link is not a URL but a String as it may not have a base domain.
pub(crate) fn remove_get_params(url: &str) -> String {
    // `split` always yields at least one element, so the fallback to the
    // original string is never actually reached.
    url.split('?').next().unwrap_or(url).to_string()
}
/// Determine if an element's attribute contains a link / URL.
///
/// See a comprehensive list of attributes that might contain URLs/URIs
/// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
    // These attributes hold URLs on any element.
    let universal = ["href", "src", "srcset", "cite"].contains(&attr_name);
    // These only hold URLs on one specific element.
    universal
        || (attr_name == "data" && elem_name == "object")
        || (attr_name == "onhashchange" && elem_name == "body")
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
/// An in-page anchor/fragment link begins with `#`.
pub(crate) fn is_anchor(url: &str) -> bool {
    // `#` is ASCII, so checking the first byte is equivalent to
    // checking the first char.
    url.bytes().next() == Some(b'#')
}
// Use `LinkFinder` to offload the raw link searching in plaintext
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
    // Default finder settings; collect all matches eagerly.
    LinkFinder::new().links(input).collect()
}
#[cfg(test)]
mod test_url {
    // NOTE: renamed from `test_fs_tree`, which was copied over from the
    // filesystem helper module and didn't describe these URL tests
    // (mirrors the `test_fs_tree` -> `test_path` rename in `path.rs`).
    use super::*;

    #[test]
    fn test_is_anchor() {
        // Positive: the fragment marker is the very first character.
        assert!(is_anchor("#anchor"));
        assert!(is_anchor("#"));
        // Negative: `#` appearing later in the string, or not at all.
        assert!(!is_anchor("notan#anchor"));
        assert!(!is_anchor(""));
        assert!(!is_anchor("https://example.org/#anchor"));
    }

    #[test]
    fn test_remove_get_params() {
        // (input, expected) pairs covering plain paths, single and
        // chained GET parameters, and full URLs.
        let cases = [
            ("/", "/"),
            ("index.html?foo=bar", "index.html"),
            ("/index.html?foo=bar", "/index.html"),
            ("/index.html?foo=bar&baz=zorx?bla=blub", "/index.html"),
            (
                "https://example.org/index.html?foo=bar",
                "https://example.org/index.html",
            ),
            ("test.png?foo=bar", "test.png"),
        ];
        for (input, expected) in &cases {
            assert_eq!(remove_get_params(input), expected.to_string());
        }
    }
}

View file

@ -50,7 +50,7 @@ mod client;
mod client_pool;
/// A pool of clients, to handle concurrent checks
pub mod collector;
mod fs;
mod helpers;
mod quirks;
mod types;

View file

@ -124,6 +124,7 @@ impl Display for ErrorKind {
uri
),
Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e),
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
}
}
}