This commit is contained in:
Matthias 2021-09-02 23:10:46 +02:00
parent 5a2e10799f
commit dd3205a87c
8 changed files with 154 additions and 94 deletions

View file

@ -175,6 +175,13 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
let include = RegexSet::new(&cfg.include)?;
let exclude = RegexSet::new(&cfg.exclude)?;
// Offline mode overrides the scheme
let schemes = if cfg.offline {
vec!["file".to_string()]
} else {
cfg.scheme.clone()
};
let client = ClientBuilder::builder()
.includes(include)
.excludes(exclude)
@ -190,7 +197,7 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
.method(method)
.timeout(timeout)
.github_token(cfg.github_token.clone())
.schemes(HashSet::from_iter(cfg.scheme.clone()))
.schemes(HashSet::from_iter(schemes))
.accepted(accepted)
.require_https(cfg.require_https)
.build()

View file

@ -158,6 +158,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) scheme: Vec<String>,
/// Only check local files and block network requests.
#[structopt(long)]
#[serde(default)]
pub(crate) offline: bool,
/// URLs to check (supports regex). Has preference over all excludes.
#[structopt(long)]
#[serde(default)]

View file

@ -4,25 +4,53 @@ use html5ever::{
parse_document,
tendril::{StrTendril, TendrilSink},
};
use linkify::LinkFinder;
use log::info;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use reqwest::Url;
use crate::{
fs,
helpers::{path, url},
types::{FileType, InputContent},
Base, ErrorKind, Input, Request, Result, Uri,
};
// Use LinkFinder here to offload the actual link searching in plaintext.
fn find_links(input: &str) -> Vec<linkify::Link> {
let finder = LinkFinder::new();
finder.links(input).collect()
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
pub(crate) fn extract_links(
    input_content: &InputContent,
    base: &Option<Base>,
) -> Result<HashSet<Request>> {
    // Dispatch on the detected file type to collect raw, unparsed link
    // strings from the document body.
    let links = match input_content.file_type {
        FileType::Markdown => extract_links_from_markdown(&input_content.content),
        FileType::Html => extract_links_from_html(&input_content.content),
        FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
    };

    // Only keep legit URLs. For example this filters out anchors.
    // Inserting into a `HashSet` also deduplicates identical links.
    let mut requests: HashSet<Request> = HashSet::new();
    for link in links {
        // Resolution order matters: (1) the link is already a valid
        // absolute URI, (2) it can be joined onto the provided base URL,
        // (3) it is a path relative to a local filesystem input.
        let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
            Request::new(uri, input_content.input.clone())
        } else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
            Request::new(Uri { inner: new_url }, input_content.input.clone())
        } else if let Input::FsPath(root) = &input_content.input {
            if url::is_anchor(&link) {
                // Silently ignore anchor links for now
                continue;
            }
            let uri = create_uri(root, base, &link)?;
            Request::new(Uri { inner: uri }, input_content.input.clone())
        } else {
            // Relative link from a non-filesystem input that has no base
            // URL set — presumably stdin or raw-string inputs; confirm
            // against the `Input` enum variants.
            info!("Handling of {} not implemented yet", &link);
            continue;
        };
        requests.insert(req);
    }

    Ok(requests)
}
/// Extract unparsed URL strings from a markdown string.
/// Extract unparsed URL strings from a Markdown string.
fn extract_links_from_markdown(input: &str) -> Vec<String> {
let parser = Parser::new(input);
parser
@ -35,15 +63,15 @@ fn extract_links_from_markdown(input: &str) -> Vec<String> {
.collect()
}
/// Extract unparsed URL strings from a HTML string.
/// Extract unparsed URL strings from an HTML string.
fn extract_links_from_html(input: &str) -> Vec<String> {
let tendril = StrTendril::from(input);
let rc_dom = parse_document(RcDom::default(), html5ever::ParseOpts::default()).one(tendril);
let mut urls = Vec::new();
// we pass mutable urls reference to avoid extra allocations in each
// recursive descent
// We pass mutable URL references here to avoid
// extra allocations in each recursive descent
walk_html_links(&mut urls, &rc_dom.document);
urls
@ -68,7 +96,7 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
for attr in attrs.borrow().iter() {
let attr_value = attr.value.to_string();
if elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
urls.push(attr_value);
} else {
urls.append(&mut extract_links_from_plaintext(&attr_value));
@ -80,63 +108,24 @@ fn walk_html_links(mut urls: &mut Vec<String>, node: &Handle) {
}
// recursively traverse the document's nodes -- this doesn't need any extra
// exit conditions because the document is a tree
// exit conditions, because the document is a tree
for child in node.children.borrow().iter() {
walk_html_links(&mut urls, child);
}
}
/// Determine if element's attribute contains a link / URL.
///
/// See a comprehensive list of attributes that might contain URLs/URIs
/// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
    match attr_name {
        // These attributes hold URLs regardless of the element they're on.
        "href" | "src" | "srcset" | "cite" => true,
        // These only hold URLs on one specific element.
        "data" => elem_name == "object",
        "onhashchange" => elem_name == "body",
        _ => false,
    }
}
/// Extract unparsed URL strings from a plaintext.
/// Extract unparsed URL strings from plaintext
fn extract_links_from_plaintext(input: &str) -> Vec<String> {
find_links(input)
url::find_links(input)
.iter()
.map(|l| String::from(l.as_str()))
.collect()
}
pub(crate) fn extract_links(
input_content: &InputContent,
base: &Option<Base>,
) -> Result<HashSet<Request>> {
let links = match input_content.file_type {
FileType::Markdown => extract_links_from_markdown(&input_content.content),
FileType::Html => extract_links_from_html(&input_content.content),
FileType::Plaintext => extract_links_from_plaintext(&input_content.content),
};
// Only keep legit URLs. For example this filters out anchors.
let mut requests: HashSet<Request> = HashSet::new();
for link in links {
let req = if let Ok(uri) = Uri::try_from(link.as_str()) {
Request::new(uri, input_content.input.clone())
} else if let Some(new_url) = base.as_ref().and_then(|u| u.join(&link)) {
Request::new(Uri { inner: new_url }, input_content.input.clone())
} else if let Input::FsPath(root) = &input_content.input {
let link = fs::sanitize(&link);
if link.starts_with('#') {
// Silently ignore anchors for now.
continue;
}
let path = fs::resolve(&root, &PathBuf::from(&link), base)?;
let uri = Url::from_file_path(&path).map_err(|_e| ErrorKind::InvalidPath(path))?;
Request::new(Uri { inner: uri }, input_content.input.clone())
} else {
info!("Handling of {} not implemented yet", &link);
continue;
};
requests.insert(req);
}
Ok(requests)
/// Create a `file://` URL for a local `link`, resolved against the
/// document `root` (and the optional `base` directory).
///
/// GET parameters are stripped first, since they carry no meaning for
/// files on disk.
///
/// # Errors
///
/// Returns an error if the path cannot be resolved, or `InvalidPath`
/// if the resolved path cannot be turned into a URL.
fn create_uri(root: &PathBuf, base: &Option<Base>, link: &str) -> Result<Url> {
    // `link` is already a `&str`; no extra borrow needed.
    let link = url::remove_get_params(link);
    let path = path::resolve(root, &PathBuf::from(&link), base)?;
    // `map_err` + `into` instead of the redundant `Ok(expr?)` pattern
    // (clippy: needless_question_mark).
    Url::from_file_path(&path).map_err(|_| ErrorKind::InvalidPath(path).into())
}
#[cfg(test)]
@ -150,10 +139,10 @@ mod test {
};
use pretty_assertions::assert_eq;
use url::Url;
use super::*;
use crate::{
helpers::url::find_links,
test_utils::{mail, website},
Uri,
};

View file

@ -0,0 +1,2 @@
/// Filesystem path helpers (normalization, resolution, dirname).
pub(crate) mod path;
/// URL/link helpers (anchors, GET-parameter stripping, link finding,
/// link-bearing HTML attributes).
pub(crate) mod url;

View file

@ -42,6 +42,18 @@ pub(crate) fn normalize(path: &Path) -> PathBuf {
ret
}
/// Get the parent directory of a given `Path`.
///
/// If `src` is an existing file, its containing directory is returned
/// (or an empty path when it has no parent). Otherwise `src` itself is
/// returned unchanged.
///
/// NOTE(review): a path that does not exist on disk is not `is_file()`,
/// so it is returned as-is — confirm this is intended for dangling links.
fn dirname(src: &Path) -> PathBuf {
    if src.is_file() {
        // No need for the intermediate `to_path_buf()` allocation the
        // previous version performed before calling `parent()`.
        src.parent().map_or_else(PathBuf::new, Path::to_path_buf)
    } else {
        src.to_path_buf()
    }
}
// Resolve `dst` that was linked to from within `src`
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<PathBuf> {
if dst.is_relative() {
// Find `dst` in the parent directory of `src`
@ -51,14 +63,16 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Pat
}
}
if dst.is_absolute() {
// Absolute local links (leading slash) require the base_url to
// Absolute local links (leading slash) require the `base_url` to
// define the document root.
let base_dir = get_base_dir(base).unwrap_or_else(|| {
src.to_path_buf()
.parent()
.map_or(PathBuf::new(), Path::to_path_buf)
});
let abs_path = join(base_dir, dst);
let base = get_base_dir(base).ok_or_else(|| {
ErrorKind::InvalidBase(
"<empty>".to_string(),
format!("Found absolute local link {:?} but no base directory was set. Set with `--base`.", dst)
.to_string(),
)
})?;
let abs_path = join(dirname(&base), dst);
return Ok(normalize(&abs_path));
}
Err(ErrorKind::FileNotFound(dst.to_path_buf()))
@ -73,37 +87,11 @@ fn join(base: PathBuf, dst: &Path) -> PathBuf {
PathBuf::from(abs)
}
/// A little helper function to remove the get parameters from a URL link.
/// The link is not a URL but a String as that link may not have a base domain.
pub(crate) fn sanitize(link: &str) -> String {
    // Keep everything up to (but excluding) the first `?`;
    // return the link unchanged when there is none.
    match link.find('?') {
        Some(idx) => link[..idx].to_string(),
        None => link.to_string(),
    }
}
#[cfg(test)]
mod test_fs_tree {
mod test_path {
use super::*;
use crate::Result;
#[test]
fn test_sanitize() {
    // (input, expected) pairs covering plain paths, single and chained
    // GET parameters, and full URLs.
    let cases = [
        ("/", "/"),
        ("index.html?foo=bar", "index.html"),
        ("/index.html?foo=bar", "/index.html"),
        ("/index.html?foo=bar&baz=zorx?bla=blub", "/index.html"),
        (
            "https://example.org/index.html?foo=bar",
            "https://example.org/index.html",
        ),
        ("test.png?foo=bar", "test.png"),
    ];
    for (input, expected) in &cases {
        assert_eq!(sanitize(input), expected.to_string());
    }
}
// dummy root
// /path/to/foo.html
#[test]

View file

@ -0,0 +1,68 @@
use linkify::LinkFinder;
/// Remove all GET parameters from a URL.
/// The link is not a URL but a String as it may not have a base domain.
pub(crate) fn remove_get_params(url: &str) -> String {
    // `split` always yields at least one element, so the fallback to the
    // original string is never actually reached.
    url.split('?').next().unwrap_or(url).to_string()
}
/// Determine if an element's attribute contains a link / URL.
///
/// See a comprehensive list of attributes that might contain URLs/URIs
/// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
    // These attributes hold URLs on any element.
    let universal = ["href", "src", "srcset", "cite"].contains(&attr_name);
    // These only hold URLs on one specific element.
    universal
        || (attr_name == "data" && elem_name == "object")
        || (attr_name == "onhashchange" && elem_name == "body")
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
/// An in-page anchor/fragment link begins with `#`.
pub(crate) fn is_anchor(url: &str) -> bool {
    // `#` is ASCII, so checking the first byte is equivalent to
    // checking the first char.
    url.bytes().next() == Some(b'#')
}
// Use `LinkFinder` to offload the raw link searching in plaintext
pub(crate) fn find_links(input: &str) -> Vec<linkify::Link> {
    // Default finder settings; collect all matches eagerly.
    LinkFinder::new().links(input).collect()
}
#[cfg(test)]
mod test_url {
    // NOTE: renamed from `test_fs_tree`, which was copied over from the
    // filesystem helper module and didn't describe these URL tests
    // (mirrors the `test_fs_tree` -> `test_path` rename in `path.rs`).
    use super::*;

    #[test]
    fn test_is_anchor() {
        // Positive: the fragment marker is the very first character.
        assert!(is_anchor("#anchor"));
        assert!(is_anchor("#"));
        // Negative: `#` appearing later in the string, or not at all.
        assert!(!is_anchor("notan#anchor"));
        assert!(!is_anchor(""));
        assert!(!is_anchor("https://example.org/#anchor"));
    }

    #[test]
    fn test_remove_get_params() {
        // (input, expected) pairs covering plain paths, single and
        // chained GET parameters, and full URLs.
        let cases = [
            ("/", "/"),
            ("index.html?foo=bar", "index.html"),
            ("/index.html?foo=bar", "/index.html"),
            ("/index.html?foo=bar&baz=zorx?bla=blub", "/index.html"),
            (
                "https://example.org/index.html?foo=bar",
                "https://example.org/index.html",
            ),
            ("test.png?foo=bar", "test.png"),
        ];
        for (input, expected) in &cases {
            assert_eq!(remove_get_params(input), expected.to_string());
        }
    }
}

View file

@ -50,7 +50,7 @@ mod client;
mod client_pool;
/// A pool of clients, to handle concurrent checks
pub mod collector;
mod fs;
mod helpers;
mod quirks;
mod types;

View file

@ -124,6 +124,7 @@ impl Display for ErrorKind {
uri
),
Self::InvalidBase(base, e) => write!(f, "Error while base dir `{}` : {}", base, e),
Self::InvalidBase(base, e) => write!(f, "Error with base dir `{}` : {}", base, e),
}
}
}