diff --git a/README.md b/README.md index c9d929b..4abe588 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,7 @@ FLAGS: --exclude-private Exclude private IP address ranges from checking --glob-ignore-case Ignore case when expanding filesystem path glob inputs --help Prints help information + --include-verbatim Find links in verbatim sections like `pre`- and `code` blocks -i, --insecure Proceed for server connections considered insecure (invalid TLS) -n, --no-progress Do not show progress bar. This is recommended for non-interactive shells (e.g. for continuous integration) diff --git a/benches/src/extract.rs b/benches/src/extract.rs index 9482a48..352a59c 100644 --- a/benches/src/extract.rs +++ b/benches/src/extract.rs @@ -6,7 +6,8 @@ use std::path::PathBuf; fn extract(paths: &[PathBuf]) { for path in paths { let content: InputContent = path.try_into().unwrap(); - let extracted = Extractor::extract(&content); + let extractor = Extractor::default(); + let extracted = extractor.extract(&content); println!("{}", extracted.len()); } } diff --git a/examples/extract/extract.rs b/examples/extract/extract.rs index 4cecd29..0090b8f 100644 --- a/examples/extract/extract.rs +++ b/examples/extract/extract.rs @@ -6,7 +6,8 @@ use std::fs; #[tokio::main] async fn main() -> Result<()> { let input = fs::read_to_string("fixtures/elvis.html").unwrap(); - let links = Extractor::extract(&InputContent::from_string(&input, FileType::Html)); + let extractor = Extractor::default(); + let links = extractor.extract(&InputContent::from_string(&input, FileType::Html)); println!("{links:#?}"); Ok(()) diff --git a/fixtures/TEST_CODE_BLOCKS.md b/fixtures/TEST_CODE_BLOCKS.md new file mode 100644 index 0000000..efc04fc --- /dev/null +++ b/fixtures/TEST_CODE_BLOCKS.md @@ -0,0 +1,11 @@ +# Test Links In Code + +``` +http://127.0.0.1/block +``` + +```bash +http://127.0.0.1/bash +``` + +`http://127.0.0.1/inline` will also be excluded by default diff --git a/lychee-bin/src/main.rs 
b/lychee-bin/src/main.rs index 18a17bb..03c1753 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -223,6 +223,7 @@ async fn run(opts: &LycheeOptions) -> Result { let inputs = opts.inputs(); let requests = Collector::new(opts.config.base.clone()) .skip_missing_inputs(opts.config.skip_missing) + .include_verbatim(opts.config.include_verbatim) // File a bug if you rely on this envvar! It's going to go away eventually. .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1")) .collect_links(inputs) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index f4a08ac..85d5a11 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -300,6 +300,11 @@ pub(crate) struct Config { #[serde(default)] pub(crate) skip_missing: bool, + /// Find links in verbatim sections like `pre`- and `code` blocks + #[structopt(long)] + #[serde(default)] + pub(crate) include_verbatim: bool, + /// Ignore case when expanding filesystem path glob inputs #[structopt(long)] #[serde(default)] @@ -375,6 +380,7 @@ impl Config { base: None; basic_auth: None; skip_missing: false; + include_verbatim: false; glob_ignore_case: false; output: None; require_https: false; diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 5c7493a..e3a6d47 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -9,7 +9,7 @@ mod cli { use assert_cmd::Command; use http::StatusCode; - use predicates::str::contains; + use predicates::str::{contains, is_empty}; use pretty_assertions::assert_eq; use uuid::Uuid; @@ -603,6 +603,37 @@ mod cli { Ok(()) } + #[test] + fn test_include_verbatim() -> Result<()> { + let mut cmd = main_command(); + let input = fixtures_path().join("TEST_CODE_BLOCKS.md"); + + cmd.arg("--include-verbatim") + .arg(input) + .arg("--dump") + .assert() + .success() + .stdout(contains("http://127.0.0.1/block")) + .stdout(contains("http://127.0.0.1/inline")) + .stdout(contains("http://127.0.0.1/bash")); + + Ok(()) + 
} + + #[test] + fn test_exclude_verbatim() -> Result<()> { + let mut cmd = main_command(); + let input = fixtures_path().join("TEST_CODE_BLOCKS.md"); + + cmd.arg(input) + .arg("--dump") + .assert() + .success() + .stdout(is_empty()); + + Ok(()) + } + #[test] fn test_require_https() -> Result<()> { let mut cmd = main_command(); diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 34ec930..3bc4387 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -13,6 +13,7 @@ use par_stream::ParStreamExt; pub struct Collector { base: Option, skip_missing_inputs: bool, + include_verbatim: bool, use_html5ever: bool, } @@ -24,6 +25,7 @@ impl Collector { base, skip_missing_inputs: false, use_html5ever: false, + include_verbatim: false, } } @@ -41,6 +43,13 @@ impl Collector { self } + /// Skip over links in verbatim sections (like Markdown code blocks) + #[must_use] + pub const fn include_verbatim(mut self, yes: bool) -> Self { + self.include_verbatim = yes; + self + } + /// Fetch all unique links from inputs /// All relative URLs get prefixed with `base` (if given). 
/// (This can be a directory or a base URL) @@ -63,11 +72,8 @@ impl Collector { let base = base.clone(); async move { let content = content?; - let uris: Vec = if self.use_html5ever { - Extractor::extract_html5ever(&content) - } else { - Extractor::extract(&content) - }; + let extractor = Extractor::new(self.use_html5ever, self.include_verbatim); + let uris: Vec = extractor.extract(&content); let requests = request::create(uris, &content, &base)?; Result::Ok(stream::iter(requests.into_iter().map(Ok))) } diff --git a/lychee-lib/src/extract/html.rs b/lychee-lib/src/extract/html5ever.rs similarity index 56% rename from lychee-lib/src/extract/html.rs rename to lychee-lib/src/extract/html5ever.rs index caf69bc..5f5cd53 100644 --- a/lychee-lib/src/extract/html.rs +++ b/lychee-lib/src/extract/html5ever.rs @@ -1,15 +1,17 @@ use html5ever::{ buffer_queue::BufferQueue, tendril::StrTendril, - tokenizer::{Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}, + tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}, }; -use super::plaintext::extract_plaintext; +use super::{is_verbatim_elem, plaintext::extract_plaintext}; use crate::types::raw_uri::RawUri; #[derive(Clone, Default)] struct LinkExtractor { links: Vec, + include_verbatim: bool, + inside_excluded_element: bool, } impl TokenSink for LinkExtractor { @@ -18,20 +20,30 @@ impl TokenSink for LinkExtractor { #[allow(clippy::match_same_arms)] fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> { match token { - Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)), + Token::CharacterTokens(raw) => { + if self.inside_excluded_element { + return TokenSinkResult::Continue; + } + self.links.extend(extract_plaintext(&raw)); + } Token::TagToken(tag) => { let Tag { - kind: _kind, + kind, name, self_closing: _self_closing, attrs, } = tag; + if !self.include_verbatim && is_verbatim_elem(&name) { + // Skip content inside excluded elements 
until we see the end tag. + self.inside_excluded_element = matches!(kind, TagKind::StartTag); + return TokenSinkResult::Continue; + } for attr in attrs { let urls = LinkExtractor::extract_urls_from_elem_attr( - attr.name.local.as_ref(), - name.as_ref(), - attr.value.as_ref(), + &attr.name.local, + &name, + &attr.value, ); let new_urls = match urls { @@ -61,8 +73,12 @@ impl TokenSink for LinkExtractor { } impl LinkExtractor { - pub(crate) fn new() -> Self { - LinkExtractor::default() + pub(crate) const fn new(include_verbatim: bool) -> Self { + Self { + links: vec![], + include_verbatim, + inside_excluded_element: false, + } } /// Extract all semantically known links from a given html attribute. @@ -75,6 +91,7 @@ impl LinkExtractor { // For a comprehensive list of elements that might contain URLs/URIs // see https://www.w3.org/TR/REC-html40/index/attributes.html // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1 + match (elem_name, attr_name) { // Common element/attribute combinations for links (_, "href" | "src" | "cite" | "usemap") @@ -115,13 +132,75 @@ impl LinkExtractor { } /// Extract unparsed URL strings from an HTML string. -pub(crate) fn extract_html(buf: &str) -> Vec { +pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { let mut input = BufferQueue::new(); input.push_back(StrTendril::from(buf)); - let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default()); + let mut tokenizer = Tokenizer::new( + LinkExtractor::new(include_verbatim), + TokenizerOpts::default(), + ); let _handle = tokenizer.feed(&mut input); tokenizer.end(); tokenizer.sink.links } + +#[cfg(test)] +mod tests { + use super::*; + + const HTML_INPUT: &str = r#" + + +
+<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
+<pre>
+        Some random text
+        https://foo.com and http://bar.com/some/path
+        Something else
+</pre>
+<p><b>bold</b></p>
+</body>
+</html>"#;
see https://www.w3.org/TR/REC-html40/index/attributes.html // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1 + match (elem_name, attr_name) { // Common element/attribute combinations for links (_, "href" | "src" | "cite" | "usemap") @@ -85,6 +89,13 @@ impl LinkExtractor { fn flush_current_characters(&mut self) { // safety: since we feed html5gum tokenizer with a &str, this must be a &str as well. + let name = unsafe { from_utf8_unchecked(&self.current_element_name) }; + if !self.include_verbatim && is_verbatim_elem(name) { + // Early return if we don't want to extract links from preformatted text + self.current_string.clear(); + return; + } + let raw = unsafe { from_utf8_unchecked(&self.current_string) }; self.links.extend(extract_plaintext(raw)); self.current_string.clear(); @@ -94,6 +105,10 @@ impl LinkExtractor { { // safety: since we feed html5gum tokenizer with a &str, this must be a &str as well. let name = unsafe { from_utf8_unchecked(&self.current_element_name) }; + if !self.include_verbatim && is_verbatim_elem(name) { + // Early return if we don't want to extract links from preformatted text + return; + } let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) }; let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) }; @@ -199,9 +214,67 @@ impl Emitter for &mut LinkExtractor { } /// Extract unparsed URL strings from an HTML string. -pub(crate) fn extract_html(buf: &str) -> Vec { - let mut extractor = LinkExtractor::new(); +pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { + let mut extractor = LinkExtractor::new(include_verbatim); let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible(); assert!(tokenizer.next().is_none()); extractor.links } +#[cfg(test)] +mod tests { + use super::*; + + const HTML_INPUT: &str = r#" + + +
+<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
+<pre>
+        Some random text
+        https://foo.com and http://bar.com/some/path
+        Something else
+</pre>
+<p><b>bold</b></p>
+ +"#; + + #[test] + fn test_skip_verbatim() { + let expected = vec![RawUri { + text: "https://example.org".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + + let uris = extract_html(HTML_INPUT, false); + assert_eq!(uris, expected); + } + + #[test] + fn test_include_verbatim() { + let expected = vec![ + RawUri { + text: "https://example.com".to_string(), + element: None, + attribute: None, + }, + RawUri { + text: "https://example.org".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }, + RawUri { + text: "https://foo.com".to_string(), + element: None, + attribute: None, + }, + RawUri { + text: "http://bar.com/some/path".to_string(), + element: None, + attribute: None, + }, + ]; + + let uris = extract_html(HTML_INPUT, true); + assert_eq!(uris, expected); + } +} diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 85ff11f..57b9e40 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,35 +1,164 @@ -use pulldown_cmark::{Event as MDEvent, Parser, Tag}; +use pulldown_cmark::{Event, Parser, Tag}; use crate::{extract::plaintext::extract_plaintext, types::raw_uri::RawUri}; +use super::html5gum::extract_html; + /// Extract unparsed URL strings from a Markdown string. -pub(crate) fn extract_markdown(input: &str) -> Vec { +pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec { + // In some cases it is undesirable to extract links from within code blocks, + // which is why we keep track of entries and exits while traversing the input. + let mut inside_code_block = false; + let parser = Parser::new(input); parser - .flat_map(|event| match event { - MDEvent::Start(Tag::Link(_, uri, _)) => { - vec![RawUri { + .filter_map(|event| match event { + // A link. The first field is the link type, the second the destination URL and the third is a title. 
+ Event::Start(Tag::Link(_, uri, _)) => { + Some(vec![RawUri { text: uri.to_string(), // Emulate `` tag here to be compatible with // HTML links. We might consider using the actual Markdown // `LinkType` for better granularity in the future element: Some("a".to_string()), attribute: Some("href".to_string()), - }] + }]) } - MDEvent::Start(Tag::Image(_, uri, _)) => { - vec![RawUri { + // An image. The first field is the link type, the second the destination URL and the third is a title. + Event::Start(Tag::Image(_, uri, _)) => { + Some(vec![RawUri { text: uri.to_string(), // Emulate `` tag here to be compatible with // HTML links. We might consider using the actual Markdown // `LinkType` for better granularity in the future element: Some("img".to_string()), attribute: Some("src".to_string()), - }] + }]) } - MDEvent::Text(txt) => extract_plaintext(&txt), - MDEvent::Html(html) => extract_plaintext(&html.to_string()), - _ => vec![], + // A code block (inline or fenced). + Event::Start(Tag::CodeBlock(_)) => { + inside_code_block = true; + None + } + Event::End(Tag::CodeBlock(_)) => { + inside_code_block = false; + None + } + + // A text node. + Event::Text(txt) => { + if inside_code_block && !include_verbatim { + None + } else { + Some(extract_plaintext(&txt)) + } + } + + // An HTML node + Event::Html(html) => { + // This won't exclude verbatim links right now, because HTML gets passed in chunks + // by pulldown_cmark. So excluding `
<pre>` and `<code>` is not handled right now.
+                Some(extract_html(&html.to_string(), include_verbatim))
+            }
+
+            // An inline code node.
+            Event::Code(code) => {
+                if include_verbatim {
+                    Some(extract_plaintext(&code))
+                } else {
+                    None
+                }
+            }
+
+            // Silently skip over other events
+            _ => None,
         })
+        .flatten()
         .collect()
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const MD_INPUT: &str = r#"
+# Test
+
+Some link in text [here](https://foo.com)
+
+Code:
+
+```bash
+https://bar.com/123
+```
+
+or inline like `https://bar.org` for instance.
+
+[example](http://example.com)
+        "#;
+
+    #[test]
+    fn test_skip_verbatim() {
+        let expected = vec![
+            RawUri {
+                text: "https://foo.com".to_string(),
+                element: Some("a".to_string()),
+                attribute: Some("href".to_string()),
+            },
+            RawUri {
+                text: "http://example.com".to_string(),
+                element: Some("a".to_string()),
+                attribute: Some("href".to_string()),
+            },
+        ];
+
+        let uris = extract_markdown(MD_INPUT, false);
+        assert_eq!(uris, expected);
+    }
+
+    #[test]
+    fn test_include_verbatim() {
+        let expected = vec![
+            RawUri {
+                text: "https://foo.com".to_string(),
+                element: Some("a".to_string()),
+                attribute: Some("href".to_string()),
+            },
+            RawUri {
+                text: "https://bar.com/123".to_string(),
+                element: None,
+                attribute: None,
+            },
+            RawUri {
+                text: "https://bar.org".to_string(),
+                element: None,
+                attribute: None,
+            },
+            RawUri {
+                text: "http://example.com".to_string(),
+                element: Some("a".to_string()),
+                attribute: Some("href".to_string()),
+            },
+        ];
+
+        let uris = extract_markdown(MD_INPUT, true);
+        assert_eq!(uris, expected);
+    }
+
+    #[test]
+    #[ignore]
+    fn test_skip_verbatim_html() {
+        let input = " 
+<code>
+http://link.com
+</code>
+<pre>
+Some pre-formatted http://pre.com
+</pre>
"; + + let expected = vec![]; + + let uris = extract_markdown(input, false); + assert_eq!(uris, expected); + } +} diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs index f3bde05..ec54609 100644 --- a/lychee-lib/src/extract/mod.rs +++ b/lychee-lib/src/extract/mod.rs @@ -1,44 +1,76 @@ +use std::collections::HashSet; + use crate::types::{raw_uri::RawUri, FileType, InputContent}; -mod html; +mod html5ever; mod html5gum; mod markdown; mod plaintext; use markdown::extract_markdown; +use once_cell::sync::Lazy; use plaintext::extract_plaintext; +/// HTML elements that are deemed verbatim (i.e. preformatted). +/// These will be excluded from link checking by default. +static VERBATIM_ELEMENTS: Lazy> = Lazy::new(|| { + HashSet::from_iter([ + "pre".into(), + "code".into(), + "textarea".into(), + "samp".into(), + "xmp".into(), + "plaintext".into(), + "listing".into(), + ]) +}); + +/// Check if the given element is in the list of preformatted tags +pub(crate) fn is_verbatim_elem(name: &str) -> bool { + VERBATIM_ELEMENTS.contains(name) +} + /// A handler for extracting links from various input formats like Markdown and /// HTML. Allocations should be avoided if possible as this is a /// performance-critical section of the library. -#[derive(Debug, Clone, Copy)] -pub struct Extractor; +#[derive(Default, Debug, Clone, Copy)] +pub struct Extractor { + use_html5ever: bool, + include_verbatim: bool, +} impl Extractor { + /// Creates a new extractor + /// + /// The extractor can be configured with the following settings: + /// + /// - `use_html5ever` enables the alternative HTML parser engine html5ever, that + /// is also used in the Servo browser by Mozilla. + /// The default is `html5gum`, which is more performant and well maintained. + /// + /// - `include_verbatim` ignores links inside Markdown code blocks. + /// These can be denoted as a block starting with three backticks or an indented block. 
+ /// For more information, consult the `pulldown_cmark` documentation about code blocks + /// [here](https://docs.rs/pulldown-cmark/latest/pulldown_cmark/enum.CodeBlockKind.html) + #[must_use] + pub const fn new(use_html5ever: bool, include_verbatim: bool) -> Self { + Self { + use_html5ever, + include_verbatim, + } + } + /// Main entrypoint for extracting links from various sources /// (Markdown, HTML, and plaintext) #[must_use] - pub fn extract(input_content: &InputContent) -> Vec { - Self::extract_impl(input_content, false) - } - - /// Main entrypoint for extracting links from various sources, legacy implementation using - /// html5ever - /// (Markdown, HTML, and plaintext) - #[must_use] - pub fn extract_html5ever(input_content: &InputContent) -> Vec { - Self::extract_impl(input_content, true) - } - - #[must_use] - fn extract_impl(input_content: &InputContent, use_html5ever: bool) -> Vec { + pub fn extract(&self, input_content: &InputContent) -> Vec { match input_content.file_type { - FileType::Markdown => extract_markdown(&input_content.content), + FileType::Markdown => extract_markdown(&input_content.content, self.include_verbatim), FileType::Html => { - if use_html5ever { - html::extract_html(&input_content.content) + if self.use_html5ever { + html5ever::extract_html(&input_content.content, self.include_verbatim) } else { - html5gum::extract_html(&input_content.content) + html5gum::extract_html(&input_content.content, self.include_verbatim) } } FileType::Plaintext => extract_plaintext(&input_content.content), @@ -63,12 +95,16 @@ mod test { fn extract_uris(input: &str, file_type: FileType) -> HashSet { let input_content = InputContent::from_string(input, file_type); - let uris_html5gum = Extractor::extract(&input_content) + let extractor = Extractor::new(false, false); + let uris_html5gum = extractor + .extract(&input_content) .into_iter() .filter_map(|raw_uri| Uri::try_from(raw_uri).ok()) .collect(); - let uris_html5ever = 
Extractor::extract_html5ever(&input_content) + let extractor = Extractor::new(true, false); + let uris_html5ever = extractor + .extract(&input_content) .into_iter() .filter_map(|raw_uri| Uri::try_from(raw_uri).ok()) .collect(); @@ -183,11 +219,8 @@ mod test { }; for use_html5ever in [true, false] { - let links = if use_html5ever { - Extractor::extract_html5ever(input_content) - } else { - Extractor::extract(input_content) - }; + let extractor = Extractor::new(use_html5ever, false); + let links = extractor.extract(input_content); let urls = links .into_iter()