Implement excluding code blocks (#523)

This is done in the extractor to avoid unnecessary
allocations.
This commit is contained in:
Matthias 2022-03-26 10:42:56 +01:00 committed by GitHub
parent 5a77209466
commit d616177a99
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 434 additions and 62 deletions

View file

@ -217,6 +217,7 @@ FLAGS:
--exclude-private Exclude private IP address ranges from checking
--glob-ignore-case Ignore case when expanding filesystem path glob inputs
--help Prints help information
--include-verbatim Find links in verbatim sections like `pre`- and `code` blocks
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
-n, --no-progress Do not show progress bar.
This is recommended for non-interactive shells (e.g. for continuous integration)

View file

@ -6,7 +6,8 @@ use std::path::PathBuf;
fn extract(paths: &[PathBuf]) {
for path in paths {
let content: InputContent = path.try_into().unwrap();
let extracted = Extractor::extract(&content);
let extractor = Extractor::default();
let extracted = extractor.extract(&content);
println!("{}", extracted.len());
}
}

View file

@ -6,7 +6,8 @@ use std::fs;
#[tokio::main]
async fn main() -> Result<()> {
let input = fs::read_to_string("fixtures/elvis.html").unwrap();
let links = Extractor::extract(&InputContent::from_string(&input, FileType::Html));
let extractor = Extractor::default();
let links = extractor.extract(&InputContent::from_string(&input, FileType::Html));
println!("{links:#?}");
Ok(())

11
fixtures/TEST_CODE_BLOCKS.md vendored Normal file
View file

@ -0,0 +1,11 @@
# Test Links In Code
```
http://127.0.0.1/block
```
```bash
http://127.0.0.1/bash
```
`http://127.0.0.1/inline` will also be excluded by default

View file

@ -223,6 +223,7 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
.include_verbatim(opts.config.include_verbatim)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)

View file

@ -300,6 +300,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) skip_missing: bool,
/// Find links in verbatim sections like `pre`- and `code` blocks
#[structopt(long)]
#[serde(default)]
pub(crate) include_verbatim: bool,
/// Ignore case when expanding filesystem path glob inputs
#[structopt(long)]
#[serde(default)]
@ -375,6 +380,7 @@ impl Config {
base: None;
basic_auth: None;
skip_missing: false;
include_verbatim: false;
glob_ignore_case: false;
output: None;
require_https: false;

View file

@ -9,7 +9,7 @@ mod cli {
use assert_cmd::Command;
use http::StatusCode;
use predicates::str::contains;
use predicates::str::{contains, is_empty};
use pretty_assertions::assert_eq;
use uuid::Uuid;
@ -603,6 +603,37 @@ mod cli {
Ok(())
}
// With `--include-verbatim`, links inside fenced code blocks, language-tagged
// code blocks, and inline code spans must all appear in `--dump` output.
#[test]
fn test_include_verbatim() -> Result<()> {
let mut cmd = main_command();
// Fixture contains one link per verbatim kind: plain fence, bash fence, inline code.
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");
cmd.arg("--include-verbatim")
.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(contains("http://127.0.0.1/block"))
.stdout(contains("http://127.0.0.1/inline"))
.stdout(contains("http://127.0.0.1/bash"));
Ok(())
}
// Verbatim sections are excluded by default, so dumping the same fixture
// without `--include-verbatim` must produce no output at all.
#[test]
fn test_exclude_verbatim() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");
cmd.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(is_empty());
Ok(())
}
#[test]
fn test_require_https() -> Result<()> {
let mut cmd = main_command();

View file

@ -13,6 +13,7 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
include_verbatim: bool,
use_html5ever: bool,
}
@ -24,6 +25,7 @@ impl Collector {
base,
skip_missing_inputs: false,
use_html5ever: false,
include_verbatim: false,
}
}
@ -41,6 +43,13 @@ impl Collector {
self
}
/// Whether to extract links from verbatim sections (e.g. Markdown code
/// blocks or preformatted HTML elements such as `pre` and `code`).
/// The default is `false`, i.e. such links are skipped.
#[must_use]
pub const fn include_verbatim(mut self, yes: bool) -> Self {
self.include_verbatim = yes;
self
}
/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
@ -63,11 +72,8 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec<RawUri> = extractor.extract(&content);
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}

View file

@ -1,15 +1,17 @@
use html5ever::{
buffer_queue::BufferQueue,
tendril::StrTendril,
tokenizer::{Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
};
use super::plaintext::extract_plaintext;
use super::{is_verbatim_elem, plaintext::extract_plaintext};
use crate::types::raw_uri::RawUri;
#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
include_verbatim: bool,
inside_excluded_element: bool,
}
impl TokenSink for LinkExtractor {
@ -18,20 +20,30 @@ impl TokenSink for LinkExtractor {
#[allow(clippy::match_same_arms)]
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
Token::CharacterTokens(raw) => {
if self.inside_excluded_element {
return TokenSinkResult::Continue;
}
self.links.extend(extract_plaintext(&raw));
}
Token::TagToken(tag) => {
let Tag {
kind: _kind,
kind,
name,
self_closing: _self_closing,
attrs,
} = tag;
if !self.include_verbatim && is_verbatim_elem(&name) {
// Skip content inside excluded elements until we see the end tag.
self.inside_excluded_element = matches!(kind, TagKind::StartTag);
return TokenSinkResult::Continue;
}
for attr in attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
attr.name.local.as_ref(),
name.as_ref(),
attr.value.as_ref(),
&attr.name.local,
&name,
&attr.value,
);
let new_urls = match urls {
@ -61,8 +73,12 @@ impl TokenSink for LinkExtractor {
}
impl LinkExtractor {
pub(crate) fn new() -> Self {
LinkExtractor::default()
/// Build an extractor with an empty link list; `include_verbatim` controls
/// whether text inside verbatim elements is scanned for links.
pub(crate) const fn new(include_verbatim: bool) -> Self {
Self {
links: Vec::new(),
inside_excluded_element: false,
include_verbatim,
}
}
/// Extract all semantically known links from a given html attribute.
@ -75,6 +91,7 @@ impl LinkExtractor {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
@ -115,13 +132,75 @@ impl LinkExtractor {
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
let mut input = BufferQueue::new();
input.push_back(StrTendril::from(buf));
let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
let mut tokenizer = Tokenizer::new(
LinkExtractor::new(include_verbatim),
TokenizerOpts::default(),
);
let _handle = tokenizer.feed(&mut input);
tokenizer.end();
tokenizer.sink.links
}
#[cfg(test)]
mod tests {
use super::*;
const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
</pre>
<p><b>bold</b></p>
</body>
</html>"#;
#[test]
fn test_skip_verbatim() {
let expected = vec![RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}];
let uris = extract_html(HTML_INPUT, false);
assert_eq!(uris, expected);
}
#[test]
fn test_include_verbatim() {
let expected = vec![
RawUri {
text: "https://example.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://foo.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "http://bar.com/some/path".to_string(),
element: None,
attribute: None,
},
];
let uris = extract_html(HTML_INPUT, true);
assert_eq!(uris, expected);
}
}

View file

@ -1,5 +1,6 @@
use html5gum::{Emitter, Error, Tokenizer};
use super::is_verbatim_elem;
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;
@ -13,6 +14,7 @@ struct LinkExtractor {
current_attribute_name: Vec<u8>,
current_attribute_value: Vec<u8>,
last_start_element: Vec<u8>,
include_verbatim: bool,
}
/// this is the same as `std::str::from_utf8_unchecked`, but with extra debug assertions for ease
@ -23,7 +25,7 @@ unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
}
impl LinkExtractor {
pub(crate) const fn new() -> Self {
pub(crate) const fn new(include_verbatim: bool) -> Self {
LinkExtractor {
links: Vec::new(),
current_string: Vec::new(),
@ -32,6 +34,7 @@ impl LinkExtractor {
current_attribute_name: Vec::new(),
current_attribute_value: Vec::new(),
last_start_element: Vec::new(),
include_verbatim,
}
}
@ -45,6 +48,7 @@ impl LinkExtractor {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
@ -85,6 +89,13 @@ impl LinkExtractor {
fn flush_current_characters(&mut self) {
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
if !self.include_verbatim && is_verbatim_elem(name) {
// Early return if we don't want to extract links from preformatted text
self.current_string.clear();
return;
}
let raw = unsafe { from_utf8_unchecked(&self.current_string) };
self.links.extend(extract_plaintext(raw));
self.current_string.clear();
@ -94,6 +105,10 @@ impl LinkExtractor {
{
// safety: since we feed html5gum tokenizer with a &str, this must be a &str as well.
let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
if !self.include_verbatim && is_verbatim_elem(name) {
// Early return if we don't want to extract links from preformatted text
return;
}
let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
@ -199,9 +214,67 @@ impl Emitter for &mut LinkExtractor {
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
let mut extractor = LinkExtractor::new();
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
let mut extractor = LinkExtractor::new(include_verbatim);
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
assert!(tokenizer.next().is_none());
extractor.links
}
#[cfg(test)]
mod tests {
use super::*;
const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
</pre>
<p><b>bold</b></p>
</body>
</html>"#;
#[test]
fn test_skip_verbatim() {
let expected = vec![RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}];
let uris = extract_html(HTML_INPUT, false);
assert_eq!(uris, expected);
}
#[test]
fn test_include_verbatim() {
let expected = vec![
RawUri {
text: "https://example.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://foo.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "http://bar.com/some/path".to_string(),
element: None,
attribute: None,
},
];
let uris = extract_html(HTML_INPUT, true);
assert_eq!(uris, expected);
}
}

View file

@ -1,35 +1,164 @@
use pulldown_cmark::{Event as MDEvent, Parser, Tag};
use pulldown_cmark::{Event, Parser, Tag};
use crate::{extract::plaintext::extract_plaintext, types::raw_uri::RawUri};
use super::html5gum::extract_html;
/// Extract unparsed URL strings from a Markdown string.
pub(crate) fn extract_markdown(input: &str) -> Vec<RawUri> {
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUri> {
// In some cases it is undesirable to extract links from within code blocks,
// which is why we keep track of entries and exits while traversing the input.
let mut inside_code_block = false;
let parser = Parser::new(input);
parser
.flat_map(|event| match event {
MDEvent::Start(Tag::Link(_, uri, _)) => {
vec![RawUri {
.filter_map(|event| match event {
// A link. The first field is the link type, the second the destination URL and the third is a title.
Event::Start(Tag::Link(_, uri, _)) => {
Some(vec![RawUri {
text: uri.to_string(),
// Emulate `<a href="...">` tag here to be compatible with
// HTML links. We might consider using the actual Markdown
// `LinkType` for better granularity in the future
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}]
}])
}
MDEvent::Start(Tag::Image(_, uri, _)) => {
vec![RawUri {
// An image. The first field is the link type, the second the destination URL and the third is a title.
Event::Start(Tag::Image(_, uri, _)) => {
Some(vec![RawUri {
text: uri.to_string(),
// Emulate `<img src="...">` tag here to be compatible with
// HTML links. We might consider using the actual Markdown
// `LinkType` for better granularity in the future
element: Some("img".to_string()),
attribute: Some("src".to_string()),
}]
}])
}
MDEvent::Text(txt) => extract_plaintext(&txt),
MDEvent::Html(html) => extract_plaintext(&html.to_string()),
_ => vec![],
// A code block (inline or fenced).
Event::Start(Tag::CodeBlock(_)) => {
inside_code_block = true;
None
}
Event::End(Tag::CodeBlock(_)) => {
inside_code_block = false;
None
}
// A text node.
Event::Text(txt) => {
if inside_code_block && !include_verbatim {
None
} else {
Some(extract_plaintext(&txt))
}
}
// An HTML node
Event::Html(html) => {
// This won't exclude verbatim links right now, because HTML gets passed in chunks
// by pulldown_cmark. So excluding `<pre>` and `<code>` is not handled right now.
Some(extract_html(&html.to_string(), include_verbatim))
}
// An inline code node.
Event::Code(code) => {
if include_verbatim {
Some(extract_plaintext(&code))
} else {
None
}
}
// Silently skip over other events
_ => None,
})
.flatten()
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
const MD_INPUT: &str = r#"
# Test
Some link in text [here](https://foo.com)
Code:
```bash
https://bar.com/123
```
or inline like `https://bar.org` for instance.
[example](http://example.com)
"#;
#[test]
fn test_skip_verbatim() {
let expected = vec![
RawUri {
text: "https://foo.com".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "http://example.com".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
];
let uris = extract_markdown(MD_INPUT, false);
assert_eq!(uris, expected);
}
#[test]
fn test_include_verbatim() {
let expected = vec![
RawUri {
text: "https://foo.com".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://bar.com/123".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "https://bar.org".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "http://example.com".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
];
let uris = extract_markdown(MD_INPUT, true);
assert_eq!(uris, expected);
}
// Ignored for now: raw HTML inside Markdown reaches the HTML extractor in
// chunks, so `<pre>`/`<code>` exclusion is not yet applied there (see the
// comment on the `Event::Html` arm in `extract_markdown`).
#[test]
#[ignore]
fn test_skip_verbatim_html() {
let input = "
<code>
http://link.com
</code>
<pre>
Some pre-formatted http://pre.com
</pre>";
let expected = vec![];
let uris = extract_markdown(input, false);
assert_eq!(uris, expected);
}
}

View file

@ -1,44 +1,76 @@
use std::collections::HashSet;
use crate::types::{raw_uri::RawUri, FileType, InputContent};
mod html;
mod html5ever;
mod html5gum;
mod markdown;
mod plaintext;
use markdown::extract_markdown;
use once_cell::sync::Lazy;
use plaintext::extract_plaintext;
/// Check if the given element is an HTML element that is deemed verbatim
/// (i.e. preformatted), such as code samples.
///
/// Links inside these elements are excluded from checking by default; see
/// the `include_verbatim` setting. Matching is case-sensitive against
/// lowercase element names, as in the previous set-based implementation.
///
/// A `matches!` over string literals replaces the lazily-initialized
/// `HashSet<String>`: the set allocated seven `String`s at first use and
/// paid hashing plus lazy-init synchronization on every lookup, all for a
/// fixed seven-element list known at compile time.
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
    matches!(
        name,
        "pre" | "code" | "textarea" | "samp" | "xmp" | "plaintext" | "listing"
    )
}
/// A handler for extracting links from various input formats like Markdown and
/// HTML. Allocations should be avoided if possible as this is a
/// performance-critical section of the library.
#[derive(Debug, Clone, Copy)]
pub struct Extractor;
#[derive(Default, Debug, Clone, Copy)]
pub struct Extractor {
use_html5ever: bool,
include_verbatim: bool,
}
impl Extractor {
/// Creates a new extractor
///
/// The extractor can be configured with the following settings:
///
/// - `use_html5ever` enables the alternative HTML parser engine html5ever, that
/// is also used in the Servo browser by Mozilla.
/// The default is `html5gum`, which is more performant and well maintained.
///
/// - `include_verbatim` also extracts links inside Markdown code blocks,
/// which are skipped by default (`false`).
/// These can be denoted as a block starting with three backticks or an indented block.
/// For more information, consult the `pulldown_cmark` documentation about code blocks
/// [here](https://docs.rs/pulldown-cmark/latest/pulldown_cmark/enum.CodeBlockKind.html)
#[must_use]
pub const fn new(use_html5ever: bool, include_verbatim: bool) -> Self {
Self {
use_html5ever,
include_verbatim,
}
}
/// Main entrypoint for extracting links from various sources
/// (Markdown, HTML, and plaintext)
#[must_use]
pub fn extract(input_content: &InputContent) -> Vec<RawUri> {
Self::extract_impl(input_content, false)
}
/// Main entrypoint for extracting links from various sources, legacy implementation using
/// html5ever
/// (Markdown, HTML, and plaintext)
#[must_use]
pub fn extract_html5ever(input_content: &InputContent) -> Vec<RawUri> {
Self::extract_impl(input_content, true)
}
#[must_use]
fn extract_impl(input_content: &InputContent, use_html5ever: bool) -> Vec<RawUri> {
pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
match input_content.file_type {
FileType::Markdown => extract_markdown(&input_content.content),
FileType::Markdown => extract_markdown(&input_content.content, self.include_verbatim),
FileType::Html => {
if use_html5ever {
html::extract_html(&input_content.content)
if self.use_html5ever {
html5ever::extract_html(&input_content.content, self.include_verbatim)
} else {
html5gum::extract_html(&input_content.content)
html5gum::extract_html(&input_content.content, self.include_verbatim)
}
}
FileType::Plaintext => extract_plaintext(&input_content.content),
@ -63,12 +95,16 @@ mod test {
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
let input_content = InputContent::from_string(input, file_type);
let uris_html5gum = Extractor::extract(&input_content)
let extractor = Extractor::new(false, false);
let uris_html5gum = extractor
.extract(&input_content)
.into_iter()
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
.collect();
let uris_html5ever = Extractor::extract_html5ever(&input_content)
let extractor = Extractor::new(true, false);
let uris_html5ever = extractor
.extract(&input_content)
.into_iter()
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
.collect();
@ -183,11 +219,8 @@ mod test {
};
for use_html5ever in [true, false] {
let links = if use_html5ever {
Extractor::extract_html5ever(input_content)
} else {
Extractor::extract(input_content)
};
let extractor = Extractor::new(use_html5ever, false);
let links = extractor.extract(input_content);
let urls = links
.into_iter()