Add html5gum as alternative link extractor (#480)

html5gum is an HTML parser that offers lower-level control over which tokens actually get created and are tracked. As such, the extractor doesn't allocate any tokens it doesn't care about. On some benchmarks this provides a substantial performance boost. The old parser, html5ever, is still available by setting the `LYCHEE_USE_HTML5EVER=1` env var.
This commit is contained in:
Markus Unterwaditzer 2022-02-07 22:54:47 +01:00 committed by GitHub
parent 6bf8c1fe39
commit 68d09f7e5b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 326 additions and 53 deletions

16
Cargo.lock generated
View file

@ -1410,6 +1410,15 @@ dependencies = [
"syn",
]
[[package]]
name = "html5gum"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dad48b66db55322add2819ae1d7bda0c32f3415269a08330679dbc8b0afeb30"
dependencies = [
"jetscii",
]
[[package]]
name = "http"
version = "0.2.6"
@ -1638,6 +1647,12 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35"
[[package]]
name = "jetscii"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9447923c57a8a2d5c1b0875cdf96a6324275df728b498f2ede0e5cbde088a15"
[[package]]
name = "js-sys"
version = "0.3.55"
@ -1812,6 +1827,7 @@ dependencies = [
"futures",
"glob",
"html5ever",
"html5gum",
"http",
"hubcaps",
"jwalk",

View file

@ -20,16 +20,13 @@ async fn main() -> Result<()> {
},
];
let links = Collector::new(
None, // base
false, // don't skip missing inputs
)
.collect_links(
inputs, // base url or directory
)
.await
.collect::<Result<Vec<_>>>()
.await?;
let links = Collector::new(None) // base
.skip_missing_inputs(false) // don't skip missing inputs? (default=false)
.use_html5ever(false) // use html5ever for parsing? (default=false)
.collect_links(inputs) // base url or directory
.await
.collect::<Result<Vec<_>>>()
.await?;
dbg!(links);

View file

@ -200,7 +200,10 @@ fn run_main() -> Result<i32> {
/// Run lychee on the given inputs
async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone(), opts.config.skip_missing)
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)
.await;

View file

@ -20,7 +20,6 @@ version = "0.8.2"
check-if-email-exists = "0.8.26"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"
http = "0.2.6"
hubcaps = "0.6.2"
linkify = "0.8.0"
@ -50,6 +49,8 @@ once_cell = "1.9.0"
thiserror = "1.0.30"
futures = "0.3.19"
lazy_static = "1.4.0"
html5ever = "0.25.1"
html5gum = "0.4.0"
[dependencies.par-stream]
version = "0.10.0"

View file

@ -13,18 +13,34 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
use_html5ever: bool,
}
impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub const fn new(base: Option<Base>, skip_missing_inputs: bool) -> Self {
pub const fn new(base: Option<Base>) -> Self {
Collector {
base,
skip_missing_inputs,
skip_missing_inputs: false,
use_html5ever: false,
}
}
/// Skip missing input files (default is to error if they don't exist)
#[must_use]
pub const fn skip_missing_inputs(mut self, yes: bool) -> Self {
self.skip_missing_inputs = yes;
self
}
/// Use `html5ever` to parse HTML instead of `html5gum`.
#[must_use]
pub const fn use_html5ever(mut self, yes: bool) -> Self {
self.use_html5ever = yes;
self
}
/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
@ -47,7 +63,11 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = Extractor::extract(&content);
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
@ -74,7 +94,7 @@ mod test {
// Helper function to run the collector on the given inputs
async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
let responses = Collector::new(base, false).collect_links(inputs).await;
let responses = Collector::new(base).collect_links(inputs).await;
responses.map(|r| r.unwrap().uri).collect().await
}

View file

@ -7,7 +7,7 @@ use html5ever::{
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;
#[derive(Clone)]
#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
}
@ -61,8 +61,8 @@ impl TokenSink for LinkExtractor {
}
impl LinkExtractor {
pub(crate) const fn new() -> Self {
Self { links: Vec::new() }
pub(crate) fn new() -> Self {
LinkExtractor::default()
}
/// Extract all semantically known links from a given html attribute.
@ -125,20 +125,3 @@ pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
tokenizer.sink.links
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_link_at_end_of_line() {
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();
let uris: Vec<String> = extract_html(input)
.into_iter()
.map(|raw_uri| raw_uri.text)
.collect();
assert_eq!(vec![link.to_string()], uris);
}
}

View file

@ -0,0 +1,207 @@
use html5gum::{Emitter, Error, Tokenizer};
use super::plaintext::extract_plaintext;
use crate::types::raw_uri::RawUri;
/// Streaming link extractor driven by the `html5gum` tokenizer.
///
/// Rather than consuming a token stream, this type implements
/// `html5gum::Emitter` so tag/attribute bytes are accumulated in the
/// buffers below and turned into [`RawUri`]s on the fly.
#[derive(Clone)]
struct LinkExtractor {
    // note: what html5gum calls a tag, lychee calls an element
    /// All links collected so far (from URL-bearing attributes and from
    /// plaintext scanning of character data and non-URL attribute values).
    links: Vec<RawUri>,
    /// Character data accumulated since the last flush.
    current_string: Vec<u8>,
    /// Name of the element (tag) currently being tokenized.
    current_element_name: Vec<u8>,
    /// Whether the current tag is a closing (`</...>`) or self-closing tag.
    current_element_is_closing: bool,
    /// Name of the attribute currently being tokenized.
    current_attribute_name: Vec<u8>,
    /// Value of the attribute currently being tokenized.
    current_attribute_value: Vec<u8>,
    /// Most recent start-tag name, as reported via `set_last_start_tag`.
    last_start_element: Vec<u8>,
}
/// This is the same as `std::str::from_utf8_unchecked`, but with extra debug
/// assertions for ease of debugging.
///
/// # Safety
///
/// `s` must be valid UTF-8. In this module that holds because the html5gum
/// tokenizer is fed a `&str`, so every byte buffer it hands back contains
/// valid UTF-8 (see the SAFETY comments at the call sites).
unsafe fn from_utf8_unchecked(s: &[u8]) -> &str {
    // Cheap insurance in debug builds; compiles away in release.
    debug_assert!(std::str::from_utf8(s).is_ok());
    std::str::from_utf8_unchecked(s)
}
impl LinkExtractor {
    /// Create a new extractor with all buffers empty.
    pub(crate) const fn new() -> Self {
        LinkExtractor {
            links: Vec::new(),
            current_string: Vec::new(),
            current_element_name: Vec::new(),
            current_element_is_closing: false,
            current_attribute_name: Vec::new(),
            current_attribute_value: Vec::new(),
            last_start_element: Vec::new(),
        }
    }

    /// Extract all semantically known links from a given html attribute.
    ///
    /// Returns `None` when the element/attribute combination is not known to
    /// carry a URL; the caller then falls back to plaintext scanning of the
    /// attribute value.
    #[allow(clippy::unnested_or_patterns)]
    pub(crate) fn extract_urls_from_elem_attr<'a>(
        attr_name: &str,
        elem_name: &str,
        attr_value: &'a str,
    ) -> Option<impl Iterator<Item = &'a str>> {
        // For a comprehensive list of elements that might contain URLs/URIs
        // see https://www.w3.org/TR/REC-html40/index/attributes.html
        // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
        match (elem_name, attr_name) {
            // Common element/attribute combinations for links
            (_, "href" | "src" | "cite" | "usemap")
            // Less common (but still valid!) combinations
            | ("applet", "codebase")
            | ("body", "background")
            | ("button", "formaction")
            | ("command", "icon")
            | ("form", "action")
            | ("frame", "longdesc")
            | ("head", "profile")
            | ("html", "manifest")
            | ("iframe", "longdesc")
            | ("img", "longdesc")
            | ("input", "formaction")
            | ("object", "classid")
            | ("object", "codebase")
            | ("object", "data")
            | ("video", "poster") => {
                Some(vec![attr_value].into_iter())
            }
            (_, "srcset") => {
                // `srcset` is a comma-separated list of "image candidate
                // strings"; each candidate is a URL optionally followed by a
                // whitespace-separated width/density descriptor, so only the
                // first whitespace-delimited token of each candidate is a URL.
                // `split_ascii_whitespace` never yields empty strings, so no
                // explicit empty check is needed.
                let urls: Vec<&str> = attr_value
                    .trim()
                    .split(',')
                    .filter_map(|candidate| candidate.split_ascii_whitespace().next())
                    .collect();
                Some(urls.into_iter())
            }
            _ => None,
        }
    }

    /// Scan any buffered character data for bare URLs and reset the buffer.
    fn flush_current_characters(&mut self) {
        // SAFETY: since we feed html5gum tokenizer with a &str, this must be a &str as well.
        let raw = unsafe { from_utf8_unchecked(&self.current_string) };
        self.links.extend(extract_plaintext(raw));
        self.current_string.clear();
    }

    /// Turn the currently buffered attribute into links — via
    /// [`Self::extract_urls_from_elem_attr`] when the element/attribute pair
    /// is URL-bearing, via plaintext scanning otherwise — then reset the
    /// attribute buffers for the next attribute.
    fn flush_old_attribute(&mut self) {
        {
            // SAFETY: since we feed html5gum tokenizer with a &str, these must be &str as well.
            let name = unsafe { from_utf8_unchecked(&self.current_element_name) };
            let attr = unsafe { from_utf8_unchecked(&self.current_attribute_name) };
            let value = unsafe { from_utf8_unchecked(&self.current_attribute_value) };
            let urls = LinkExtractor::extract_urls_from_elem_attr(attr, name, value);
            let new_urls = match urls {
                None => extract_plaintext(value),
                Some(urls) => urls
                    .into_iter()
                    .map(|url| RawUri {
                        text: url.to_string(),
                        element: Some(name.to_string()),
                        attribute: Some(attr.to_string()),
                    })
                    .collect::<Vec<_>>(),
            };
            self.links.extend(new_urls);
        }
        self.current_attribute_name.clear();
        self.current_attribute_value.clear();
    }
}
// Implementing `Emitter` directly (instead of consuming html5gum's default
// token stream) means no token objects are ever allocated: `Token = ()`,
// `pop_token` always returns `None`, and links are collected purely as a
// side effect of the callbacks below writing into the extractor's buffers.
impl Emitter for &mut LinkExtractor {
    type Token = ();

    // Record the most recent start tag; consumed by
    // `current_is_appropriate_end_tag_token` below.
    fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
        self.last_start_element.clear();
        self.last_start_element
            .extend(last_start_tag.unwrap_or_default());
    }

    // End of input: scan any remaining character data for bare URLs.
    fn emit_eof(&mut self) {
        self.flush_current_characters();
    }

    // Tokenizer errors are irrelevant for link extraction; ignore them.
    fn emit_error(&mut self, _: Error) {}

    // Never produce a token; extraction happens entirely via side effects.
    fn pop_token(&mut self) -> Option<()> {
        None
    }

    // Buffer character data until the next flush.
    fn emit_string(&mut self, c: &[u8]) {
        self.current_string.extend(c);
    }

    // A new tag begins: flush preceding character data and reset tag state.
    fn init_start_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = false;
    }

    fn init_end_tag(&mut self) {
        self.flush_current_characters();
        self.current_element_name.clear();
        self.current_element_is_closing = true;
    }

    fn init_comment(&mut self) {
        self.flush_current_characters();
    }

    // The tag is complete: flush its last buffered attribute (earlier
    // attributes were already flushed by `init_attribute`).
    fn emit_current_tag(&mut self) {
        self.flush_old_attribute();
    }

    fn emit_current_doctype(&mut self) {}

    fn set_self_closing(&mut self) {
        self.current_element_is_closing = true;
    }

    fn set_force_quirks(&mut self) {}

    fn push_tag_name(&mut self, s: &[u8]) {
        self.current_element_name.extend(s);
    }

    // Comments and doctypes cannot carry links we track; discard their data.
    fn push_comment(&mut self, _: &[u8]) {}

    fn push_doctype_name(&mut self, _: &[u8]) {}

    fn init_doctype(&mut self) {
        self.flush_current_characters();
    }

    // A new attribute begins: finish off the previous one first.
    fn init_attribute(&mut self) {
        self.flush_old_attribute();
    }

    fn push_attribute_name(&mut self, s: &[u8]) {
        self.current_attribute_name.extend(s);
    }

    fn push_attribute_value(&mut self, s: &[u8]) {
        self.current_attribute_value.extend(s);
    }

    fn set_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_public_identifier(&mut self, _: &[u8]) {}
    fn push_doctype_system_identifier(&mut self, _: &[u8]) {}

    // True iff the current end tag matches the last start tag seen —
    // presumably queried by the tokenizer for "appropriate end tag" handling
    // of raw-text elements (e.g. `</script>`); confirm against html5gum docs.
    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
        self.current_element_is_closing
            && !self.current_element_name.is_empty()
            && self.current_element_name == self.last_start_element
    }

    fn emit_current_comment(&mut self) {}
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
    let mut extractor = LinkExtractor::new();
    let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
    // Drive the tokenizer to completion. Our emitter's `pop_token` always
    // returns `None`, so the iterator must yield nothing; links accumulate
    // as a side effect inside `extractor`.
    assert!(tokenizer.next().is_none());
    extractor.links
}

View file

@ -1,10 +1,10 @@
use crate::types::{raw_uri::RawUri, FileType, InputContent};
mod html;
mod html5gum;
mod markdown;
mod plaintext;
use html::extract_html;
use markdown::extract_markdown;
use plaintext::extract_plaintext;
@ -19,9 +19,28 @@ impl Extractor {
/// (Markdown, HTML, and plaintext)
#[must_use]
pub fn extract(input_content: &InputContent) -> Vec<RawUri> {
Self::extract_impl(input_content, false)
}
/// Main entrypoint for extracting links from various sources, legacy implementation using
/// html5ever
/// (Markdown, HTML, and plaintext)
#[must_use]
pub fn extract_html5ever(input_content: &InputContent) -> Vec<RawUri> {
Self::extract_impl(input_content, true)
}
#[must_use]
fn extract_impl(input_content: &InputContent, use_html5ever: bool) -> Vec<RawUri> {
match input_content.file_type {
FileType::Markdown => extract_markdown(&input_content.content),
FileType::Html => extract_html(&input_content.content),
FileType::Html => {
if use_html5ever {
html::extract_html(&input_content.content)
} else {
html5gum::extract_html(&input_content.content)
}
}
FileType::Plaintext => extract_plaintext(&input_content.content),
}
}
@ -43,10 +62,19 @@ mod test {
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
let input_content = InputContent::from_string(input, file_type);
Extractor::extract(&input_content)
let uris_html5gum = Extractor::extract(&input_content)
.into_iter()
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
.collect()
.collect();
let uris_html5ever = Extractor::extract_html5ever(&input_content)
.into_iter()
.filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
.collect();
assert_eq!(uris_html5gum, uris_html5ever);
uris_html5gum
}
#[test]
@ -154,19 +182,26 @@ mod test {
content: contents.to_string(),
};
let links = Extractor::extract(input_content);
let urls = links
.into_iter()
.map(|raw_uri| raw_uri.text)
for use_html5ever in [true, false] {
let links = if use_html5ever {
Extractor::extract_html5ever(input_content)
} else {
Extractor::extract(input_content)
};
let urls = links
.into_iter()
.map(|raw_uri| raw_uri.text)
.collect::<HashSet<_>>();
let expected_urls = IntoIterator::into_iter([
String::from("https://github.com/lycheeverse/lychee/"),
String::from("/about"),
])
.collect::<HashSet<_>>();
let expected_urls = IntoIterator::into_iter([
String::from("https://github.com/lycheeverse/lychee/"),
String::from("/about"),
])
.collect::<HashSet<_>>();
assert_eq!(urls, expected_urls);
assert_eq!(urls, expected_urls);
}
}
#[test]
@ -242,4 +277,16 @@ mod test {
assert_eq!(links, expected_links);
}
#[test]
fn test_extract_link_at_end_of_line() {
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let links = extract_uris(input, FileType::Plaintext);
let expected_links =
IntoIterator::into_iter([website("https://www.apache.org/licenses/LICENSE-2.0")])
.collect::<HashSet<Uri>>();
assert_eq!(links, expected_links);
}
}

View file

@ -1,4 +1,3 @@
use html5ever::tendril::StrTendril;
use log::info;
use percent_encoding::percent_decode_str;
use reqwest::Url;
@ -28,7 +27,7 @@ pub(crate) fn create(
.into_iter()
.map(|raw_uri| {
let is_anchor = raw_uri.is_anchor();
let text = StrTendril::from(raw_uri.text.clone());
let text = raw_uri.text.clone();
let element = raw_uri.element.clone();
let attribute = raw_uri.attribute.clone();