Fix bugs in extractor; reduce allocs (#464)

When URLs couldn't be extracted from a tag,
we ran a plaintext search, but never added the
newly found URLs to the vec of extracted URLs.

Also tried to make the code a little more idiomatic
This commit is contained in:
Matthias 2022-01-16 02:13:38 +01:00 committed by GitHub
parent 6e757fa20e
commit 5802ae912c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -18,7 +18,7 @@ impl TokenSink for LinkExtractor {
#[allow(clippy::match_same_arms)]
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
Token::CharacterTokens(raw) => self.links.append(&mut extract_plaintext(&raw)),
Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
Token::TagToken(tag) => {
let Tag {
kind: _kind,
@ -28,25 +28,24 @@ impl TokenSink for LinkExtractor {
} = tag;
for attr in attrs {
let urls = extract_urls_from_elem_attr(
let urls = LinkExtractor::extract_urls_from_elem_attr(
attr.name.local.as_ref(),
name.as_ref(),
attr.value.as_ref(),
);
if urls.is_empty() {
extract_plaintext(&attr.value);
} else {
self.links.extend(
urls.into_iter()
.map(|url| RawUri {
text: url,
element: Some(name.to_string()),
attribute: Some(attr.name.local.to_string()),
})
.collect::<Vec<_>>(),
);
}
let new_urls = match urls {
None => extract_plaintext(&attr.value),
Some(urls) => urls
.into_iter()
.map(|url| RawUri {
text: url.to_string(),
element: Some(name.to_string()),
attribute: Some(attr.name.local.to_string()),
})
.collect::<Vec<_>>(),
};
self.links.extend(new_urls);
}
}
Token::ParseError(_err) => {
@ -61,66 +60,66 @@ impl TokenSink for LinkExtractor {
}
}
/// Extract all semantically known links from a given html attribute.
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr(
attr_name: &str,
elem_name: &str,
attr_value: &str,
) -> Vec<String> {
let mut urls = Vec::new();
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
// Less common (but still valid!) combinations
| ("applet", "codebase")
| ("body", "background")
| ("button", "formaction")
| ("command", "icon")
| ("form", "action")
| ("frame", "longdesc")
| ("head", "profile")
| ("html", "manifest")
| ("iframe", "longdesc")
| ("img", "longdesc")
| ("input", "formaction")
| ("object", "classid")
| ("object", "codebase")
| ("object", "data")
| ("video", "poster") => {
urls.push(attr_value.to_owned());
}
(_, "srcset") => {
for image_candidate_string in attr_value.trim().split(',') {
for part in image_candidate_string.split_ascii_whitespace() {
if part.is_empty() {
continue;
}
urls.push(part.to_owned());
break;
}
}
}
_ => (),
impl LinkExtractor {
/// Creates a `LinkExtractor` with an empty list of collected links.
///
/// `const` so it can be used in constant contexts; the backing
/// `Vec::new()` allocates nothing until the first link is pushed.
pub(crate) const fn new() -> Self {
Self { links: Vec::new() }
}
/// Extract all semantically known links from a given HTML attribute.
///
/// Returns `Some` with an iterator over the URL strings found in
/// `attr_value` when the element/attribute combination is known to
/// carry URLs, and `None` otherwise so the caller can fall back to a
/// plaintext scan of the attribute value.
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr<'a>(
    attr_name: &str,
    elem_name: &str,
    attr_value: &'a str,
) -> Option<impl Iterator<Item = &'a str>> {
    let urls: Vec<&'a str> = match (elem_name, attr_name) {
        // Attributes whose whole value is a single URL: either valid on
        // any element (href/src/cite/usemap) or on one specific element.
        (_, "href" | "src" | "cite" | "usemap")
        | ("applet", "codebase")
        | ("body", "background")
        | ("button", "formaction")
        | ("command", "icon")
        | ("form", "action")
        | ("frame", "longdesc")
        | ("head", "profile")
        | ("html", "manifest")
        | ("iframe", "longdesc")
        | ("img", "longdesc")
        | ("input", "formaction")
        | ("object", "classid")
        | ("object", "codebase")
        | ("object", "data")
        | ("video", "poster") => vec![attr_value],
        // `srcset` is a comma-separated list of image candidate strings;
        // the URL is the first whitespace-delimited token of each
        // candidate (the rest is a width/density descriptor).
        (_, "srcset") => attr_value
            .trim()
            .split(',')
            .filter_map(|candidate| candidate.split_ascii_whitespace().next())
            .collect(),
        // Not a URL-bearing element/attribute combination.
        _ => return None,
    };
    Some(urls.into_iter())
}
urls
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
let mut tokenizer = Tokenizer::new(
LinkExtractor { links: Vec::new() },
TokenizerOpts::default(),
);
let mut input = BufferQueue::new();
input.push_back(StrTendril::from(buf));
let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
let _handle = tokenizer.feed(&mut input);
tokenizer.end();