From 5802ae912c701451fab8efee0a17fc76bb6e6ac3 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Sun, 16 Jan 2022 02:13:38 +0100
Subject: [PATCH] Fix bugs in extractor; reduce allocs (#464)

When URLs couldn't be extracted from a tag, we ran a plaintext search,
but never added the newly found urls to the vec of extracted urls.

Also tried to make the code a little more idiomatic
---
 lychee-lib/src/extract/html.rs | 133 ++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 67 deletions(-)

diff --git a/lychee-lib/src/extract/html.rs b/lychee-lib/src/extract/html.rs
index 48f8730..bb0b19e 100644
--- a/lychee-lib/src/extract/html.rs
+++ b/lychee-lib/src/extract/html.rs
@@ -18,7 +18,7 @@ impl TokenSink for LinkExtractor {
     #[allow(clippy::match_same_arms)]
     fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
         match token {
-            Token::CharacterTokens(raw) => self.links.append(&mut extract_plaintext(&raw)),
+            Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
             Token::TagToken(tag) => {
                 let Tag {
                     kind: _kind,
@@ -28,25 +28,24 @@
                 } = tag;
 
                 for attr in attrs {
-                    let urls = extract_urls_from_elem_attr(
+                    let urls = LinkExtractor::extract_urls_from_elem_attr(
                         attr.name.local.as_ref(),
                         name.as_ref(),
                         attr.value.as_ref(),
                     );
 
-                    if urls.is_empty() {
-                        extract_plaintext(&attr.value);
-                    } else {
-                        self.links.extend(
-                            urls.into_iter()
-                                .map(|url| RawUri {
-                                    text: url,
-                                    element: Some(name.to_string()),
-                                    attribute: Some(attr.name.local.to_string()),
-                                })
-                                .collect::<Vec<_>>(),
-                        );
-                    }
+                    let new_urls = match urls {
+                        None => extract_plaintext(&attr.value),
+                        Some(urls) => urls
+                            .into_iter()
+                            .map(|url| RawUri {
+                                text: url.to_string(),
+                                element: Some(name.to_string()),
+                                attribute: Some(attr.name.local.to_string()),
+                            })
+                            .collect::<Vec<_>>(),
+                    };
+                    self.links.extend(new_urls);
                 }
             }
             Token::ParseError(_err) => {
@@ -61,66 +60,66 @@
     }
 }
 
-/// Extract all semantically known links from a given html attribute.
-#[allow(clippy::unnested_or_patterns)]
-pub(crate) fn extract_urls_from_elem_attr(
-    attr_name: &str,
-    elem_name: &str,
-    attr_value: &str,
-) -> Vec<String> {
-    let mut urls = Vec::new();
-
-    // For a comprehensive list of elements that might contain URLs/URIs
-    // see https://www.w3.org/TR/REC-html40/index/attributes.html
-    // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
-    match (elem_name, attr_name) {
-        // Common element/attribute combinations for links
-        (_, "href" | "src" | "cite" | "usemap")
-        // Less common (but still valid!) combinations
-        | ("applet", "codebase")
-        | ("body", "background")
-        | ("button", "formaction")
-        | ("command", "icon")
-        | ("form", "action")
-        | ("frame", "longdesc")
-        | ("head", "profile")
-        | ("html", "manifest")
-        | ("iframe", "longdesc")
-        | ("img", "longdesc")
-        | ("input", "formaction")
-        | ("object", "classid")
-        | ("object", "codebase")
-        | ("object", "data")
-        | ("video", "poster") => {
-            urls.push(attr_value.to_owned());
-        }
-        (_, "srcset") => {
-            for image_candidate_string in attr_value.trim().split(',') {
-                for part in image_candidate_string.split_ascii_whitespace() {
-                    if part.is_empty() {
-                        continue;
-                    }
-
-                    urls.push(part.to_owned());
-                    break;
-                }
-            }
-        }
-        _ => (),
+impl LinkExtractor {
+    pub(crate) const fn new() -> Self {
+        Self { links: Vec::new() }
+    }
+
+    /// Extract all semantically known links from a given html attribute.
+    #[allow(clippy::unnested_or_patterns)]
+    pub(crate) fn extract_urls_from_elem_attr<'a>(
+        attr_name: &str,
+        elem_name: &str,
+        attr_value: &'a str,
+    ) -> Option<impl Iterator<Item = &'a str>> {
+        // For a comprehensive list of elements that might contain URLs/URIs
+        // see https://www.w3.org/TR/REC-html40/index/attributes.html
+        // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
+        match (elem_name, attr_name) {
+            // Common element/attribute combinations for links
+            (_, "href" | "src" | "cite" | "usemap")
+            // Less common (but still valid!) combinations
+            | ("applet", "codebase")
+            | ("body", "background")
+            | ("button", "formaction")
+            | ("command", "icon")
+            | ("form", "action")
+            | ("frame", "longdesc")
+            | ("head", "profile")
+            | ("html", "manifest")
+            | ("iframe", "longdesc")
+            | ("img", "longdesc")
+            | ("input", "formaction")
+            | ("object", "classid")
+            | ("object", "codebase")
+            | ("object", "data")
+            | ("video", "poster") => {
+                Some(vec![attr_value].into_iter())
+            }
+            (_, "srcset") => {
+                let mut urls = Vec::new();
+                for image_candidate_string in attr_value.trim().split(',') {
+                    for part in image_candidate_string.split_ascii_whitespace() {
+                        if part.is_empty() {
+                            continue;
+                        }
+                        urls.push(part);
+                        break;
+                    }
+                }
+                Some(urls.into_iter())
+            }
+            _ => None,
+        }
     }
-    urls
 }
 
 /// Extract unparsed URL strings from an HTML string.
 pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
-    let mut tokenizer = Tokenizer::new(
-        LinkExtractor { links: Vec::new() },
-        TokenizerOpts::default(),
-    );
-
     let mut input = BufferQueue::new();
     input.push_back(StrTendril::from(buf));
 
+    let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
     let _handle = tokenizer.feed(&mut input);
     tokenizer.end();