From 5802ae912c701451fab8efee0a17fc76bb6e6ac3 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Sun, 16 Jan 2022 02:13:38 +0100
Subject: [PATCH] Fix bugs in extractor; reduce allocs (#464)

When URLs couldn't be extracted from a tag, we ran a plaintext search,
but never added the newly found urls to the vec of extracted urls.

Also tried to make the code a little more idiomatic
---
 lychee-lib/src/extract/html.rs | 133 ++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 67 deletions(-)

diff --git a/lychee-lib/src/extract/html.rs b/lychee-lib/src/extract/html.rs
index 48f8730..bb0b19e 100644
--- a/lychee-lib/src/extract/html.rs
+++ b/lychee-lib/src/extract/html.rs
@@ -18,7 +18,7 @@ impl TokenSink for LinkExtractor {
     #[allow(clippy::match_same_arms)]
     fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
         match token {
-            Token::CharacterTokens(raw) => self.links.append(&mut extract_plaintext(&raw)),
+            Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
             Token::TagToken(tag) => {
                 let Tag {
                     kind: _kind,
@@ -28,25 +28,24 @@
                 } = tag;
 
                 for attr in attrs {
-                    let urls = extract_urls_from_elem_attr(
+                    let urls = LinkExtractor::extract_urls_from_elem_attr(
                         attr.name.local.as_ref(),
                         name.as_ref(),
                         attr.value.as_ref(),
                     );
 
-                    if urls.is_empty() {
-                        extract_plaintext(&attr.value);
-                    } else {
-                        self.links.extend(
-                            urls.into_iter()
-                                .map(|url| RawUri {
-                                    text: url,
-                                    element: Some(name.to_string()),
-                                    attribute: Some(attr.name.local.to_string()),
-                                })
-                                .collect::<Vec<_>>(),
-                        );
-                    }
+                    let new_urls = match urls {
+                        None => extract_plaintext(&attr.value),
+                        Some(urls) => urls
+                            .into_iter()
+                            .map(|url| RawUri {
+                                text: url.to_string(),
+                                element: Some(name.to_string()),
+                                attribute: Some(attr.name.local.to_string()),
+                            })
+                            .collect::<Vec<_>>(),
+                    };
+                    self.links.extend(new_urls);
                 }
             }
             Token::ParseError(_err) => {
@@ -61,66 +60,66 @@
     }
 }
 
-/// Extract all semantically known links from a given html attribute.
-#[allow(clippy::unnested_or_patterns)]
-pub(crate) fn extract_urls_from_elem_attr(
-    attr_name: &str,
-    elem_name: &str,
-    attr_value: &str,
-) -> Vec<String> {
-    let mut urls = Vec::new();
-
-    // For a comprehensive list of elements that might contain URLs/URIs
-    // see https://www.w3.org/TR/REC-html40/index/attributes.html
-    // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
-    match (elem_name, attr_name) {
-        // Common element/attribute combinations for links
-        (_, "href" | "src" | "cite" | "usemap")
-        // Less common (but still valid!) combinations
-        | ("applet", "codebase")
-        | ("body", "background")
-        | ("button", "formaction")
-        | ("command", "icon")
-        | ("form", "action")
-        | ("frame", "longdesc")
-        | ("head", "profile")
-        | ("html", "manifest")
-        | ("iframe", "longdesc")
-        | ("img", "longdesc")
-        | ("input", "formaction")
-        | ("object", "classid")
-        | ("object", "codebase")
-        | ("object", "data")
-        | ("video", "poster") => {
-            urls.push(attr_value.to_owned());
-        }
-        (_, "srcset") => {
-            for image_candidate_string in attr_value.trim().split(',') {
-                for part in image_candidate_string.split_ascii_whitespace() {
-                    if part.is_empty() {
-                        continue;
-                    }
-
-                    urls.push(part.to_owned());
-                    break;
-                }
-            }
-        }
-        _ => (),
+impl LinkExtractor {
+    pub(crate) const fn new() -> Self {
+        Self { links: Vec::new() }
+    }
+
+    /// Extract all semantically known links from a given html attribute.
+    #[allow(clippy::unnested_or_patterns)]
+    pub(crate) fn extract_urls_from_elem_attr<'a>(
+        attr_name: &str,
+        elem_name: &str,
+        attr_value: &'a str,
+    ) -> Option<impl Iterator<Item = &'a str>> {
+        // For a comprehensive list of elements that might contain URLs/URIs
+        // see https://www.w3.org/TR/REC-html40/index/attributes.html
+        // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
+        match (elem_name, attr_name) {
+            // Common element/attribute combinations for links
+            (_, "href" | "src" | "cite" | "usemap")
+            // Less common (but still valid!) combinations
+            | ("applet", "codebase")
+            | ("body", "background")
+            | ("button", "formaction")
+            | ("command", "icon")
+            | ("form", "action")
+            | ("frame", "longdesc")
+            | ("head", "profile")
+            | ("html", "manifest")
+            | ("iframe", "longdesc")
+            | ("img", "longdesc")
+            | ("input", "formaction")
+            | ("object", "classid")
+            | ("object", "codebase")
+            | ("object", "data")
+            | ("video", "poster") => {
+                Some(vec![attr_value].into_iter())
+            }
+            (_, "srcset") => {
+                let mut urls = Vec::new();
+                for image_candidate_string in attr_value.trim().split(',') {
+                    for part in image_candidate_string.split_ascii_whitespace() {
+                        if part.is_empty() {
+                            continue;
+                        }
+                        urls.push(part);
+                        break;
+                    }
+                }
+                Some(urls.into_iter())
+            }
+            _ => None,
+        }
     }
-    urls
 }
 
 /// Extract unparsed URL strings from an HTML string.
 pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
-    let mut tokenizer = Tokenizer::new(
-        LinkExtractor { links: Vec::new() },
-        TokenizerOpts::default(),
-    );
-
     let mut input = BufferQueue::new();
     input.push_back(StrTendril::from(buf));
 
+    let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
     let _handle = tokenizer.feed(&mut input);
     tokenizer.end();