diff --git a/lychee-lib/src/extract/html.rs b/lychee-lib/src/extract/html.rs
index 48f8730..bb0b19e 100644
--- a/lychee-lib/src/extract/html.rs
+++ b/lychee-lib/src/extract/html.rs
@@ -18,7 +18,7 @@ impl TokenSink for LinkExtractor {
#[allow(clippy::match_same_arms)]
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
- Token::CharacterTokens(raw) => self.links.append(&mut extract_plaintext(&raw)),
+ Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
Token::TagToken(tag) => {
let Tag {
kind: _kind,
@@ -28,25 +28,24 @@ impl TokenSink for LinkExtractor {
} = tag;
for attr in attrs {
- let urls = extract_urls_from_elem_attr(
+ let urls = LinkExtractor::extract_urls_from_elem_attr(
attr.name.local.as_ref(),
name.as_ref(),
attr.value.as_ref(),
);
- if urls.is_empty() {
- extract_plaintext(&attr.value);
- } else {
- self.links.extend(
- urls.into_iter()
- .map(|url| RawUri {
- text: url,
- element: Some(name.to_string()),
- attribute: Some(attr.name.local.to_string()),
- })
- .collect::<Vec<_>>(),
- );
- }
+ let new_urls = match urls {
+ None => extract_plaintext(&attr.value),
+ Some(urls) => urls
+ .into_iter()
+ .map(|url| RawUri {
+ text: url.to_string(),
+ element: Some(name.to_string()),
+ attribute: Some(attr.name.local.to_string()),
+ })
+ .collect::<Vec<_>>(),
+ };
+ self.links.extend(new_urls);
}
}
Token::ParseError(_err) => {
@@ -61,66 +60,66 @@ impl TokenSink for LinkExtractor {
}
}
-/// Extract all semantically known links from a given html attribute.
-#[allow(clippy::unnested_or_patterns)]
-pub(crate) fn extract_urls_from_elem_attr(
- attr_name: &str,
- elem_name: &str,
- attr_value: &str,
-) -> Vec<String> {
- let mut urls = Vec::new();
-
- // For a comprehensive list of elements that might contain URLs/URIs
- // see https://www.w3.org/TR/REC-html40/index/attributes.html
- // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
- match (elem_name, attr_name) {
- // Common element/attribute combinations for links
- (_, "href" | "src" | "cite" | "usemap")
- // Less common (but still valid!) combinations
- | ("applet", "codebase")
- | ("body", "background")
- | ("button", "formaction")
- | ("command", "icon")
- | ("form", "action")
- | ("frame", "longdesc")
- | ("head", "profile")
- | ("html", "manifest")
- | ("iframe", "longdesc")
- | ("img", "longdesc")
- | ("input", "formaction")
- | ("object", "classid")
- | ("object", "codebase")
- | ("object", "data")
- | ("video", "poster") => {
- urls.push(attr_value.to_owned());
- }
- (_, "srcset") => {
- for image_candidate_string in attr_value.trim().split(',') {
- for part in image_candidate_string.split_ascii_whitespace() {
- if part.is_empty() {
- continue;
- }
-
- urls.push(part.to_owned());
- break;
- }
- }
- }
- _ => (),
+impl LinkExtractor {
+ pub(crate) const fn new() -> Self {
+ Self { links: Vec::new() }
+ }
+
+ /// Extract all semantically known links from a given html attribute.
+ #[allow(clippy::unnested_or_patterns)]
+ pub(crate) fn extract_urls_from_elem_attr<'a>(
+ attr_name: &str,
+ elem_name: &str,
+ attr_value: &'a str,
+ ) -> Option<impl Iterator<Item = &'a str>> {
+ // For a comprehensive list of elements that might contain URLs/URIs
+ // see https://www.w3.org/TR/REC-html40/index/attributes.html
+ // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
+ match (elem_name, attr_name) {
+ // Common element/attribute combinations for links
+ (_, "href" | "src" | "cite" | "usemap")
+ // Less common (but still valid!) combinations
+ | ("applet", "codebase")
+ | ("body", "background")
+ | ("button", "formaction")
+ | ("command", "icon")
+ | ("form", "action")
+ | ("frame", "longdesc")
+ | ("head", "profile")
+ | ("html", "manifest")
+ | ("iframe", "longdesc")
+ | ("img", "longdesc")
+ | ("input", "formaction")
+ | ("object", "classid")
+ | ("object", "codebase")
+ | ("object", "data")
+ | ("video", "poster") => {
+ Some(vec![attr_value].into_iter())
+ }
+ (_, "srcset") => {
+ let mut urls = Vec::new();
+ for image_candidate_string in attr_value.trim().split(',') {
+ for part in image_candidate_string.split_ascii_whitespace() {
+ if part.is_empty() {
+ continue;
+ }
+ urls.push(part);
+ break;
+ }
+ }
+ Some(urls.into_iter())
+ }
+ _ => None,
+ }
}
- urls
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
- let mut tokenizer = Tokenizer::new(
- LinkExtractor { links: Vec::new() },
- TokenizerOpts::default(),
- );
-
let mut input = BufferQueue::new();
input.push_back(StrTendril::from(buf));
+ let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
let _handle = tokenizer.feed(&mut input);
tokenizer.end();