Fix bugs in extractor; reduce allocs (#464)

When URLs couldn't be extracted from a tag,
we ran a plaintext search, but never added the
newly found URLs to the vec of extracted URLs.

Also tried to make the code a little more idiomatic
This commit is contained in:
Matthias 2022-01-16 02:13:38 +01:00 committed by GitHub
parent 6e757fa20e
commit 5802ae912c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -18,7 +18,7 @@ impl TokenSink for LinkExtractor {
#[allow(clippy::match_same_arms)]
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
Token::CharacterTokens(raw) => self.links.append(&mut extract_plaintext(&raw)),
Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
Token::TagToken(tag) => {
let Tag {
kind: _kind,
@ -28,25 +28,24 @@ impl TokenSink for LinkExtractor {
} = tag;
for attr in attrs {
let urls = extract_urls_from_elem_attr(
let urls = LinkExtractor::extract_urls_from_elem_attr(
attr.name.local.as_ref(),
name.as_ref(),
attr.value.as_ref(),
);
if urls.is_empty() {
extract_plaintext(&attr.value);
} else {
self.links.extend(
urls.into_iter()
.map(|url| RawUri {
text: url,
element: Some(name.to_string()),
attribute: Some(attr.name.local.to_string()),
})
.collect::<Vec<_>>(),
);
}
let new_urls = match urls {
None => extract_plaintext(&attr.value),
Some(urls) => urls
.into_iter()
.map(|url| RawUri {
text: url.to_string(),
element: Some(name.to_string()),
attribute: Some(attr.name.local.to_string()),
})
.collect::<Vec<_>>(),
};
self.links.extend(new_urls);
}
}
Token::ParseError(_err) => {
@ -61,66 +60,66 @@ impl TokenSink for LinkExtractor {
}
}
/// Extract all semantically known links from a given html attribute.
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr(
attr_name: &str,
elem_name: &str,
attr_value: &str,
) -> Vec<String> {
let mut urls = Vec::new();
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
// Less common (but still valid!) combinations
| ("applet", "codebase")
| ("body", "background")
| ("button", "formaction")
| ("command", "icon")
| ("form", "action")
| ("frame", "longdesc")
| ("head", "profile")
| ("html", "manifest")
| ("iframe", "longdesc")
| ("img", "longdesc")
| ("input", "formaction")
| ("object", "classid")
| ("object", "codebase")
| ("object", "data")
| ("video", "poster") => {
urls.push(attr_value.to_owned());
}
(_, "srcset") => {
for image_candidate_string in attr_value.trim().split(',') {
for part in image_candidate_string.split_ascii_whitespace() {
if part.is_empty() {
continue;
}
urls.push(part.to_owned());
break;
}
}
}
_ => (),
impl LinkExtractor {
/// Creates a `LinkExtractor` with an empty list of collected links.
///
/// `const` so it can be used in constant contexts; the backing
/// `Vec::new()` allocates nothing until the first link is pushed.
pub(crate) const fn new() -> Self {
Self { links: Vec::new() }
}
/// Extract all semantically known links from a given HTML attribute.
///
/// Returns `Some` with an iterator over the URL strings found in
/// `attr_value` when the element/attribute combination is known to
/// carry URLs, and `None` otherwise so the caller can fall back to a
/// plaintext scan of the attribute value.
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr<'a>(
    attr_name: &str,
    elem_name: &str,
    attr_value: &'a str,
) -> Option<impl Iterator<Item = &'a str>> {
    let urls: Vec<&'a str> = match (elem_name, attr_name) {
        // Attributes whose whole value is a single URL: either valid on
        // any element (href/src/cite/usemap) or on one specific element.
        (_, "href" | "src" | "cite" | "usemap")
        | ("applet", "codebase")
        | ("body", "background")
        | ("button", "formaction")
        | ("command", "icon")
        | ("form", "action")
        | ("frame", "longdesc")
        | ("head", "profile")
        | ("html", "manifest")
        | ("iframe", "longdesc")
        | ("img", "longdesc")
        | ("input", "formaction")
        | ("object", "classid")
        | ("object", "codebase")
        | ("object", "data")
        | ("video", "poster") => vec![attr_value],
        // `srcset` is a comma-separated list of image candidate strings;
        // the URL is the first whitespace-delimited token of each
        // candidate (the rest is a width/density descriptor).
        (_, "srcset") => attr_value
            .trim()
            .split(',')
            .filter_map(|candidate| candidate.split_ascii_whitespace().next())
            .collect(),
        // Not a URL-bearing element/attribute combination.
        _ => return None,
    };
    Some(urls.into_iter())
}
urls
}
/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
let mut tokenizer = Tokenizer::new(
LinkExtractor { links: Vec::new() },
TokenizerOpts::default(),
);
let mut input = BufferQueue::new();
input.push_back(StrTendril::from(buf));
let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
let _handle = tokenizer.feed(&mut input);
tokenizer.end();