mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-16 11:20:59 +00:00
Fix bugs in extractor; reduce allocs (#464)
When URLs couldn't be extracted from a tag, we ran a plaintext search, but never added the newly found urls to the vec of extracted urls. Also tried to make the code a little more idiomatic
This commit is contained in:
parent
6e757fa20e
commit
5802ae912c
1 changed files with 66 additions and 67 deletions
|
|
@ -18,7 +18,7 @@ impl TokenSink for LinkExtractor {
|
|||
#[allow(clippy::match_same_arms)]
|
||||
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
|
||||
match token {
|
||||
Token::CharacterTokens(raw) => self.links.append(&mut extract_plaintext(&raw)),
|
||||
Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
|
||||
Token::TagToken(tag) => {
|
||||
let Tag {
|
||||
kind: _kind,
|
||||
|
|
@ -28,25 +28,24 @@ impl TokenSink for LinkExtractor {
|
|||
} = tag;
|
||||
|
||||
for attr in attrs {
|
||||
let urls = extract_urls_from_elem_attr(
|
||||
let urls = LinkExtractor::extract_urls_from_elem_attr(
|
||||
attr.name.local.as_ref(),
|
||||
name.as_ref(),
|
||||
attr.value.as_ref(),
|
||||
);
|
||||
|
||||
if urls.is_empty() {
|
||||
extract_plaintext(&attr.value);
|
||||
} else {
|
||||
self.links.extend(
|
||||
urls.into_iter()
|
||||
.map(|url| RawUri {
|
||||
text: url,
|
||||
element: Some(name.to_string()),
|
||||
attribute: Some(attr.name.local.to_string()),
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
}
|
||||
let new_urls = match urls {
|
||||
None => extract_plaintext(&attr.value),
|
||||
Some(urls) => urls
|
||||
.into_iter()
|
||||
.map(|url| RawUri {
|
||||
text: url.to_string(),
|
||||
element: Some(name.to_string()),
|
||||
attribute: Some(attr.name.local.to_string()),
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
};
|
||||
self.links.extend(new_urls);
|
||||
}
|
||||
}
|
||||
Token::ParseError(_err) => {
|
||||
|
|
@ -61,66 +60,66 @@ impl TokenSink for LinkExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
/// Extract all semantically known links from a given html attribute.
|
||||
#[allow(clippy::unnested_or_patterns)]
|
||||
pub(crate) fn extract_urls_from_elem_attr(
|
||||
attr_name: &str,
|
||||
elem_name: &str,
|
||||
attr_value: &str,
|
||||
) -> Vec<String> {
|
||||
let mut urls = Vec::new();
|
||||
|
||||
// For a comprehensive list of elements that might contain URLs/URIs
|
||||
// see https://www.w3.org/TR/REC-html40/index/attributes.html
|
||||
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
|
||||
match (elem_name, attr_name) {
|
||||
// Common element/attribute combinations for links
|
||||
(_, "href" | "src" | "cite" | "usemap")
|
||||
// Less common (but still valid!) combinations
|
||||
| ("applet", "codebase")
|
||||
| ("body", "background")
|
||||
| ("button", "formaction")
|
||||
| ("command", "icon")
|
||||
| ("form", "action")
|
||||
| ("frame", "longdesc")
|
||||
| ("head", "profile")
|
||||
| ("html", "manifest")
|
||||
| ("iframe", "longdesc")
|
||||
| ("img", "longdesc")
|
||||
| ("input", "formaction")
|
||||
| ("object", "classid")
|
||||
| ("object", "codebase")
|
||||
| ("object", "data")
|
||||
| ("video", "poster") => {
|
||||
urls.push(attr_value.to_owned());
|
||||
}
|
||||
(_, "srcset") => {
|
||||
for image_candidate_string in attr_value.trim().split(',') {
|
||||
for part in image_candidate_string.split_ascii_whitespace() {
|
||||
if part.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
urls.push(part.to_owned());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
impl LinkExtractor {
|
||||
pub(crate) const fn new() -> Self {
|
||||
Self { links: Vec::new() }
|
||||
}
|
||||
|
||||
/// Extract all semantically known links from a given html attribute.
|
||||
#[allow(clippy::unnested_or_patterns)]
|
||||
pub(crate) fn extract_urls_from_elem_attr<'a>(
|
||||
attr_name: &str,
|
||||
elem_name: &str,
|
||||
attr_value: &'a str,
|
||||
) -> Option<impl Iterator<Item = &'a str>> {
|
||||
// For a comprehensive list of elements that might contain URLs/URIs
|
||||
// see https://www.w3.org/TR/REC-html40/index/attributes.html
|
||||
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
|
||||
match (elem_name, attr_name) {
|
||||
// Common element/attribute combinations for links
|
||||
(_, "href" | "src" | "cite" | "usemap")
|
||||
// Less common (but still valid!) combinations
|
||||
| ("applet", "codebase")
|
||||
| ("body", "background")
|
||||
| ("button", "formaction")
|
||||
| ("command", "icon")
|
||||
| ("form", "action")
|
||||
| ("frame", "longdesc")
|
||||
| ("head", "profile")
|
||||
| ("html", "manifest")
|
||||
| ("iframe", "longdesc")
|
||||
| ("img", "longdesc")
|
||||
| ("input", "formaction")
|
||||
| ("object", "classid")
|
||||
| ("object", "codebase")
|
||||
| ("object", "data")
|
||||
| ("video", "poster") => {
|
||||
Some(vec![attr_value].into_iter())
|
||||
}
|
||||
(_, "srcset") => {
|
||||
let mut urls = Vec::new();
|
||||
for image_candidate_string in attr_value.trim().split(',') {
|
||||
for part in image_candidate_string.split_ascii_whitespace() {
|
||||
if part.is_empty() {
|
||||
continue;
|
||||
}
|
||||
urls.push(part);
|
||||
break;
|
||||
}
|
||||
}
|
||||
Some(urls.into_iter())
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
urls
|
||||
}
|
||||
|
||||
/// Extract unparsed URL strings from an HTML string.
|
||||
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
|
||||
let mut tokenizer = Tokenizer::new(
|
||||
LinkExtractor { links: Vec::new() },
|
||||
TokenizerOpts::default(),
|
||||
);
|
||||
|
||||
let mut input = BufferQueue::new();
|
||||
input.push_back(StrTendril::from(buf));
|
||||
|
||||
let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
|
||||
let _handle = tokenizer.feed(&mut input);
|
||||
tokenizer.end();
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue