mirror of
https://github.com/Hopiu/lychee.git
synced 2026-04-10 16:30:58 +00:00
Remove srcset attribute from list of "link" attrs (#393)
* Remove srcset attribute from list of "link" attrs Fix #390 * Add test for srcset * Add note about srcSet links * add real support for srcset Co-authored-by: Matthias <matthias-endler@gmx.net>
This commit is contained in:
parent
893dfff453
commit
d3ed133f10
2 changed files with 64 additions and 9 deletions
|
|
@ -131,10 +131,16 @@ impl Extractor {
|
|||
..
|
||||
} => {
|
||||
for attr in attrs.borrow().iter() {
|
||||
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
|
||||
self.urls.push(attr.value.clone());
|
||||
} else {
|
||||
let urls = url::extract_links_from_elem_attr(
|
||||
attr.name.local.as_ref(),
|
||||
name.local.as_ref(),
|
||||
attr.value.as_ref(),
|
||||
);
|
||||
|
||||
if urls.is_empty() {
|
||||
self.extract_plaintext(&attr.value);
|
||||
} else {
|
||||
self.urls.extend(urls.into_iter().map(StrTendril::from));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -310,6 +316,32 @@ mod test {
|
|||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
/// Extracts links from an `<img>` tag that carries both a `src` and a multi-candidate `srcset`
/// attribute, using `https://example.com/` as the base URL.
///
/// Expects the `src` URL and the URL of each of the two `srcset` candidates to be reported as
/// absolute `https://example.com/static/...` links; the width descriptors (`300w`, `600w`) are
/// not part of the expected links.
#[test]
|
||||
fn test_extract_html_srcset() {
|
||||
let links = extract_uris(
|
||||
r#"
|
||||
<img
|
||||
src="/static/image.png"
|
||||
srcset="
|
||||
/static/image300.png 300w,
|
||||
/static/image600.png 600w,
|
||||
"
|
||||
/>
|
||||
"#,
|
||||
FileType::Html,
|
||||
Some("https://example.com/"),
|
||||
);
|
||||
|
||||
// One entry for the `src` value plus one per `srcset` candidate, collected into a set so the
// comparison does not depend on extraction order.
let expected_links = array::IntoIter::new([
|
||||
website("https://example.com/static/image.png"),
|
||||
website("https://example.com/static/image300.png"),
|
||||
website("https://example.com/static/image600.png"),
|
||||
])
|
||||
.collect::<HashSet<Uri>>();
|
||||
|
||||
assert_eq!(links, expected_links);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_markdown_anchors() {
|
||||
let links = extract_uris("This is [a test](#lol).", FileType::Markdown, None);
|
||||
|
|
|
|||
|
|
@ -18,14 +18,37 @@ pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
|
|||
path
|
||||
}
|
||||
|
||||
/// Determine if an element's attribute contains a link / URL.
|
||||
pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
|
||||
/// Extract all semantically-known links from a given HTML attribute.
///
/// Pattern-based extraction from unstructured plaintext is done elsewhere.
///
/// # Arguments
///
/// * `attr_name` - local name of the attribute (e.g. `href`, `srcset`).
/// * `elem_name` - local name of the element that carries the attribute.
/// * `attr_value` - raw, unparsed value of the attribute.
///
/// # Returns
///
/// The URLs found in the attribute, in the order they appear. The vector is empty when the
/// attribute/element combination is not one of the known link carriers, or when a `srcset`
/// value contains no image candidates.
pub(crate) fn extract_links_from_elem_attr(
    attr_name: &str,
    elem_name: &str,
    attr_value: &str,
) -> Vec<String> {
    // See a comprehensive list of attributes that might contain URLs/URIs
    // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
    match (attr_name, elem_name) {
        // These attributes hold exactly one URL: the whole value is the link.
        ("href" | "src" | "cite", _) | ("data", "object") | ("onhashchange", "body") => {
            vec![attr_value.to_owned()]
        }
        // `srcset` holds a comma-separated list of "URL [descriptor]" image candidates.
        ("srcset", _) => extract_srcset_urls(attr_value),
        _ => Vec::new(),
    }
}

/// Collect the URL of every image candidate in a `srcset` attribute value.
///
/// Follows the tokenization of the WHATWG "parse a srcset attribute" algorithm: a URL is a run
/// of non-whitespace characters, so commas *inside* a URL are preserved. A comma only separates
/// candidates when it trails the URL token or appears in the descriptor list (outside
/// parentheses). Descriptors (`300w`, `2x`, ...) are skipped, not validated.
fn extract_srcset_urls(srcset: &str) -> Vec<String> {
    let mut urls = Vec::new();
    let mut rest = srcset;

    loop {
        // Skip separators left over from the previous candidate and any leading whitespace.
        rest = rest.trim_start_matches(|c: char| c == ',' || c.is_ascii_whitespace());
        if rest.is_empty() {
            break;
        }

        // The URL token extends up to the next ASCII whitespace (or the end of the value).
        let url_end = rest
            .find(|c: char| c.is_ascii_whitespace())
            .unwrap_or(rest.len());
        let (raw_url, tail) = rest.split_at(url_end);
        let url = raw_url.trim_end_matches(',');

        rest = if url.len() < raw_url.len() {
            // Trailing comma(s): the candidate has no descriptors and ends right here.
            tail
        } else {
            // Skip the descriptor list: everything up to the next comma outside parentheses.
            let mut depth = 0_usize;
            let mut end = tail.len();
            for (idx, c) in tail.char_indices() {
                match c {
                    '(' => depth += 1,
                    ')' => depth = depth.saturating_sub(1),
                    ',' if depth == 0 => {
                        end = idx;
                        break;
                    }
                    _ => {}
                }
            }
            &tail[end..]
        };

        // `url` cannot be empty: `rest` started with a character that is neither a comma nor
        // whitespace, and only trailing commas were removed.
        urls.push(url.to_owned());
    }

    urls
}
|
||||
|
||||
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
|
||||
|
|
|
|||
Loading…
Reference in a new issue