Remove srcset attribute from list of "link" attrs (#393)

* Remove srcset attribute from list of "link" attrs

Fix #390

* Add test for srcset

* Add note about srcSet links

* Add real support for srcset

Co-authored-by: Matthias <matthias-endler@gmx.net>
This commit is contained in:
Markus Unterwaditzer 2021-11-16 22:58:10 +01:00 committed by GitHub
parent 893dfff453
commit d3ed133f10
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 64 additions and 9 deletions

View file

@ -131,10 +131,16 @@ impl Extractor {
..
} => {
for attr in attrs.borrow().iter() {
if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
self.urls.push(attr.value.clone());
} else {
let urls = url::extract_links_from_elem_attr(
attr.name.local.as_ref(),
name.local.as_ref(),
attr.value.as_ref(),
);
if urls.is_empty() {
self.extract_plaintext(&attr.value);
} else {
self.urls.extend(urls.into_iter().map(StrTendril::from));
}
}
}
@ -310,6 +316,32 @@ mod test {
assert_eq!(links, expected_links);
}
/// An `<img>` with both `src` and a multi-candidate `srcset` must yield the
/// `src` URL plus the URL of every `srcset` candidate (width descriptors such
/// as `300w` are not part of the link).
#[test]
fn test_extract_html_srcset() {
    let html = r#"
<img
src="/static/image.png"
srcset="
/static/image300.png 300w,
/static/image600.png 600w,
"
/>
"#;

    // The expected links are absolute, i.e. the relative paths above joined
    // onto the `https://example.com/` value passed as the third argument.
    let expected_links: HashSet<Uri> = array::IntoIter::new([
        website("https://example.com/static/image.png"),
        website("https://example.com/static/image300.png"),
        website("https://example.com/static/image600.png"),
    ])
    .collect();

    let links = extract_uris(html, FileType::Html, Some("https://example.com/"));

    assert_eq!(links, expected_links);
}
#[test]
fn test_skip_markdown_anchors() {
let links = extract_uris("This is [a test](#lol).", FileType::Markdown, None);

View file

@ -18,14 +18,37 @@ pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
path
}
/// Determine if an element's attribute contains a link / URL.
pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
/// Extract all semantically-known links from a given HTML attribute.
///
/// Pattern-based extraction from unstructured plaintext is done elsewhere.
///
/// * `attr_name` - local name of the attribute (e.g. `href`, `srcset`).
/// * `elem_name` - local name of the element that carries the attribute.
/// * `attr_value` - the raw attribute value.
///
/// Returns every URL found in the attribute value. The result is empty when
/// the attribute/element combination is not known to contain links, or when a
/// `srcset` value holds no image candidates.
pub(crate) fn extract_links_from_elem_attr(
    attr_name: &str,
    elem_name: &str,
    attr_value: &str,
) -> Vec<String> {
    // See a comprehensive list of attributes that might contain URLs/URIs
    // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
    match (attr_name, elem_name) {
        // These attributes hold exactly one URL: the whole value.
        ("href" | "src" | "cite", _) | ("data", "object") | ("onhashchange", "body") => {
            vec![attr_value.to_owned()]
        }
        // `srcset` holds a comma-separated list of image candidates.
        ("srcset", _) => extract_srcset_urls(attr_value),
        _ => Vec::new(),
    }
}

/// Collect the URL of every image candidate in a `srcset` attribute value.
///
/// A URL is a maximal run of non-whitespace characters, so it may itself
/// contain commas (e.g. `data:image/png;base64,...`). Splitting the whole
/// value on `,` would therefore tear such URLs apart. Only commas that
/// terminate a candidate act as separators.
fn extract_srcset_urls(srcset: &str) -> Vec<String> {
    let mut urls = Vec::new();
    let mut rest = srcset;

    loop {
        // Skip the whitespace and separator commas in front of a candidate.
        rest = rest.trim_start_matches(|c: char| c == ',' || c.is_ascii_whitespace());
        if rest.is_empty() {
            return urls;
        }

        // The URL runs up to the next whitespace (or the end of the value).
        let url_end = rest
            .find(|c: char| c.is_ascii_whitespace())
            .unwrap_or(rest.len());
        let (candidate, tail) = rest.split_at(url_end);

        // Commas glued to the end of the URL terminate the candidate and are
        // not part of the link. `candidate` starts with a character that is
        // neither a comma nor whitespace, so `url` is never empty.
        let url = candidate.trim_end_matches(',');
        urls.push(url.to_owned());

        rest = if url.len() < candidate.len() {
            // A trailing comma already ended this candidate: no descriptors.
            tail
        } else {
            // Skip the descriptors (`300w`, `2x`, ...) up to the next comma
            // that is not enclosed in parentheses.
            let mut in_parens = false;
            let descriptors_end = tail
                .find(|c: char| {
                    match c {
                        '(' => in_parens = true,
                        ')' => in_parens = false,
                        ',' if !in_parens => return true,
                        _ => {}
                    }
                    false
                })
                .unwrap_or(tail.len());
            tail.split_at(descriptors_end).1
        };
    }
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs