diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs
index af307c0..f5849e3 100644
--- a/lychee-lib/src/extract.rs
+++ b/lychee-lib/src/extract.rs
@@ -131,10 +131,16 @@ impl Extractor {
..
} => {
for attr in attrs.borrow().iter() {
- if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) {
- self.urls.push(attr.value.clone());
- } else {
+ let urls = url::extract_links_from_elem_attr(
+ attr.name.local.as_ref(),
+ name.local.as_ref(),
+ attr.value.as_ref(),
+ );
+
+ if urls.is_empty() {
self.extract_plaintext(&attr.value);
+ } else {
+ self.urls.extend(urls.into_iter().map(StrTendril::from));
}
}
}
@@ -310,6 +316,32 @@ mod test {
assert_eq!(links, expected_links);
}
+    // `srcset` candidates must each contribute their URL (descriptor dropped).
+    // NOTE(review): fixture + `collect` type reconstructed from context; confirm.
+    #[test]
+    fn test_extract_html_srcset() {
+        let links = extract_uris(
+            r#"
+            <img
+                src="/static/image.png"
+                srcset="/static/image300.png 300w,
+                        /static/image600.png 600w"
+            />
+            "#,
+            FileType::Html,
+            Some("https://example.com/"),
+        );
+
+        let expected_links = array::IntoIter::new([
+            website("https://example.com/static/image.png"),
+            website("https://example.com/static/image300.png"),
+            website("https://example.com/static/image600.png"),
+        ])
+        .collect::<HashSet<Uri>>();
+
+        assert_eq!(links, expected_links);
+    }
+
#[test]
fn test_skip_markdown_anchors() {
let links = extract_uris("This is [a test](#lol).", FileType::Markdown, None);
diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs
index 6a81d2c..b641b84 100644
--- a/lychee-lib/src/helpers/url.rs
+++ b/lychee-lib/src/helpers/url.rs
@@ -18,14 +18,37 @@ pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str {
path
}
-/// Determine if an element's attribute contains a link / URL.
-pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool {
+/// Extract all semantically-known links from a given HTML attribute.
+///
+/// Returns every URL that `attr_value` is known to carry for the given
+/// attribute/element pair, or an empty `Vec` when the attribute is not a
+/// known link carrier. Pattern-based extraction from unstructured plaintext
+/// is done elsewhere.
+///
+/// `srcset` is split on `,` into image candidate strings, and only the first
+/// whitespace-separated token of each candidate (the URL, without its
+/// width/density descriptor) is returned.
+pub(crate) fn extract_links_from_elem_attr(
+    attr_name: &str,
+    elem_name: &str,
+    attr_value: &str,
+) -> Vec<String> {
// See a comprehensive list of attributes that might contain URLs/URIs
// over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
- matches!(
- (attr_name, elem_name),
- ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body")
- )
+    match (attr_name, elem_name) {
+        ("href" | "src" | "cite", _) | ("data", "object") | ("onhashchange", "body") => {
+            vec![attr_value.to_owned()]
+        }
+        ("srcset", _) => attr_value
+            .trim()
+            .split(',')
+            // `split_ascii_whitespace` never yields empty tokens, so `next()` is
+            // `None` exactly for blank candidates (e.g. after a trailing comma).
+            .filter_map(|image_candidate| image_candidate.split_ascii_whitespace().next())
+            .map(str::to_owned)
+            .collect(),
+        _ => Vec::new(),
+    }
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs