From d3ed133f10841db5b4e1ed90696f9b2d7456bdb9 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Tue, 16 Nov 2021 22:58:10 +0100 Subject: [PATCH] Remove srcset attribute from list of "link" attrs (#393) * Remove srcset attribute from list of "link" attrs Fix #390 * Add test for srcset * Add note about srcSet links * add real support for srcset Co-authored-by: Matthias --- lychee-lib/src/extract.rs | 38 ++++++++++++++++++++++++++++++++--- lychee-lib/src/helpers/url.rs | 35 ++++++++++++++++++++++++++------ 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index af307c0..f5849e3 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -131,10 +131,16 @@ impl Extractor { .. } => { for attr in attrs.borrow().iter() { - if url::elem_attr_is_link(attr.name.local.as_ref(), name.local.as_ref()) { - self.urls.push(attr.value.clone()); - } else { + let urls = url::extract_links_from_elem_attr( + attr.name.local.as_ref(), + name.local.as_ref(), + attr.value.as_ref(), + ); + + if urls.is_empty() { self.extract_plaintext(&attr.value); + } else { + self.urls.extend(urls.into_iter().map(StrTendril::from)); } } } @@ -310,6 +316,32 @@ mod test { assert_eq!(links, expected_links); } + #[test] + fn test_extract_html_srcset() { + let links = extract_uris( + r#" + <img + src="/static/image.png" + srcset=" + /static/image300.png 300w, + /static/image600.png 600w, + " + /> + "#, + FileType::Html, + Some("https://example.com/"), + ); + + let expected_links = array::IntoIter::new([ + website("https://example.com/static/image.png"), + website("https://example.com/static/image300.png"), + website("https://example.com/static/image600.png"), + ]) + .collect::<HashSet<Uri>>(); + + assert_eq!(links, expected_links); + } + #[test] fn test_skip_markdown_anchors() { let links = extract_uris("This is [a test](#lol).", FileType::Markdown, None); diff --git a/lychee-lib/src/helpers/url.rs b/lychee-lib/src/helpers/url.rs index 6a81d2c..b641b84 100644 --- a/lychee-lib/src/helpers/url.rs +++ b/lychee-lib/src/helpers/url.rs @@ -18,14
+18,37 @@ pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str { path } -/// Determine if an element's attribute contains a link / URL. -pub(crate) fn elem_attr_is_link(attr_name: &str, elem_name: &str) -> bool { +/// Extract all semantically-known links from a given html attribute. Pattern-based extraction from +/// unstructured plaintext is done elsewhere. +pub(crate) fn extract_links_from_elem_attr( + attr_name: &str, + elem_name: &str, + attr_value: &str, +) -> Vec<String> { // See a comprehensive list of attributes that might contain URLs/URIs // over at: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes - matches!( - (attr_name, elem_name), - ("href" | "src" | "srcset" | "cite", _) | ("data", "object") | ("onhashchange", "body") - ) + let mut urls = Vec::new(); + + match (attr_name, elem_name) { + ("href" | "src" | "cite", _) | ("data", "object") | ("onhashchange", "body") => { + urls.push(attr_value.to_owned()); + } + ("srcset", _) => { + for image_candidate_string in attr_value.trim().split(',') { + for part in image_candidate_string.split_ascii_whitespace() { + if part.is_empty() { + continue; + } + + urls.push(part.to_owned()); + break; + } + } + } + _ => (), + } + + urls } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs