Improve srcset parsing (#1160)

Our current `srcset` parsing is pretty basic. We split on comma and then on whitespace and take the first part, which is the image source URL. However, we don't handle URLs containing unencoded commas like </cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg>, which leads to false-positives. According to the spec, commas in strings should be encoded, but in practice, there are some websites which don't do that. To handle these cases, too, I propose to extend the `srcset` parsing to make use of a small "state machine", which detects if a comma is within the image source or outside of it while parsing. This is part of an effort to reduce false-positives during link checking. --------- Co-authored-by: Hugo McNally <45573837+HU90m@users.noreply.github.com>
2026-04-16 19:30:58 +00:00 · 2023-07-29 17:06:44 +02:00 · 2023-07-29 17:06:44 +02:00 · cead4ce826
commit cead4ce826
parent 62c6a056d5
6 changed files with 328 additions and 97 deletions
--- a/lychee-lib/src/extract/html/html5ever.rs
+++ b/lychee-lib/src/extract/html/html5ever.rs
@ -4,7 +4,7 @@ use html5ever::{
    tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
 };

-use super::{is_email_link, is_verbatim_elem, plaintext::extract_plaintext};
+use super::{super::plaintext::extract_plaintext, is_email_link, is_verbatim_elem, srcset};
 use crate::types::uri::raw::RawUri;

 #[derive(Clone, Default)]
@ -157,17 +157,7 @@ impl LinkExtractor {
                Some(vec![attr_value].into_iter())
            }
            (_, "srcset") => {
-                let mut urls = Vec::new();
-                for image_candidate_string in attr_value.trim().split(',') {
-                    for part in image_candidate_string.split_ascii_whitespace() {
-                        if part.is_empty() {
-                            continue;
-                        }
-                        urls.push(part);
-                        break;
-                    }
-                }
-                Some(urls.into_iter())
+                Some(srcset::parse(attr_value).into_iter())
            }
            _ => None,
        }
--- a/lychee-lib/src/extract/html/html5gum.rs
+++ b/lychee-lib/src/extract/html/html5gum.rs
@ -1,8 +1,7 @@
 use html5gum::{Emitter, Error, State, Tokenizer};

-use super::plaintext::extract_plaintext;
-use super::{is_email_link, is_verbatim_elem};
-use crate::types::uri::raw::RawUri;
+use super::{is_email_link, is_verbatim_elem, srcset};
+use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};

 #[derive(Clone)]
 struct LinkExtractor {
@ -75,17 +74,7 @@ impl LinkExtractor {
                Some(vec![attr_value].into_iter())
            }
            (_, "srcset") => {
-                let mut urls = Vec::new();
-                for image_candidate_string in attr_value.trim().split(',') {
-                    for part in image_candidate_string.split_ascii_whitespace() {
-                        if part.is_empty() {
-                            continue;
-                        }
-                        urls.push(part);
-                        break;
-                    }
-                }
-                Some(urls.into_iter())
+                Some(srcset::parse(attr_value).into_iter())
            }
            _ => None,
        }
@ -457,6 +446,7 @@ mod tests {
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }
+
    #[test]
    fn test_exclude_email_without_mailto() {
        let input = r#"<!DOCTYPE html>
@ -470,13 +460,12 @@ mod tests {
          </body>
        </html>"#;

-        let expected = vec![];
        let uris = extract_html(input, false);
-        assert_eq!(uris, expected);
+        assert!(uris.is_empty());
    }

    #[test]
-    fn test_email_false_postive() {
+    fn test_email_false_positive() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
@ -488,7 +477,33 @@ mod tests {
          </body>
        </html>"#;

-        let expected = vec![];
+        let uris = extract_html(input, false);
+        assert!(uris.is_empty());
+    }
+
+    #[test]
+    fn test_extract_srcset() {
+        let input = r#"
+            <img srcset="/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 640w, /cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 750w" src="/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg">
+        "#;
+
+        let expected = vec![RawUri {
+            text: "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
+            element: Some("img".to_string()),
+            attribute: Some("srcset".to_string()),
+        },
+        RawUri {
+            text: "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
+            element: Some("img".to_string()),
+            attribute: Some("srcset".to_string()),
+        },
+        RawUri {
+            text: "/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
+            element: Some("img".to_string()),
+            attribute: Some("src".to_string()),
+        }
+
+        ];
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }
--- a/lychee-lib/src/extract/html/mod.rs
+++ b/lychee-lib/src/extract/html/mod.rs
@ -0,0 +1,71 @@
+pub(crate) mod html5ever;
+pub(crate) mod html5gum;
+mod srcset;
+
+use linkify::{LinkFinder, LinkKind};
+
+/// Check if the given URL is an email link.
+///
+/// This operates on the raw URL strings, not the linkified version because it
+/// gets used in the HTML extractors, which parse the HTML attributes directly
+/// and return the raw strings.
+///
+/// Note that `LinkFinder::links()` is lazy and traverses the input in `O(n)`,
+/// so there should be no big performance penalty for calling this function.
+pub(crate) fn is_email_link(input: &str) -> bool {
+    let mut findings = LinkFinder::new().kinds(&[LinkKind::Email]).links(input);
+    let email = match findings.next() {
+        None => return false,
+        Some(email) => email.as_str(),
+    };
+
+    // Email needs to match the full string.
+    // Strip the "mailto:" prefix if it exists.
+    input.strip_prefix("mailto:").unwrap_or(input) == email
+}
+
+/// Check if the given element is in the list of preformatted ("verbatim") tags.
+///
+/// These will be excluded from link checking by default.
+// Including the <script> tag is debatable, but the alternative is to
+// have a separate list of tags which need a separate config setting and that
+// seems worse.
+pub(crate) fn is_verbatim_elem(name: &str) -> bool {
+    matches!(
+        name,
+        "code"
+            | "kbd"
+            | "listing"
+            | "noscript"
+            | "plaintext"
+            | "pre"
+            | "samp"
+            | "script"
+            | "textarea"
+            | "var"
+            | "xmp"
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_is_email_link() {
+        assert!(is_email_link("mailto:steve@apple.com"));
+        assert!(!is_email_link("mailto:steve@apple.com in a sentence"));
+
+        assert!(is_email_link("foo@example.org"));
+        assert!(!is_email_link("foo@example.org in sentence"));
+        assert!(!is_email_link("https://example.org"));
+    }
+
+    #[test]
+    fn test_verbatim_matching() {
+        assert!(is_verbatim_elem("pre"));
+        assert!(is_verbatim_elem("code"));
+        assert!(is_verbatim_elem("listing"));
+        assert!(is_verbatim_elem("script"));
+    }
+}
--- a/lychee-lib/src/extract/html/srcset.rs
+++ b/lychee-lib/src/extract/html/srcset.rs
@ -0,0 +1,218 @@
+//! Extract all image URLs from a srcset.
+//!
+//! A `srcset` is a string containing a comma-separated list of one or more
+//! image candidate strings to be used when determining which image resource to
+//! present inside an `<img>` element.
+//!
+//! Each image candidate string must begin with a valid URL referencing a
+//! non-interactive graphic resource. This is followed by whitespace and then a
+//! condition descriptor that indicates the circumstances in which the indicated
+//! image should be used. Space characters, other than the whitespace separating
+//! the URL and the corresponding condition descriptor, are ignored; this
+//! includes both leading and trailing space, as well as space before or after
+//! each comma.
+//!
+//! Note: this handles cases where a URL contains a comma, which should be
+//! escaped, but is a valid character in a URL and occurs in the wild.
+//! Note: we cannot assume that commas within URLs are encoded as `%2C`, as they
+//! should be according to RFC 3986.
+//! Thus, the parsing process becomes significantly more complex and we need to
+//! use a state machine to keep track of the current state.
+
+use log::info;
+
+enum State {
+    InsideDescriptor,
+    AfterDescriptor,
+    InsideParens,
+}
+
+/// Split an input string at the first character for which
+/// the predicate returns false.
+fn split_at<F>(input: &str, predicate: F) -> (&str, &str)
+where
+    F: Fn(&char) -> bool,
+{
+    for (i, ch) in input.char_indices() {
+        if !predicate(&ch) {
+            return input.split_at(i);
+        }
+    }
+    (input, "")
+}
+
+/// Parse a srcset string into a list of URLs.
+//
+// This state-machine is a bit convoluted, but we keep everything in one place
+// for simplicity so we have to please clippy.
+pub(crate) fn parse(input: &str) -> Vec<&str> {
+    let mut candidates: Vec<&str> = Vec::new();
+    let mut index = 0;
+
+    while index < input.len() {
+        let position = &input[index..];
+        let (start, remaining) = split_at(position, |c| *c == ',' || c.is_whitespace());
+
+        if start.find(',').is_some() {
+            info!("srcset parse Error");
+            return vec![];
+        }
+        index += start.chars().count();
+
+        if remaining.is_empty() {
+            return candidates;
+        }
+
+        let (url, remaining) = split_at(remaining, |c| !c.is_whitespace());
+        index += url.chars().count();
+
+        let comma_count = url.chars().rev().take_while(|c| *c == ',').count();
+
+        if let Some(url) = url.get(..url.len() - comma_count) {
+            candidates.push(url);
+        }
+
+        if comma_count > 1 {
+            info!("srcset parse error (trailing commas)");
+            return vec![];
+        }
+
+        index += 1;
+
+        let (space, remaining) = split_at(remaining, |c| c.is_whitespace());
+        index += space.len();
+
+        index = skip_descriptor(index, remaining);
+    }
+
+    candidates
+}
+
+/// Helper function to skip over a descriptor.
+/// Returns the index of the next character after the descriptor
+/// (i.e. pointing at the comma or the end of the string)
+fn skip_descriptor(mut index: usize, remaining: &str) -> usize {
+    let mut state = State::InsideDescriptor;
+
+    for c in remaining.chars() {
+        index += 1;
+
+        match state {
+            State::InsideDescriptor => match c {
+                ' ' => state = State::AfterDescriptor,
+                '(' => state = State::InsideParens,
+                ',' => return index,
+                _ => {}
+            },
+            State::InsideParens => {
+                if c == ')' {
+                    state = State::InsideDescriptor;
+                }
+            }
+            State::AfterDescriptor => {
+                if c != ' ' {
+                    state = State::InsideDescriptor;
+                }
+            }
+        }
+    }
+
+    index
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_collect_sequence_characters_with_empty_string() {
+        let (sequence, remainder) = split_at("", |c| c.is_alphabetic());
+        assert_eq!(sequence, "");
+        assert_eq!(remainder, "");
+    }
+
+    #[test]
+    fn test_collect_sequence_characters_with_alphabetic_predicate() {
+        let (sequence, remainder) = split_at("abc123", |c| c.is_alphabetic());
+        assert_eq!(sequence, "abc");
+        assert_eq!(remainder, "123");
+    }
+
+    #[test]
+    fn test_collect_sequence_characters_with_digit_predicate() {
+        let (sequence, remainder) = split_at("123abc", char::is_ascii_digit);
+        assert_eq!(sequence, "123");
+        assert_eq!(remainder, "abc");
+    }
+
+    #[test]
+    fn test_collect_sequence_characters_with_no_match() {
+        let (sequence, remainder) = split_at("123abc", |c| c.is_whitespace());
+        assert_eq!(sequence, "");
+        assert_eq!(remainder, "123abc");
+    }
+
+    #[test]
+    fn test_collect_sequence_characters_with_all_match() {
+        let (sequence, remainder) = split_at("123abc", |c| !c.is_whitespace());
+        assert_eq!(sequence, "123abc");
+        assert_eq!(remainder, "");
+    }
+
+    #[test]
+    fn test_parse_no_value() {
+        assert!(parse("").is_empty());
+    }
+
+    #[test]
+    fn test_parse_url_one_value() {
+        let candidates = vec!["test-img-320w.jpg".to_string()];
+        assert_eq!(parse("test-img-320w.jpg 320w"), candidates);
+    }
+
+    #[test]
+    fn test_parse_srcset_two_values() {
+        assert_eq!(
+            parse("test-img-320w.jpg 320w, test-img-480w.jpg 480w"),
+            vec![
+                "test-img-320w.jpg".to_string(),
+                "test-img-480w.jpg".to_string(),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_srcset_with_unencoded_comma() {
+        assert_eq!(
+            parse(
+                "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 640w, /cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 750w"
+            ),
+            vec![
+                "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
+                "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_srcset_url() {
+        assert_eq!(
+            parse("https://example.com/image1.jpg 1x, https://example.com/image2.jpg 2x"),
+            vec![
+                "https://example.com/image1.jpg",
+                "https://example.com/image2.jpg"
+            ]
+        );
+    }
+
+    #[test]
+    fn test_parse_srcset_with_commas() {
+        assert_eq!(
+            parse("/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 640w, /cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 750w"),
+            vec![
+                "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg",
+                "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg"
+            ]
+        );
+    }
+}
--- a/lychee-lib/src/extract/markdown.rs
+++ b/lychee-lib/src/extract/markdown.rs
@ -2,7 +2,7 @@ use pulldown_cmark::{Event, Parser, Tag};

 use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};

-use super::html5gum::extract_html;
+use super::html::html5gum::extract_html;

 /// Extract unparsed URL strings from a Markdown string.
 pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUri> {
--- a/lychee-lib/src/extract/mod.rs
+++ b/lychee-lib/src/extract/mod.rs
@ -1,57 +1,12 @@
 use crate::types::{uri::raw::RawUri, FileType, InputContent};

-mod html5ever;
-mod html5gum;
+mod html;
 mod markdown;
 mod plaintext;

-use linkify::{LinkFinder, LinkKind};
 use markdown::extract_markdown;
 use plaintext::extract_plaintext;

-/// Check if the given URL is an email link.
-///
-/// This operates on the raw URL strings, not the linkified version because it
-/// gets used in the HTML extractors, which parse the HTML attributes directly
-/// and return the raw strings.
-///
-/// Note that `LinkFinder::links()` is lazy and traverses the input in `O(n)`,
-/// so there should be no big performance penalty for calling this function.
-pub(crate) fn is_email_link(input: &str) -> bool {
-    let mut findings = LinkFinder::new().kinds(&[LinkKind::Email]).links(input);
-    let email = match findings.next() {
-        None => return false,
-        Some(email) => email.as_str(),
-    };
-
-    // Email needs to match the full string.
-    // Strip the "mailto:" prefix if it exists.
-    input.strip_prefix("mailto:").unwrap_or(input) == email
-}
-
-/// Check if the given element is in the list of preformatted ("verbatim") tags.
-///
-/// These will be excluded from link checking by default.
-// Including the <script> tag is debatable, but the alternative is to
-// have a separate list of tags which need a separate config setting and that
-// seems worse.
-pub(crate) fn is_verbatim_elem(name: &str) -> bool {
-    matches!(
-        name,
-        "code"
-            | "kbd"
-            | "listing"
-            | "noscript"
-            | "plaintext"
-            | "pre"
-            | "samp"
-            | "script"
-            | "textarea"
-            | "var"
-            | "xmp"
-    )
-}
-
 /// A handler for extracting links from various input formats like Markdown and
 /// HTML. Allocations should be avoided if possible as this is a
 /// performance-critical section of the library.
@ -90,9 +45,9 @@ impl Extractor {
            FileType::Markdown => extract_markdown(&input_content.content, self.include_verbatim),
            FileType::Html => {
                if self.use_html5ever {
-                    html5ever::extract_html(&input_content.content, self.include_verbatim)
+                    html::html5ever::extract_html(&input_content.content, self.include_verbatim)
                } else {
-                    html5gum::extract_html(&input_content.content, self.include_verbatim)
+                    html::html5gum::extract_html(&input_content.content, self.include_verbatim)
                }
            }
            FileType::Plaintext => extract_plaintext(&input_content.content),
@ -134,14 +89,6 @@ mod tests {
        uris_html5gum
    }

-    #[test]
-    fn test_verbatim_matching() {
-        assert!(is_verbatim_elem("pre"));
-        assert!(is_verbatim_elem("code"));
-        assert!(is_verbatim_elem("listing"));
-        assert!(is_verbatim_elem("script"));
-    }
-
    #[test]
    fn verbatim_elem() {
        let input = r#"
@ -362,14 +309,4 @@ mod tests {

        assert_eq!(links, expected_links);
    }
-
-    #[test]
-    fn test_is_email_link() {
-        assert!(is_email_link("mailto:steve@apple.com"));
-        assert!(!is_email_link("mailto:steve@apple.com in a sentence"));
-
-        assert!(is_email_link("foo@example.org"));
-        assert!(!is_email_link("foo@example.org in sentence"));
-        assert!(!is_email_link("https://example.org"));
-    }
 }