diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py
index cfbe5ab3..d8b586aa 100644
--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -18,6 +18,8 @@ Find link tags in HTML text.
"""
import re
+
+from .srcsetparse import parse_srcset
from .. import strformat, log, LOG_CHECK, url as urlutil
unquote = strformat.unquote
@@ -200,19 +202,9 @@ class LinkFinder:
elif attr == 'archive':
for url in value.split(','):
self.found_url(url, name, base, lineno, column)
- elif attr == 'srcset' and not value.startswith('data:'):
- for img_candidate in value.split(','):
- try:
- url = img_candidate.split()[0]
- except IndexError:
- log.debug(
- LOG_CHECK,
- _("trailing comma in line: "
- "%(line)s srcset attribute: %(value)s")
- % {"line": lineno, "value": value}
- )
- else:
- self.found_url(url, name, base, lineno, column)
+ elif attr == 'srcset':
+ for url in parse_srcset(value):
+ self.found_url(url, name, base, lineno, column)
else:
self.found_url(value, name, base, lineno, column)
diff --git a/linkcheck/htmlutil/srcsetparse.py b/linkcheck/htmlutil/srcsetparse.py
new file mode 100644
index 00000000..9bbff15a
--- /dev/null
+++ b/linkcheck/htmlutil/srcsetparse.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2022 Stefan Fisk
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+srcset attribute parser
+"""
+
+
+_TAB = '\u0009'  # character tabulation
+_LF = '\u000A'  # line feed
+_FF = '\u000C'  # form feed
+_CR = '\u000D'  # carriage return
+_SPACE = '\u0020'
+_COMMA = '\u002C'
+_LEFT_PARENTHESIS = '\u0028'
+_RIGHT_PARENTHESIS = '\u0029'
+
+# Character classes that parse_srcset tests each code point against.
+_WHITESPACE = {_TAB, _LF, _FF, _CR, _SPACE}
+_WHITESPACE_OR_COMMA = _WHITESPACE | {_COMMA}
+
+
+def parse_srcset(input):
+ """
+ Return the URLs found in an HTML srcset attribute value, in input order.
+
+ Based on WhatWG HTML standard § 4.8.4.3.10 Parsing a srcset attribute,
+ but does not parse or validate descriptors; they are skipped unread.
+ ``input`` is the attribute text; returns a list of URL substrings.
+ https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute
+ """
+
+ input_end = len(input)
+ position = 0
+ urls = []
+
+ while position < input_end:
+ # 4. Splitting loop: Collect a sequence of code points that are ASCII
+ # whitespace or U+002C COMMA characters from input given position.
+ while position < input_end and input[position] in _WHITESPACE_OR_COMMA:
+ position += 1
+
+ # 5. If position is past the end of input, return candidates.
+ if position >= input_end:
+ return urls
+
+ # 6. Collect a sequence of code points that are not ASCII
+ # whitespace from input given position, and let that be url.
+ url_start = position
+ while position < input_end and input[position] not in _WHITESPACE:
+ position += 1
+ url_end = position
+ # url is non-empty: the scan started on a non-whitespace, non-comma char
+ # 8. If url ends with U+002C (,), then:
+ if input[url_end - 1] == _COMMA:
+ # Remove all trailing U+002C COMMA characters from url.
+ while url_end > url_start and input[url_end - 1] == _COMMA:
+ url_end -= 1
+ else:
+ # Shortened version of sub-steps 1–4: skip the descriptors up to
+ # the next comma, ignoring any comma inside parentheses
+ while position < input_end:
+ if input[position] == _LEFT_PARENTHESIS:
+ # Skip to the first closing parenthesis (or the end of input)
+ while (position < input_end and input[position] !=
+ _RIGHT_PARENTHESIS):
+ position += 1
+ elif input[position] == _COMMA:
+ break
+ # Step over this character; after "(...)" that is the ")"
+ position += 1
+
+ # Steps 9-15 parse and validate the descriptors, which we ignore
+
+ # If we found a URL
+ if url_end > url_start:
+ urls.append(input[url_start:url_end])
+
+ return urls
diff --git a/tests/test_srcsetparse.py b/tests/test_srcsetparse.py
new file mode 100644
index 00000000..e1f56720
--- /dev/null
+++ b/tests/test_srcsetparse.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2022 Stefan Fisk
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+Test srcset attribute parsing.
+"""
+
+import unittest
+from linkcheck.htmlutil.srcsetparse import parse_srcset
+
+from parameterized import parameterized
+
+
+# list of tuples
+# (<srcset attribute value>, <expected list of URLs>)
+parsetests = [
+ ('', []),
+ (' ', []),
+ (',', []),
+ ('\t\n ,,\t\n,, \t\n', []),  # only whitespace and commas: no URL
+ ('foo', ['foo']),
+ ('foo,bar, ,foo, bar', ['foo,bar', 'foo', 'bar']),  # inner comma stays in URL
+ ('https://example.com/1 foo, https://example.com/2 bar',
+ ['https://example.com/1', 'https://example.com/2']),
+ (' foo ', ['foo']),
+ (',,,foo,,,', ['foo']),  # leading and trailing commas are dropped
+ (',foo,bar,baz,', ['foo,bar,baz']),
+ ('foo bar baz', ['foo']),  # text after the URL is skipped as descriptors
+ ('foo, bar baz', ['foo', 'bar']),
+ ('foo/1 bar, foo/2', ['foo/1', 'foo/2']),
+ ('foo/1 (foo/2)', ['foo/1']),  # parenthesised descriptor text is skipped
+ ('foo/1 (((, foo/2', ['foo/1']),  # unclosed "(" swallows the rest of the input
+]
+
+
+class TestSrcsetParsing(unittest.TestCase):  # table-driven parse_srcset checks
+ @parameterized.expand(parsetests)  # presumably one test per tuple - confirm
+ def test_parse(self, _in, _urls):
+ self.assertEqual(parse_srcset(_in), _urls)  # exact list, same order