diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index cfbe5ab3..d8b586aa 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -18,6 +18,8 @@ Find link tags in HTML text. """ import re + +from .srcsetparse import parse_srcset from .. import strformat, log, LOG_CHECK, url as urlutil unquote = strformat.unquote @@ -200,19 +202,9 @@ class LinkFinder: elif attr == 'archive': for url in value.split(','): self.found_url(url, name, base, lineno, column) - elif attr == 'srcset' and not value.startswith('data:'): - for img_candidate in value.split(','): - try: - url = img_candidate.split()[0] - except IndexError: - log.debug( - LOG_CHECK, - _("trailing comma in line: " - "%(line)s srcset attribute: %(value)s") - % {"line": lineno, "value": value} - ) - else: - self.found_url(url, name, base, lineno, column) + elif attr == 'srcset': + for url in parse_srcset(value): + self.found_url(url, name, base, lineno, column) else: self.found_url(value, name, base, lineno, column) diff --git a/linkcheck/htmlutil/srcsetparse.py b/linkcheck/htmlutil/srcsetparse.py new file mode 100644 index 00000000..9bbff15a --- /dev/null +++ b/linkcheck/htmlutil/srcsetparse.py @@ -0,0 +1,91 @@ +# Copyright (C) 2022 Stefan Fisk +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
"""
srcset attribute parser
"""


_TAB = '\u0009'
_LF = '\u000A'
_FF = '\u000C'
_CR = '\u000D'
_SPACE = '\u0020'
_COMMA = '\u002C'
_LEFT_PARENTHESIS = '\u0028'
_RIGHT_PARENTHESIS = '\u0029'


# "ASCII whitespace" as used by the srcset parsing algorithm.
_WHITESPACE = {_TAB, _LF, _FF, _CR, _SPACE}
_WHITESPACE_OR_COMMA = _WHITESPACE | {_COMMA}


def _skip_descriptors(text, position):
    """
    Return the index where the descriptors of one image candidate end.

    Scanning starts at ``position`` and stops on the first U+002C COMMA
    that is not enclosed in parentheses; that index is returned so the
    caller's splitting loop consumes the comma. If no such comma exists,
    ``len(text)`` is returned.
    """
    end = len(text)
    while position < end:
        char = text[position]
        if char == _COMMA:
            break
        if char == _LEFT_PARENTHESIS:
            # Commas and whitespace inside parentheses belong to the
            # descriptor: jump to the first closing parenthesis.
            closing = text.find(_RIGHT_PARENTHESIS, position)
            if closing < 0:
                # Unbalanced parenthesis: the rest of the text is
                # descriptor content.
                return end
            position = closing
        position += 1
    return position


def parse_srcset(input):
    """
    Parse the value of an HTML srcset attribute and return its URLs.

    Based on WHATWG HTML standard § 4.8.4.3.10 "Parsing a srcset
    attribute", but descriptors are only skipped, never parsed or
    validated.

    https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute

    @param input: the attribute value; None or an empty string yields
        an empty list
    @return: list of URL strings in order of appearance
    """
    # The parameter name shadows the builtin but is kept for callers
    # passing it by keyword.
    if not input:
        return []

    input_end = len(input)
    position = 0
    urls = []

    while True:
        # 4. Splitting loop: skip ASCII whitespace and U+002C COMMA
        # characters.
        while position < input_end and input[position] in _WHITESPACE_OR_COMMA:
            position += 1

        # 5. If position is past the end of input, return candidates.
        if position >= input_end:
            return urls

        # 6. Collect a sequence of code points that are not ASCII
        # whitespace; that is the URL. It is never empty and never
        # starts with a comma, because the loop above stopped on a
        # character that is neither whitespace nor a comma.
        url_start = position
        while position < input_end and input[position] not in _WHITESPACE:
            position += 1
        url = input[url_start:position]

        # 8. If url ends with U+002C (,), remove all trailing commas; such
        # a candidate has no descriptors. Otherwise skip the descriptors.
        if url.endswith(_COMMA):
            url = url.rstrip(_COMMA)
        else:
            position = _skip_descriptors(input, position)

        # 9-15 is parsing and validation of the descriptors, which we
        # ignore.
        urls.append(url)
"""
Test srcset attribute parsing.
"""

import unittest

from parameterized import parameterized

from linkcheck.htmlutil.srcsetparse import parse_srcset


# Each entry is a tuple of
# (srcset attribute value, list of URLs expected from parse_srcset).
parsetests = [
    # Nothing but separators: no URLs.
    ('', []),
    (' ', []),
    (',', []),
    ('\t\n ,,\t\n,, \t\n', []),
    # Bare URLs, with and without surrounding separators.
    ('foo', ['foo']),
    ('foo,bar, ,foo, bar', ['foo,bar', 'foo', 'bar']),
    (
        'https://example.com/1 foo, https://example.com/2 bar',
        ['https://example.com/1', 'https://example.com/2'],
    ),
    (' foo ', ['foo']),
    (',,,foo,,,', ['foo']),
    (',foo,bar,baz,', ['foo,bar,baz']),
    # Descriptors after the URL are skipped.
    ('foo bar baz', ['foo']),
    ('foo, bar baz', ['foo', 'bar']),
    ('foo/1 bar, foo/2', ['foo/1', 'foo/2']),
    # Parenthesised descriptor content, balanced and unbalanced.
    ('foo/1 (foo/2)', ['foo/1']),
    ('foo/1 (((, foo/2', ['foo/1']),
]


class TestSrcsetParsing(unittest.TestCase):
    """Run parse_srcset over every entry of ``parsetests``."""

    @parameterized.expand(parsetests)
    def test_parse(self, srcset, expected_urls):
        """The URLs parsed from ``srcset`` equal ``expected_urls``."""
        parsed = parse_srcset(srcset)
        self.assertEqual(parsed, expected_urls)