diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst index abc57458..25d649e9 100644 --- a/doc/src/man/linkcheckerrc.rst +++ b/doc/src/man/linkcheckerrc.rst @@ -188,6 +188,25 @@ URL checking results **warnings=**\ [**0**\ \|\ **1**] If set log warnings. Default is to log warnings. Command line option: :option:`--no-warnings` +**ignoreerrors=**\ *URL_REGEX* [*MESSAGE_REGEX*] (`MULTILINE`_) + Specify regular expressions to ignore errors for matching URLs, one + per line. A second regular expression can be specified per line to + only ignore matching error messages per corresponding URL. If the + second expression is omitted, all errors are ignored. In contrast + to filtering_, this happens *after* checking, which allows checking + URLs despite certain expected and tolerable errors. Default is to + not ignore any errors. + + :: + + [output] + + ignoreerrors= + + ^https://deprecated\.example\.com ^410 Gone + + # ignore all errors (no second expression), also for syntax check: + ^mailto:.*@example\.com$ Progress updates """""""""""""""" diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 4bfba2ab..e7264dcb 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -216,6 +216,7 @@ class UrlBase: % {"url": base_url}, tag=WARN_URL_WHITESPACE, ) + self.ignore_errors = self.aggregate.config['ignoreerrors'] def reset(self): """ @@ -270,6 +271,8 @@ class UrlBase: self.content_type = "" # URLs seen through redirections self.aliases = [] + # (URL regex, message regex) pairs describing errors to ignore + self.ignore_errors = [] def set_result(self, msg, valid=True, overwrite=False): """ @@ -289,6 +292,17 @@ class UrlBase: log.warn(LOG_CHECK, "Empty result for %s", self) self.result = msg self.valid = valid + + if not self.valid: + for url_regex, msg_regex in self.ignore_errors: + if not url_regex.search(self.url): + continue + if not msg_regex.search(self.result): + continue + self.valid = True + self.result = f"Ignored: 
{self.result}" + break + + # free content data self.data = None diff --git a/linkcheck/configuration/__init__.py b/linkcheck/configuration/__init__.py index 229fca3d..9892da0d 100644 --- a/linkcheck/configuration/__init__.py +++ b/linkcheck/configuration/__init__.py @@ -165,6 +165,7 @@ class Configuration(dict): self["loginextrafields"] = {} # filtering self["externlinks"] = [] + self["ignoreerrors"] = [] self["ignorewarnings"] = [] self["internlinks"] = [] self["checkextern"] = False diff --git a/linkcheck/configuration/confparse.py b/linkcheck/configuration/confparse.py index 3207e3f0..da772326 100644 --- a/linkcheck/configuration/confparse.py +++ b/linkcheck/configuration/confparse.py @@ -16,6 +16,7 @@ """Parse configuration files""" from configparser import RawConfigParser +from re import compile as re_compile import os from .. import ( @@ -158,6 +159,14 @@ class LCConfigParser(RawConfigParser): for val in loggers: output = self.config.logger_new(val, fileoutput=1) self.config['fileoutput'].append(output) + if self.has_option(section, "ignoreerrors"): + for line in read_multiline(self.get(section, "ignoreerrors")): + parts = line.split(maxsplit=1) + if len(parts) == 1: + parts.append('') + self.config["ignoreerrors"].append(tuple( + re_compile(part) for part in parts + )) def read_checking_config(self): """Read configuration options in section "checking".""" diff --git a/linkcheck/data/linkcheckerrc b/linkcheck/data/linkcheckerrc index d468f50e..40abdfa4 100644 --- a/linkcheck/data/linkcheckerrc +++ b/linkcheck/data/linkcheckerrc @@ -18,6 +18,12 @@ #quiet=1 # additional file output #fileoutput = text, html, gml, sql +# errors to ignore (URL regular expression, message regular expression) +#ignoreerrors= +# ignore all errors for broken.example.com: +# ^https?://broken\.example\.com/ +# ignore SSL errors for dev.example.com: +# ^https://dev\.example\.com/ ^SSLError .* ##################### logger configuration ########################## diff --git 
a/tests/checker/test_ignoreerrors.py b/tests/checker/test_ignoreerrors.py new file mode 100644 index 00000000..8b2d3420 --- /dev/null +++ b/tests/checker/test_ignoreerrors.py @@ -0,0 +1,76 @@ +# Copyright (C) 2004-2014 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" +Test ignoring of errors. +""" + +from re import compile as re_compile + +from tests import need_network +from . import LinkCheckTest + + +class TestFile(LinkCheckTest): + """ + Test whether ignoring of errors per URL works. + """ + + def _test(self, url, url_regex, msg_regex, valid): + """ Shorthand for various tests of ignoring errors. """ + confargs = { + "ignoreerrors": [ + (re_compile(url_regex), re_compile(msg_regex)) + ] + } + resultlines = [ + "url %s" % url, + "cache key %s" % url, + "real url %s" % url, + "valid" if valid else "error", + ] + self.direct(url, resultlines, confargs=confargs) + + def test_no_error(self): + """ Test that URLs without errors stay valid for any patterns. 
""" + self._test("mailto:good@example.com", "", "", True) + self._test("mailto:good@example.com", "^$", "", True) + self._test("mailto:good@example.com", "^$", "^no-match$", True) + self._test("mailto:good@example.com", + r"^mailto:good@example\.com$", "", True) + self._test("mailto:good@example.com", + r"^mailto:good@example\.com$", "^$", True) + + def test_url_regex(self): + """ Test that URLs are properly matched. """ + self._test("mailto:foo", r"^$", "", False) + self._test("mailto:foo", r"", "", True) + self._test("mailto:foo", r"^mailto:foo$", "", True) + self._test("mailto:foobar", r"^mailto:foo", "", True) + + def test_msg_regex(self): + """ Test that error messages are properly matched. """ + self._test("mailto:foo", r"^mailto:foo$", "^$", False) + self._test("mailto:foo", r"^mailto:foo$", "", True) + self._test("mailto:foo", r"^mailto:foo$", + r"^Missing `@' in mail address `foo'.$", True) + + @need_network + def test_internet(self): + """ Test a few well-known Internet URLs. """ + self._test("http://example.com/does-not-exist", + r"^http://example.com/.+$", "^404", True) + self._test("http://does-not-exist.example.com", + r"example.com", "^ConnectionError", True) diff --git a/tests/configuration/data/config0.ini b/tests/configuration/data/config0.ini index f2e0d3cc..7116a0c6 100644 --- a/tests/configuration/data/config0.ini +++ b/tests/configuration/data/config0.ini @@ -51,6 +51,9 @@ verbose=1 warnings=1 quiet=0 fileoutput = Text, html, Gml, sql,csv, xml, gxml, dot +ignoreerrors= + ^https://example.com/does-not-exist ^404 + ^mailto:foo [text] filename=imadoofus.txt diff --git a/tests/configuration/test_config.py b/tests/configuration/test_config.py index 15fe6806..75fcc24f 100644 --- a/tests/configuration/test_config.py +++ b/tests/configuration/test_config.py @@ -19,6 +19,7 @@ Test config parsing. 
import unittest import os +from re import Pattern import linkcheck.configuration @@ -66,6 +67,17 @@ class TestConfig(unittest.TestCase): self.assertTrue(key in patterns) for key in ("url-unicode-domain",): self.assertTrue(key in config["ignorewarnings"]) + self.assertEqual(len(config["ignoreerrors"]), 2) + for parts in config["ignoreerrors"]: + self.assertEqual(len(parts), 2) + for part in parts: + self.assertIsInstance(part, Pattern) + self.assertTrue(config["ignoreerrors"][0][1].search( + "404 Not Found" + )) + self.assertTrue(config["ignoreerrors"][1][0].search( + "mailto:foo" + )) self.assertTrue(config["checkextern"]) # authentication section patterns = [x["pattern"].pattern for x in config["authentication"]]