add option to ignore specific errors for specific URLs

This commit is contained in:
Lukas Pirl 2022-07-21 17:32:27 +02:00
parent 36a45b0f96
commit 8c959589c3
No known key found for this signature in database
GPG key ID: 3D9FF3F045C0939F
8 changed files with 138 additions and 0 deletions

View file

@ -188,6 +188,24 @@ URL checking results
**warnings=**\ [**0**\ \|\ **1**]
If set, log warnings. Default is to log warnings.
Command line option: :option:`--no-warnings`
**ignoreerrors=**\ *URL_REGEX* [*MESSAGE_REGEX*] (`MULTILINE`_)
Specify regular expressions to ignore errors for matching URLs, one
per line. A second regular expression, separated from the first by
whitespace, can be specified per line to only ignore errors whose
message matches it for the corresponding URLs. If the second
expression is omitted, all errors are ignored. In contrast
to filtering_, this happens *after* checking, which allows checking
URLs despite certain expected and tolerable errors. Default is to
not ignore any errors.
::
[output]
ignoreerrors=
^https://deprecated\.example\.com ^410 Gone
# ignore all errors (no second expression), also for syntax check:
^mailto:.*@example\.com$
Progress updates
""""""""""""""""

View file

@ -216,6 +216,7 @@ class UrlBase:
% {"url": base_url},
tag=WARN_URL_WHITESPACE,
)
self.ignore_errors = self.aggregate.config['ignoreerrors']
def reset(self):
"""
@ -270,6 +271,8 @@ class UrlBase:
self.content_type = ""
# URLs seen through redirections
self.aliases = []
# pairs of (URL regex, error message regex) describing errors to ignore
self.ignore_errors = []
def set_result(self, msg, valid=True, overwrite=False):
"""
@ -289,6 +292,16 @@ class UrlBase:
log.warn(LOG_CHECK, "Empty result for %s", self)
self.result = msg
self.valid = valid
if not self.valid:
for url_regex, msg_regex in self.ignore_errors:
if not url_regex.search(self.url):
continue
if not msg_regex.search(self.result):
continue
self.valid = True
self.result = f"Ignored: {self.result}"
# free content data
self.data = None

View file

@ -165,6 +165,7 @@ class Configuration(dict):
self["loginextrafields"] = {}
# filtering
self["externlinks"] = []
self["ignoreerrors"] = []
self["ignorewarnings"] = []
self["internlinks"] = []
self["checkextern"] = False

View file

@ -16,6 +16,7 @@
"""Parse configuration files"""
from configparser import RawConfigParser
from re import compile as re_compile
import os
from .. import (
@ -158,6 +159,14 @@ class LCConfigParser(RawConfigParser):
for val in loggers:
output = self.config.logger_new(val, fileoutput=1)
self.config['fileoutput'].append(output)
if self.has_option(section, "ignoreerrors"):
for line in read_multiline(self.get(section, "ignoreerrors")):
parts = line.split(maxsplit=1)
if len(parts) == 1:
parts.append('')
self.config["ignoreerrors"].append(tuple(
re_compile(part) for part in parts
))
def read_checking_config(self):
"""Read configuration options in section "checking"."""

View file

@ -18,6 +18,12 @@
#quiet=1
# additional file output
#fileoutput = text, html, gml, sql
# errors to ignore (URL regular expression, message regular expression)
#ignoreerrors=
# ignore all errors for broken.example.com:
# ^https?://broken\.example\.com/
# ignore SSL errors for dev.example.com:
# ^https://dev\.example\.com/ ^SSLError .*
##################### logger configuration ##########################

View file

@ -0,0 +1,76 @@
# Copyright (C) 2004-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test ignoring of errors.
"""
from re import compile as re_compile
from tests import need_network
from . import LinkCheckTest
class TestFile(LinkCheckTest):
    """
    Check that errors can be ignored on a per-URL basis.
    """

    def _test(self, url, url_regex, msg_regex, valid):
        """Check ``url`` with exactly one ignore rule configured.

        The rule is the pair of compiled ``url_regex`` and ``msg_regex``.
        ``valid`` selects whether the expected result line is "valid"
        or "error".
        """
        rule = (re_compile(url_regex), re_compile(msg_regex))
        expected = [
            template % url
            for template in ("url %s", "cache key %s", "real url %s")
        ]
        if valid:
            expected.append("valid")
        else:
            expected.append("error")
        self.direct(url, expected, confargs={"ignoreerrors": [rule]})

    def test_no_error(self):
        """A URL without errors stays valid whatever rule is configured."""
        url = "mailto:good@example.com"
        rules = (
            ("", ""),
            ("^$", ""),
            ("^$", "^no-match$"),
            (r"^mailto:good@example\.com$", ""),
            (r"^mailto:good@example\.com$", "^$"),
        )
        for url_regex, msg_regex in rules:
            self._test(url, url_regex, msg_regex, True)

    def test_url_regex(self):
        """Errors are only ignored when the URL expression matches."""
        cases = (
            # a URL expression that cannot match leaves the error in place
            ("mailto:foo", r"^$", "", False),
            ("mailto:foo", r"", "", True),
            ("mailto:foo", r"^mailto:foo$", "", True),
            ("mailto:foobar", r"^mailto:foo", "", True),
        )
        for case in cases:
            self._test(*case)

    def test_msg_regex(self):
        """Errors are only ignored when the message expression matches."""
        url = "mailto:foo"
        url_regex = r"^mailto:foo$"
        cases = (
            # a message expression that cannot match leaves the error in place
            ("^$", False),
            ("", True),
            (r"^Missing `@' in mail address `foo'.$", True),
        )
        for msg_regex, valid in cases:
            self._test(url, url_regex, msg_regex, valid)

    @need_network
    def test_internet(self):
        """Ignore errors reported for a few well-known Internet URLs."""
        cases = (
            (
                "http://example.com/does-not-exist",
                r"^http://example.com/.+$",
                "^404",
            ),
            (
                "http://does-not-exist.example.com",
                r"example.com",
                "^ConnectionError",
            ),
        )
        for url, url_regex, msg_regex in cases:
            self._test(url, url_regex, msg_regex, True)

View file

@ -51,6 +51,9 @@ verbose=1
warnings=1
quiet=0
fileoutput = Text, html, Gml, sql,csv, xml, gxml, dot
ignoreerrors=
^https://example.com/does-not-exist ^404
^mailto:foo
[text]
filename=imadoofus.txt

View file

@ -19,6 +19,7 @@ Test config parsing.
import unittest
import os
from re import Pattern
import linkcheck.configuration
@ -66,6 +67,17 @@ class TestConfig(unittest.TestCase):
self.assertTrue(key in patterns)
for key in ("url-unicode-domain",):
self.assertTrue(key in config["ignorewarnings"])
self.assertEqual(len(config["ignoreerrors"]), 2)
for parts in config["ignoreerrors"]:
self.assertEqual(len(parts), 2)
for part in parts:
self.assertTrue(isinstance(part, Pattern))
self.assertTrue(config["ignoreerrors"][0][1].search(
"404 Not Found"
))
self.assertTrue(config["ignoreerrors"][1][0].search(
"mailto:foo"
))
self.assertTrue(config["checkextern"])
# authentication section
patterns = [x["pattern"].pattern for x in config["authentication"]]