diff --git a/doc/changelog.txt b/doc/changelog.txt
index 1c8b1033..7727ec74 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -5,6 +5,11 @@ Fixes:
   some sites.
   Closes: SF bug #3388291
 
+Features:
+- checking: If a warning regex is configured, multiple matches in
+  the URL content are added as warnings.
+  Closes: SF bug #3412317
+
 
 7.1 "A fish called Wanda" (released 6.8.2011)
 
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index e126b4b5..4373edb4 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -723,15 +723,28 @@ class UrlBase (object):
 
     def check_warningregex (self):
         """Check if content matches a given regular expression."""
-        warningregex = self.aggregate.config["warningregex"]
+        config = self.aggregate.config
+        warningregex = config["warningregex"]
         if not (warningregex and self.valid and self.is_parseable()):
             return
-        log.debug(LOG_CHECK, "checking content")
+        log.debug(LOG_CHECK, "checking content for warning regex")
         try:
-            match = warningregex.search(self.get_content())
-            if match:
-                self.add_warning(_("Found %(match)r in link contents.") %
-                    {"match": match.group()}, tag=WARN_URL_WARNREGEX_FOUND)
+            content = self.get_content()
+            curpos = 0
+            curline = 1
+            # add warnings for found matches, up to the maximum allowed number
+            for num, match in enumerate(warningregex.finditer(content)):
+                # calculate line number for match
+                curline += content.count('\n', curpos, match.start())
+                curpos = match.start()
+                # add a warning message
+                msg = _("Found %(match)r at line %(line)d in link contents.")
+                self.add_warning(msg %
+                    {"match": match.group(), "line": curline},
+                    tag=WARN_URL_WARNREGEX_FOUND)
+                # num is zero-based: num + 1 warnings have been added so far
+                if num + 1 >= config["warningregex_max"]:
+                    break
         except tuple(ExcList):
             value = self.handle_exception()
             self.set_result(unicode_safe(value), valid=False)
@@ -950,10 +963,8 @@ class UrlBase (object):
         self.aggregate.urlqueue.put(url_data)
 
     def parse_text (self):
-        """
-        Parse a text file with on url per line; comment and blank
-        lines are ignored.
-        """
+        """Parse a text file with one url per line; comment and blank
+        lines are ignored."""
         log.debug(LOG_CHECK, "Parsing text %s", self)
         lineno = 0
         for line in self.get_content().splitlines():
diff --git a/linkcheck/configuration/__init__.py b/linkcheck/configuration/__init__.py
index c1533fbe..4d6749cd 100644
--- a/linkcheck/configuration/__init__.py
+++ b/linkcheck/configuration/__init__.py
@@ -179,6 +179,7 @@ class Configuration (dict):
         self['output'] = 'text'
         self['logger'] = None
         self["warningregex"] = None
+        self["warningregex_max"] = 5
         self["warnsizebytes"] = None
         self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
         self["threads"] = 10