remove comments from CSS files before parsing for links

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3601 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-11 08:03:11 +00:00 · 2007-11-14 18:46:14 +00:00 · 2007-11-14 18:46:14 +00:00 · a1d911127b
commit a1d911127b
parent dc11717bce
6 changed files with 26 additions and 3 deletions
--- a/5
+++ b/5
@ -31,6 +31,11 @@
    Type: documentation
    Changed: doc/{en,de}/linkchecker.1

+  * Remove comments from CSS content before searching for links.
+    Type: bugfix
+    Changed: linkcheck/linkparse.py, linkcheck/checker/urlbase.py
+    Closes: SF bug #1831900
+
 4.7 "300" (released 17.6.2007)

  * Mention in the documentation that --anchors enables logging of
--- a/2
+++ b/2
@ -1,7 +1,5 @@
 - [OPTIMIZATION] Don't store content in TagFinder, only in LinkFinder

- [BUGFIX] Ignore links in commented-out CSS data (SF Bug #1831900)
-
 - [BUG REPORT] Running on Windows XP with threads and a local HTTP Server
  yields a lot of (10061 'Connection Refused') errors. Without threads (-t0)
  gets rid of these errors. Is it the server?
--- a/linkcheck/checker/tests/data/file.css
+++ b/linkcheck/checker/tests/data/file.css
@ -2,3 +2,4 @@
  src:url(file.html)
 }
 background-image:url(file.html)
+/*background-image:url(broken.html)*/
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@ -726,7 +726,8 @@ class UrlBase (object):
            "Parsing CSS %s", self)
        lineno = 0
        linkfinder = linkcheck.linkparse.css_url_re.finditer
-        for line in self.get_content().splitlines():
+        strip_comments = linkcheck.linkparse.strip_c_comments
+        for line in strip_comments(self.get_content()).splitlines():
            lineno += 1
            for mo in linkfinder(line):
                column = mo.start("url")
--- a/linkcheck/linkparse.py
+++ b/linkcheck/linkparse.py
@ -63,6 +63,13 @@ LinkTags = {
 refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$")
 _quoted_pat = ur"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
 css_url_re = re.compile(ur"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)
+c_comment_re = re.compile(ur"/\*.*?\*/", re.DOTALL)  # non-greedy; DOTALL lets one comment span lines
+
+def strip_c_comments (text):
+    """Remove C/CSS-style /* ... */ comments from text. Note that this function
+    also deliberately removes comments that appear inside of strings."""
+    return c_comment_re.sub('', text)  # NOTE(review): newlines inside a comment are removed too
+

 class TagFinder (object):
    """
--- a/linkcheck/tests/test_linkparser.py
+++ b/linkcheck/tests/test_linkparser.py
@ -77,6 +77,17 @@ class TestLinkparser (unittest.TestCase):
        content = u"<table style='background: url( \"%s\") no-repeat' >"
        self._test_one_link(content % url, url)

+    def test_comment_stripping (self):
+        strip = linkcheck.linkparse.strip_c_comments
+        content = "/* url('http://imadoofus.org')*/"  # url() wrapped in a comment
+        self.assertEqual(strip(content), "")
+        content = "/* * * **/"  # stray asterisks before the terminator
+        self.assertEqual(strip(content), "")
+        content = "/* * /* * **//* */"  # inner "/*" does not nest; two adjacent comments
+        self.assertEqual(strip(content), "")
+        content = "a/* */b/* */c"  # text around the comments is kept
+        self.assertEqual(strip(content), "abc")
+

 def test_suite ():
    """