From a1d911127bb8a7a20a417d5f09c4dfd81fa2488f Mon Sep 17 00:00:00 2001 From: calvin Date: Wed, 14 Nov 2007 18:46:14 +0000 Subject: [PATCH] remove comments from CSS files before parsing for links git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3601 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- ChangeLog | 5 +++++ TODO | 2 -- linkcheck/checker/tests/data/file.css | 1 + linkcheck/checker/urlbase.py | 3 ++- linkcheck/linkparse.py | 7 +++++++ linkcheck/tests/test_linkparser.py | 11 +++++++++++ 6 files changed, 26 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3af9adc4..7c2492f9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -31,6 +31,11 @@ Type: documentation Changed: doc/{en,de}/linkchecker.1 + * Remove comments from CSS content before searching for links. + Type: bugfix + Changed: linkcheck/linkparse.py, linkcheck/checker/urlbase.py + Closes: SF bug #1831900 + 4.7 "300" (released 17.6.2007) * Mention in the documentation that --anchors enables logging of diff --git a/TODO b/TODO index 6a8a67fa..adda574b 100644 --- a/TODO +++ b/TODO @@ -1,7 +1,5 @@ - [OPTIMIZATION] Don't store content in TagFinder, only in LinkFinder -- [BUGFIX] Ignore links in commented-out CSS data (SF Bug #1831900) - - [BUG REPORT] Running on Windows XP with threads and a local HTTP Server yields a lot of (10061 'Connection Refused') errors. Without threads (-t0) gets rid of these errors. Is it the server? 
diff --git a/linkcheck/checker/tests/data/file.css b/linkcheck/checker/tests/data/file.css index 55ee1ade..dcbf282e 100644 --- a/linkcheck/checker/tests/data/file.css +++ b/linkcheck/checker/tests/data/file.css @@ -2,3 +2,4 @@ src:url(file.html) } background-image:url(file.html) +/*background-image:url(broken.html)*/ diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index f23148e5..9c9d2dd1 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -726,7 +726,8 @@ class UrlBase (object): "Parsing CSS %s", self) lineno = 0 linkfinder = linkcheck.linkparse.css_url_re.finditer - for line in self.get_content().splitlines(): + strip_comments = linkcheck.linkparse.strip_c_comments + for line in strip_comments(self.get_content()).splitlines(): lineno += 1 for mo in linkfinder(line): column = mo.start("url") diff --git a/linkcheck/linkparse.py b/linkcheck/linkparse.py index 5e4602e5..753d4976 100644 --- a/linkcheck/linkparse.py +++ b/linkcheck/linkparse.py @@ -63,6 +63,13 @@ LinkTags = { refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$") _quoted_pat = ur"('[^']+'|\"[^\"]+\"|[^\)\s]+)" css_url_re = re.compile(ur"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat) +c_comment_re = re.compile(ur"/\*.*?\*/", re.DOTALL) + +def strip_c_comments (text): + """Remove C/CSS-style comments from text. 
Note that this method also + deliberately removes comments inside of strings.""" + return c_comment_re.sub('', text) + class TagFinder (object): """ diff --git a/linkcheck/tests/test_linkparser.py b/linkcheck/tests/test_linkparser.py index 9206861e..5e9ee361 100644 --- a/linkcheck/tests/test_linkparser.py +++ b/linkcheck/tests/test_linkparser.py @@ -77,6 +77,17 @@ class TestLinkparser (unittest.TestCase): content = u"" self._test_one_link(content % url, url) + def test_comment_stripping (self): + strip = linkcheck.linkparse.strip_c_comments + content = "/* url('http://imadoofus.org')*/" + self.assertEqual(strip(content), "") + content = "/* * * **/" + self.assertEqual(strip(content), "") + content = "/* * /* * **//* */" + self.assertEqual(strip(content), "") + content = "a/* */b/* */c" + self.assertEqual(strip(content), "abc") + def test_suite (): """