From a1d911127bb8a7a20a417d5f09c4dfd81fa2488f Mon Sep 17 00:00:00 2001
From: calvin <calvin@e7d03fd6-7b0d-0410-9947-9c21f3af8025>
Date: Wed, 14 Nov 2007 18:46:14 +0000
Subject: [PATCH] remove comments from CSS files before parsing for links

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3601 e7d03fd6-7b0d-0410-9947-9c21f3af8025
---
 ChangeLog                             |  5 +++++
 TODO                                  |  2 --
 linkcheck/checker/tests/data/file.css |  1 +
 linkcheck/checker/urlbase.py          |  3 ++-
 linkcheck/linkparse.py                |  7 +++++++
 linkcheck/tests/test_linkparser.py    | 11 +++++++++++
 6 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 3af9adc4..7c2492f9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -31,6 +31,11 @@
     Type: documentation
     Changed: doc/{en,de}/linkchecker.1
 
+  * Remove comments from CSS content before searching for links.
+    Type: bugfix
+    Changed: linkcheck/linkparse.py, linkcheck/checker/urlbase.py
+    Closes: SF bug #1831900
+
 4.7 "300" (released 17.6.2007)
 
   * Mention in the documentation that --anchors enables logging of
diff --git a/TODO b/TODO
index 6a8a67fa..adda574b 100644
--- a/TODO
+++ b/TODO
@@ -1,7 +1,5 @@
 - [OPTIMIZATION] Don't store content in TagFinder, only in LinkFinder
 
-- [BUGFIX] Ignore links in commented-out CSS data (SF Bug #1831900)
-
 - [BUG REPORT] Running on Windows XP with threads and a local HTTP Server
   yields a lot of (10061 'Connection Refused') errors. Without threads (-t0)
   gets rid of these errors. Is it the server?
diff --git a/linkcheck/checker/tests/data/file.css b/linkcheck/checker/tests/data/file.css
index 55ee1ade..dcbf282e 100644
--- a/linkcheck/checker/tests/data/file.css
+++ b/linkcheck/checker/tests/data/file.css
@@ -2,3 +2,4 @@
   src:url(file.html)
 }
 background-image:url(file.html)
+/*background-image:url(broken.html)*/
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index f23148e5..9c9d2dd1 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -726,7 +726,8 @@ class UrlBase (object):
             "Parsing CSS %s", self)
         lineno = 0
         linkfinder = linkcheck.linkparse.css_url_re.finditer
-        for line in self.get_content().splitlines():
+        strip_comments = linkcheck.linkparse.strip_c_comments
+        for line in strip_comments(self.get_content()).splitlines():
             lineno += 1
             for mo in linkfinder(line):
                 column = mo.start("url")
diff --git a/linkcheck/linkparse.py b/linkcheck/linkparse.py
index 5e4602e5..753d4976 100644
--- a/linkcheck/linkparse.py
+++ b/linkcheck/linkparse.py
@@ -63,6 +63,13 @@ LinkTags = {
 refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$")
 _quoted_pat = ur"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
 css_url_re = re.compile(ur"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)
+c_comment_re = re.compile(ur"/\*.*?\*/", re.DOTALL)  # non-greedy; DOTALL lets a comment span lines
+
+def strip_c_comments (text):
+    """Strip every /* ... */ span from text, including spans inside strings
+    (deliberate). Newlines inside a comment vanish, so later lines shift."""
+    return c_comment_re.sub('', text)
+
 
 class TagFinder (object):
     """
diff --git a/linkcheck/tests/test_linkparser.py b/linkcheck/tests/test_linkparser.py
index 9206861e..5e9ee361 100644
--- a/linkcheck/tests/test_linkparser.py
+++ b/linkcheck/tests/test_linkparser.py
@@ -77,6 +77,17 @@ class TestLinkparser (unittest.TestCase):
         content = u"<table style='background: url( \"%s\") no-repeat' >"
         self._test_one_link(content % url, url)
 
+    def test_comment_stripping (self):  # checks strip_c_comments on /*...*/ input
+        strip = linkcheck.linkparse.strip_c_comments
+        content = "/* url('http://imadoofus.org')*/"  # url() wrapped in a comment
+        self.assertEqual(strip(content), "")
+        content = "/* * * **/"  # stray asterisks before the closing marker
+        self.assertEqual(strip(content), "")
+        content = "/* * /* * **//* */"  # inner "/*" does not nest; two comments
+        self.assertEqual(strip(content), "")
+        content = "a/* */b/* */c"  # text around the comments is kept
+        self.assertEqual(strip(content), "abc")
+
 
 def test_suite ():
     """