remove comments from CSS files before parsing for links

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3601 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2007-11-14 18:46:14 +00:00
parent dc11717bce
commit a1d911127b
6 changed files with 26 additions and 3 deletions

View file

@ -31,6 +31,11 @@
Type: documentation
Changed: doc/{en,de}/linkchecker.1
* Remove comments from CSS content before searching for links.
Type: bugfix
Changed: linkcheck/linkparse.py, linkcheck/checker/urlbase.py
Closes: SF bug #1831900
4.7 "300" (released 17.6.2007)
* Mention in the documentation that --anchors enables logging of

2
TODO
View file

@ -1,7 +1,5 @@
- [OPTIMIZATION] Don't store content in TagFinder, only in LinkFinder
- [BUGFIX] Ignore links in commented-out CSS data (SF Bug #1831900)
- [BUG REPORT] Running on Windows XP with threads and a local HTTP Server
yields a lot of (10061 'Connection Refused') errors. Running without threads
(-t0) gets rid of these errors. Is it the server?

View file

@ -2,3 +2,4 @@
src:url(file.html)
}
background-image:url(file.html)
/*background-image:url(broken.html)*/

View file

@ -726,7 +726,8 @@ class UrlBase (object):
"Parsing CSS %s", self)
lineno = 0
linkfinder = linkcheck.linkparse.css_url_re.finditer
for line in self.get_content().splitlines():
strip_comments = linkcheck.linkparse.strip_c_comments
for line in strip_comments(self.get_content()).splitlines():
lineno += 1
for mo in linkfinder(line):
column = mo.start("url")

View file

@ -63,6 +63,13 @@ LinkTags = {
# Case-insensitive pattern "<digits>;<optional whitespace>url=<rest>"; the
# rest of the string is captured in the named group "url".
# NOTE(review): presumably applied to the content value of a meta refresh
# tag -- confirm at the call site.
refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$")
# One URL token: a single-quoted string, a double-quoted string, or a run of
# characters that are neither ")" nor whitespace. The quotes are part of
# the match.
_quoted_pat = ur"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
# CSS "url( ... )" reference. The named group "url" holds the token matched
# by _quoted_pat, i.e. including any surrounding quotes; whitespace just
# inside the parentheses is skipped.
css_url_re = re.compile(ur"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)
# A "/* ... */" comment. The match is non-greedy, so it ends at the first
# "*/", and re.DOTALL lets "." match newlines so a comment may span several
# lines.
c_comment_re = re.compile(ur"/\*.*?\*/", re.DOTALL)
def strip_c_comments (text):
    """Remove C/CSS-style comments from text.

    Every "/* ... */" span matched by c_comment_re is cut out. Note that
    this deliberately also removes comments inside of strings.

    Line breaks contained in a removed comment are kept (as "\\n"), so
    the result splits into the same number of lines as the input and
    line numbers computed on the stripped text still refer to the
    original text. Comments without a line break are replaced by the
    empty string.

    @param text: the content to strip
    @return: the content without comments
    """
    def keep_line_breaks (mo):
        # A matched comment is never empty, so splitlines() yields at
        # least one item; the number of line breaks is one less than the
        # number of lines. splitlines() is used for counting so that the
        # count agrees with code that later splits the result by lines.
        return "\n" * (len(mo.group().splitlines()) - 1)
    return c_comment_re.sub(keep_line_breaks, text)
class TagFinder (object):
"""

View file

@ -77,6 +77,17 @@ class TestLinkparser (unittest.TestCase):
content = u"<table style='background: url( \"%s\") no-repeat' >"
self._test_one_link(content % url, url)
def test_comment_stripping (self):
    """Check strip_c_comments() against (input, expected output) pairs."""
    strip = linkcheck.linkparse.strip_c_comments
    cases = (
        # a URL hidden inside a comment
        ("/* url('http://imadoofus.org')*/", ""),
        # stars inside the comment and directly before the terminator
        ("/* * * **/", ""),
        # a "/*" inside a comment, followed by a second comment
        ("/* * /* * **//* */", ""),
        # text around and between comments is kept
        ("a/* */b/* */c", "abc"),
    )
    for content, expected in cases:
        self.assertEqual(strip(content), expected)
def test_suite ():
"""