Full CSS parsing

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1300 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-04-04 09:30:10 +00:00
parent f4802fd467
commit e78a8ea539
4 changed files with 32 additions and 16 deletions

View file

@ -135,10 +135,10 @@ class FileUrlData (UrlData):
def parseUrl (self):
    """Dispatch to the matching parse_<key> handler.

    First the URL itself is matched against the known extension
    patterns; failing that, the first 20 bytes of the content are
    sniffed against the known content patterns.  Returns whatever the
    handler returns, or None when nothing matched.
    """
    for key, ro in extensions.items():
        if ro.search(self.url):
            return getattr(self, "parse_"+key)()
    for key, ro in contents.items():
        # content sniffing: only look at the very start of the data
        if ro.search(self.getContent()[:20]):
            return getattr(self, "parse_"+key)()
    return None

View file

@ -393,13 +393,17 @@ class HttpUrlData (ProxyUrlData):
return True
def getContentType (self):
    """Return the MIME type from the Content-Type response header.

    Any parameters following a semicolon (e.g. "; charset=...") are
    stripped; when the header is missing, the generic
    application/octet-stream type is returned.
    """
    content_type = self.headers.get('Content-Type',
                                    'application/octet-stream')
    # keep only the media type, dropping ";charset=..." parameters
    return content_type.partition(';')[0]
def isParseable (self):
if not (self.valid and self.headers):
return False
ptype = self.headers.gettype()
if ";" in ptype:
ptype = ptype.split(';')[0]
if ptype not in ("text/html", "text/stylesheet"):
if self.getContentType() not in ("text/html", "text/css"):
return False
encoding = self.headers.get("Content-Encoding")
if encoding and encoding not in _supported_encodings and \
@ -409,6 +413,15 @@ class HttpUrlData (ProxyUrlData):
return True
def parseUrl (self):
    """Parse the fetched document according to its MIME type.

    HTML and CSS documents are handed to the matching parser method;
    any other type is silently ignored.  Always returns None.
    """
    # dispatch table keyed by media type
    parsers = {
        "text/html": self.parse_html,
        "text/css": self.parse_css,
    }
    parser = parsers.get(self.getContentType())
    if parser is not None:
        parser()
    return None
def getRobotsTxtUrl (self):
    """Return the robots.txt URL built from this URL's scheme and host."""
    scheme = self.urlparts[0]
    host = self.urlparts[1]
    return "%s://%s/robots.txt" % (scheme, host)

View file

@ -23,7 +23,7 @@ from linkcheck.parser import htmlsax
DNS.DiscoverNameServers()
import Config, StringUtil, test_support
from linkparse import LinkFinder, MetaRobotsFinder
from linkparse import LinkFinder, MetaRobotsFinder, css_url_re
from debug import *
ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search
@ -634,22 +634,24 @@ class UrlData (object):
UNUSED and UNTESTED, just use linkchecker `cat file.txt`
"""
lineno = 0
lines = self.getContent().splitlines()
for line in lines:
for line in self.getContent().splitlines():
lineno += 1
line = line.strip()
if not line or line.startswith('#'): continue
self.config.appendUrl(GetUrlDataFrom(line, self.recursionLevel+1,
self.config, self.url, None, lineno, ""))
self.config, parentName=self.url, line=lineno))
def parse_css (self):
    """Parse a CSS file for url(...) patterns.

    Every URL referenced from the stylesheet is queued for checking
    via self.config.appendUrl, recording the line number and the
    column of the url inside that line for error reporting.
    """
    lineno = 0
    for line in self.getContent().splitlines():
        lineno += 1
        for mo in css_url_re.finditer(line):
            # column where the url text starts, for precise reporting
            column = mo.start("url")
            self.config.appendUrl(GetUrlDataFrom(mo.group("url"),
                self.recursionLevel+1, self.config,
                parentName=self.url, line=lineno, column=column))
def __str__ (self):

View file

@ -53,10 +53,10 @@ LinkTags = {
# matcher for <meta http-equiv=refresh> tags
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
_css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
class TagFinder (object):
"""base class putting message in list"""
"""base class storing parse messages in a list"""
def __init__ (self, content):
self.content = content
# warnings and errors during parsing
@ -151,7 +151,7 @@ class LinkFinder (TagFinder):
if mo:
urls.append(mo.group("url"))
elif attr=='style':
for mo in _css_url_re.finditer(url):
for mo in css_url_re.finditer(url):
urls.append(mo.group("url"))
else:
urls.append(url)
@ -163,3 +163,4 @@ class LinkFinder (TagFinder):
self.urls.append((u, self.parser.last_lineno(),
self.parser.last_column(), name, base))