Parse CSS files recursively.

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1058 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2003-10-17 10:53:48 +00:00
parent 160237bd52
commit e0a063104e
5 changed files with 53 additions and 11 deletions

View file

@ -112,6 +112,14 @@ class FileUrlData (UrlData):
def isHtml (self):
    """Guess whether this file URL points to HTML content.

    First checks the URL's file extension, then falls back to
    sniffing the first 20 bytes of the content.
    """
    if extensions['html'].search(self.url):
        return True
    return contents['html'].search(self.getContent()[:20]) is not None
def isParseable (self):
# guess by extension
for ro in extensions.values():
if ro.search(self.url):

View file

@ -65,7 +65,12 @@ class FtpUrlData (ProxyUrlData):
def isHtml (self):
    """Guess by file extension whether this FTP URL points to HTML."""
    return extensions['html'].search(self.url) is not None
def isParseable (self):
for ro in extensions.values():
if ro.search(self.url):
return True

View file

@ -369,6 +369,20 @@ class HttpUrlData (ProxyUrlData):
return True
def isParseable (self):
    """Check whether this HTTP response can be parsed for links.

    Returns False for invalid responses, for content types other
    than HTML or CSS, and for unsupported content encodings (which
    additionally set a warning).
    """
    if not (self.valid and self.headers):
        return False
    # gettype() yields only the media type, without parameters.
    # Fixed: the original compared gettype()[:9] against the bogus
    # type "test/stylesheet" (a typo, and unreachable since a 9-char
    # slice can never equal a 15-char string); CSS is "text/css".
    if self.headers.gettype() not in ("text/html", "text/css"):
        return False
    encoding = self.headers.get("Content-Encoding")
    if encoding and encoding not in _supported_encodings and \
       encoding != 'identity':
        self.setWarning(i18n._('Unsupported content encoding %s.') %
                        repr(encoding))
        return False
    return True
def getRobotsTxtUrl (self):
    """Build the robots.txt URL for this URL's scheme and host."""
    scheme = self.urlparts[0]
    host = self.urlparts[1]
    return scheme + "://" + host + "/robots.txt"

View file

@ -212,6 +212,10 @@ class UrlData (object):
self.validString = i18n._("Valid")+": "+s
def isParseable (self):
    """Base implementation: a generic URL is never parseable.

    Subclasses override this for content types they can parse.
    """
    return False
def isHtml (self):
    """Base implementation: a generic URL is never considered HTML.

    Subclasses override this when they can detect HTML content.
    """
    return False
@ -342,8 +346,8 @@ class UrlData (object):
self.checkConnection()
if self.cached:
return
if self.anchor and self.config["anchors"]:
self.checkAnchors(self.anchor)
if self.config["anchors"]:
self.checkAnchors()
except tuple(ExcList):
type, value, tb = sys.exc_info()
debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
@ -417,10 +421,9 @@ class UrlData (object):
def allowsRecursion (self):
# note: isHtml() might not be working if valid is false, so be
# sure to test it first.
# note: test self.valid before self.isParseable()
return self.valid and \
self.isHtml() and \
self.isParseable() and \
self.hasContent() and \
not self.cached and \
(self.config["recursionlevel"] >= 0 and
@ -428,15 +431,17 @@ class UrlData (object):
not self.extern[0]
def checkAnchors (self):
    """Verify that self.anchor exists in this URL's document.

    Sets a warning if the anchor is not found. Does nothing when the
    URL is invalid, has no anchor, is not HTML, or has no content.

    Note: this span interleaved the pre- and post-change variants of
    the method (diff markers were lost); this is the post-change
    version, which reads the anchor from self.anchor instead of
    taking it as a parameter.
    """
    if not (self.valid and self.anchor and self.isHtml() and \
            self.hasContent()):
        # do not bother
        return
    debug(HURT_ME_PLENTY, "checking anchor", self.anchor)
    h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']})
    for cur_anchor, line, column, name, base in h.urls:
        if cur_anchor == self.anchor:
            return
    self.setWarning(i18n._("anchor #%s not found") % self.anchor)
def _getExtern (self):
@ -581,6 +586,15 @@ class UrlData (object):
self.config, self.url, None, lineno, ""))
def parse_css (self):
    """parse a CSS file for url() patterns"""
    for zero_based, line in enumerate(self.getContent().splitlines()):
        lineno = zero_based + 1  # 1-based line number for future reporting
        # XXX todo: css url pattern matching
def __str__ (self):
return ("%s link\n"
"urlname=%s\n"

View file

@ -39,6 +39,7 @@ def getLinkPat (arg, strict=False):
# Map content category -> compiled regex matched against the URL's
# file extension to guess that category.
extensions = {
"html": re.compile(r'(?i)\.s?html?$'), # HTML page (.htm/.html/.shtml)
"opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file
"css": re.compile(r'(?i)\.css$'), # CSS stylesheet
# "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'),
}