From e0a063104e8885123e50b15546dd170eda826485 Mon Sep 17 00:00:00 2001 From: calvin Date: Fri, 17 Oct 2003 10:53:48 +0000 Subject: [PATCH] parse css files recursively git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1058 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/FileUrlData.py | 8 ++++++++ linkcheck/FtpUrlData.py | 7 ++++++- linkcheck/HttpUrlData.py | 14 ++++++++++++++ linkcheck/UrlData.py | 34 ++++++++++++++++++++++++---------- linkcheck/__init__.py | 1 + 5 files changed, 53 insertions(+), 11 deletions(-) diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py index 1188d2fb..2f51e68c 100644 --- a/linkcheck/FileUrlData.py +++ b/linkcheck/FileUrlData.py @@ -112,6 +112,14 @@ class FileUrlData (UrlData): def isHtml (self): + if extensions['html'].search(self.url): + return True + if contents['html'].search(self.getContent()[:20]): + return True + return False + + + def isParseable (self): # guess by extension for ro in extensions.values(): if ro.search(self.url): diff --git a/linkcheck/FtpUrlData.py b/linkcheck/FtpUrlData.py index 957e350b..2282b284 100644 --- a/linkcheck/FtpUrlData.py +++ b/linkcheck/FtpUrlData.py @@ -65,7 +65,12 @@ class FtpUrlData (ProxyUrlData): def isHtml (self): - # guess by extension + if extensions['html'].search(self.url): + return True + return False + + + def isParseable (self): for ro in extensions.values(): if ro.search(self.url): return True diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py index 4582fea5..6de78222 100644 --- a/linkcheck/HttpUrlData.py +++ b/linkcheck/HttpUrlData.py @@ -369,6 +369,20 @@ class HttpUrlData (ProxyUrlData): return True + def isParseable (self): + if not (self.valid and self.headers): + return False + if self.headers.gettype()[:9] not in ("text/html", "text/css"): + return False + encoding = self.headers.get("Content-Encoding") + if encoding and encoding not in _supported_encodings and \ + encoding!='identity': + 
self.setWarning(i18n._('Unsupported content encoding %s.')%\ + `encoding`) + return False + return True + + def getRobotsTxtUrl (self): return self.urlparts[0]+"://"+self.urlparts[1]+"/robots.txt" diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index f8712833..03b62338 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -212,6 +212,10 @@ class UrlData (object): self.validString = i18n._("Valid")+": "+s + def isParseable (self): + return False + + def isHtml (self): return False @@ -342,8 +346,8 @@ class UrlData (object): self.checkConnection() if self.cached: return - if self.anchor and self.config["anchors"]: - self.checkAnchors(self.anchor) + if self.config["anchors"]: + self.checkAnchors() except tuple(ExcList): type, value, tb = sys.exc_info() debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb)) @@ -417,10 +421,9 @@ class UrlData (object): def allowsRecursion (self): - # note: isHtml() might not be working if valid is false, so be - # sure to test it first. 
+ # note: test self.valid before self.isParseable() return self.valid and \ - self.isHtml() and \ + self.isParseable() and \ self.hasContent() and \ not self.cached and \ (self.config["recursionlevel"] >= 0 and @@ -428,15 +431,17 @@ class UrlData (object): not self.extern[0] - def checkAnchors (self, anchor): - debug(HURT_ME_PLENTY, "checking anchor", anchor) - if not (self.valid and anchor and self.isHtml() and self.hasContent()): + def checkAnchors (self): + if not (self.valid and self.anchor and self.isHtml() and \ + self.hasContent()): + # do not bother return + debug(HURT_ME_PLENTY, "checking anchor", self.anchor) h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']}) for cur_anchor,line,column,name,base in h.urls: - if cur_anchor == anchor: + if cur_anchor == self.anchor: return - self.setWarning(i18n._("anchor #%s not found") % anchor) + self.setWarning(i18n._("anchor #%s not found") % self.anchor) def _getExtern (self): @@ -581,6 +586,15 @@ class UrlData (object): self.config, self.url, None, lineno, "")) + def parse_css (self): + """parse a CSS file for url() patterns""" + lineno = 0 + lines = self.getContent().splitlines() + for line in lines: + lineno += 1 + # XXX todo: css url pattern matching + + def __str__ (self): return ("%s link\n" "urlname=%s\n" diff --git a/linkcheck/__init__.py b/linkcheck/__init__.py index 32eea2b7..c0f569a9 100644 --- a/linkcheck/__init__.py +++ b/linkcheck/__init__.py @@ -39,6 +39,7 @@ def getLinkPat (arg, strict=False): extensions = { "html": re.compile(r'(?i)\.s?html?$'), "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file + "css": re.compile(r'(?i)\.css$'), # CSS stylesheet # "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'), }