From e0a063104e8885123e50b15546dd170eda826485 Mon Sep 17 00:00:00 2001 From: calvin Date: Fri, 17 Oct 2003 10:53:48 +0000 Subject: [PATCH] parse css files recursively git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1058 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/FileUrlData.py | 8 ++++++++ linkcheck/FtpUrlData.py | 7 ++++++- linkcheck/HttpUrlData.py | 14 ++++++++++++++ linkcheck/UrlData.py | 34 ++++++++++++++++++++++++---------- linkcheck/__init__.py | 1 + 5 files changed, 53 insertions(+), 11 deletions(-) diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py index 1188d2fb..2f51e68c 100644 --- a/linkcheck/FileUrlData.py +++ b/linkcheck/FileUrlData.py @@ -112,6 +112,14 @@ class FileUrlData (UrlData): def isHtml (self): + if extensions['html'].search(self.url): + return True + if contents['html'].search(self.getContent()[:20]): + return True + return False + + + def isParseable (self): # guess by extension for ro in extensions.values(): if ro.search(self.url): diff --git a/linkcheck/FtpUrlData.py b/linkcheck/FtpUrlData.py index 957e350b..2282b284 100644 --- a/linkcheck/FtpUrlData.py +++ b/linkcheck/FtpUrlData.py @@ -65,7 +65,12 @@ class FtpUrlData (ProxyUrlData): def isHtml (self): - # guess by extension + if extensions['html'].search(self.url): + return True + return False + + + def isParseable (self): for ro in extensions.values(): if ro.search(self.url): return True diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py index 4582fea5..6de78222 100644 --- a/linkcheck/HttpUrlData.py +++ b/linkcheck/HttpUrlData.py @@ -369,6 +369,20 @@ class HttpUrlData (ProxyUrlData): return True + def isParseable (self): + if not (self.valid and self.headers): + return False + if self.headers.gettype()[:9] not in ("text/html", "text/css"): + return False + encoding = self.headers.get("Content-Encoding") + if encoding and encoding not in _supported_encodings and \ + encoding!='identity': + 
self.setWarning(i18n._('Unsupported content encoding %s.')%\ + `encoding`) + return False + return True + + def getRobotsTxtUrl (self): return self.urlparts[0]+"://"+self.urlparts[1]+"/robots.txt" diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index f8712833..03b62338 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -212,6 +212,10 @@ class UrlData (object): self.validString = i18n._("Valid")+": "+s + def isParseable (self): + return False + + def isHtml (self): return False @@ -342,8 +346,8 @@ class UrlData (object): self.checkConnection() if self.cached: return - if self.anchor and self.config["anchors"]: - self.checkAnchors(self.anchor) + if self.config["anchors"]: + self.checkAnchors() except tuple(ExcList): type, value, tb = sys.exc_info() debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb)) @@ -417,10 +421,9 @@ class UrlData (object): def allowsRecursion (self): - # note: isHtml() might not be working if valid is false, so be - # sure to test it first. 
+ # note: test self.valid before self.isParseable() return self.valid and \ - self.isHtml() and \ + self.isParseable() and \ self.hasContent() and \ not self.cached and \ (self.config["recursionlevel"] >= 0 and @@ -428,15 +431,17 @@ class UrlData (object): not self.extern[0] - def checkAnchors (self, anchor): - debug(HURT_ME_PLENTY, "checking anchor", anchor) - if not (self.valid and anchor and self.isHtml() and self.hasContent()): + def checkAnchors (self): + if not (self.valid and self.anchor and self.isHtml() and \ + self.hasContent()): + # do not bother return + debug(HURT_ME_PLENTY, "checking anchor", self.anchor) h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']}) for cur_anchor,line,column,name,base in h.urls: - if cur_anchor == anchor: + if cur_anchor == self.anchor: return - self.setWarning(i18n._("anchor #%s not found") % anchor) + self.setWarning(i18n._("anchor #%s not found") % self.anchor) def _getExtern (self): @@ -581,6 +586,15 @@ class UrlData (object): self.config, self.url, None, lineno, "")) + def parse_css (self): + """parse a CSS file for url() patterns""" + lineno = 0 + lines = self.getContent().splitlines() + for line in lines: + lineno += 1 + # XXX todo: css url pattern matching + + def __str__ (self): return ("%s link\n" "urlname=%s\n" diff --git a/linkcheck/__init__.py b/linkcheck/__init__.py index 32eea2b7..c0f569a9 100644 --- a/linkcheck/__init__.py +++ b/linkcheck/__init__.py @@ -39,6 +39,7 @@ def getLinkPat (arg, strict=False): extensions = { "html": re.compile(r'(?i)\.s?html?$'), "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file + "css": re.compile(r'(?i)\.css$'), # CSS stylesheet # "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'), }