Parse CSS files recursively.

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1058 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2003-10-17 10:53:48 +00:00
parent 160237bd52
commit e0a063104e
5 changed files with 53 additions and 11 deletions

View file

@ -112,6 +112,14 @@ class FileUrlData (UrlData):
def isHtml (self):
    """Guess whether this file URL points to HTML content.

    First checks the URL's file extension, then falls back to
    sniffing the first 20 bytes of the content.
    """
    if extensions['html'].search(self.url):
        return True
    return contents['html'].search(self.getContent()[:20]) is not None
def isParseable (self):
# guess by extension
for ro in extensions.values():
if ro.search(self.url):

View file

@ -65,7 +65,12 @@ class FtpUrlData (ProxyUrlData):
def isHtml (self):
    """Guess by file extension whether this FTP URL points to HTML."""
    return extensions['html'].search(self.url) is not None
def isParseable (self):
for ro in extensions.values():
if ro.search(self.url):
return True

View file

@ -369,6 +369,20 @@ class HttpUrlData (ProxyUrlData):
return True
def isParseable (self):
    """Check whether this HTTP response can be parsed for links.

    Returns False for invalid responses, for content types other
    than HTML or CSS, and for unsupported content encodings (which
    additionally set a warning).
    """
    if not (self.valid and self.headers):
        return False
    # gettype() yields only the media type, without parameters.
    # Fixed: the original compared gettype()[:9] against the bogus
    # type "test/stylesheet" (a typo, and unreachable since a 9-char
    # slice can never equal a 15-char string); CSS is "text/css".
    if self.headers.gettype() not in ("text/html", "text/css"):
        return False
    encoding = self.headers.get("Content-Encoding")
    if encoding and encoding not in _supported_encodings and \
       encoding != 'identity':
        self.setWarning(i18n._('Unsupported content encoding %s.') %
                        repr(encoding))
        return False
    return True
def getRobotsTxtUrl (self):
    """Build the robots.txt URL for this URL's scheme and host."""
    scheme = self.urlparts[0]
    host = self.urlparts[1]
    return scheme + "://" + host + "/robots.txt"

View file

@ -212,6 +212,10 @@ class UrlData (object):
self.validString = i18n._("Valid")+": "+s
def isParseable (self):
    """Base implementation: a generic URL is never parseable.

    Subclasses override this for content types they can parse.
    """
    return False
def isHtml (self):
    """Base implementation: a generic URL is never considered HTML.

    Subclasses override this when they can detect HTML content.
    """
    return False
@ -342,8 +346,8 @@ class UrlData (object):
self.checkConnection()
if self.cached:
return
if self.anchor and self.config["anchors"]:
self.checkAnchors(self.anchor)
if self.config["anchors"]:
self.checkAnchors()
except tuple(ExcList):
type, value, tb = sys.exc_info()
debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
@ -417,10 +421,9 @@ class UrlData (object):
def allowsRecursion (self):
# note: isHtml() might not be working if valid is false, so be
# sure to test it first.
# note: test self.valid before self.isParseable()
return self.valid and \
self.isHtml() and \
self.isParseable() and \
self.hasContent() and \
not self.cached and \
(self.config["recursionlevel"] >= 0 and
@ -428,15 +431,17 @@ class UrlData (object):
not self.extern[0]
def checkAnchors (self):
    """Verify that self.anchor exists in this URL's document.

    Sets a warning if the anchor is not found. Does nothing when the
    URL is invalid, has no anchor, is not HTML, or has no content.

    Note: this span interleaved the pre- and post-change variants of
    the method (diff markers were lost); this is the post-change
    version, which reads the anchor from self.anchor instead of
    taking it as a parameter.
    """
    if not (self.valid and self.anchor and self.isHtml() and \
            self.hasContent()):
        # do not bother
        return
    debug(HURT_ME_PLENTY, "checking anchor", self.anchor)
    h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']})
    for cur_anchor, line, column, name, base in h.urls:
        if cur_anchor == self.anchor:
            return
    self.setWarning(i18n._("anchor #%s not found") % self.anchor)
def _getExtern (self):
@ -581,6 +586,15 @@ class UrlData (object):
self.config, self.url, None, lineno, ""))
def parse_css (self):
    """parse a CSS file for url() patterns"""
    for zero_based, line in enumerate(self.getContent().splitlines()):
        lineno = zero_based + 1  # 1-based line number for future reporting
        # XXX todo: css url pattern matching
def __str__ (self):
return ("%s link\n"
"urlname=%s\n"

View file

@ -39,6 +39,7 @@ def getLinkPat (arg, strict=False):
# Map content category -> compiled regex matched against the URL's
# file extension to guess that category.
extensions = {
"html": re.compile(r'(?i)\.s?html?$'), # HTML page (.htm/.html/.shtml)
"opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file
"css": re.compile(r'(?i)\.css$'), # CSS stylesheet
# "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'),
}