mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-13 17:13:11 +00:00
parse css files recursively
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1058 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
160237bd52
commit
e0a063104e
5 changed files with 53 additions and 11 deletions
|
|
@ -112,6 +112,14 @@ class FileUrlData (UrlData):
|
|||
|
||||
|
||||
def isHtml (self):
    """Guess whether this file is HTML: first by filename extension,
    then by sniffing the first 20 bytes of the content."""
    # `or` short-circuits, so the content is only read when the
    # extension alone is inconclusive (matches original behavior).
    return bool(extensions['html'].search(self.url) or
                contents['html'].search(self.getContent()[:20]))
|
||||
|
||||
|
||||
def isParseable (self):
|
||||
# guess by extension
|
||||
for ro in extensions.values():
|
||||
if ro.search(self.url):
|
||||
|
|
|
|||
|
|
@ -65,7 +65,12 @@ class FtpUrlData (ProxyUrlData):
|
|||
|
||||
|
||||
def isHtml (self):
    """Guess from the filename extension whether this FTP resource is
    an HTML document; the content itself is never inspected here."""
    return extensions['html'].search(self.url) is not None
|
||||
|
||||
|
||||
def isParseable (self):
|
||||
for ro in extensions.values():
|
||||
if ro.search(self.url):
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -369,6 +369,20 @@ class HttpUrlData (ProxyUrlData):
|
|||
return True
|
||||
|
||||
|
||||
def isParseable (self):
    """Return True if this HTTP response can be parsed for links.

    A response is parseable when it is valid, has headers, carries an
    HTML or CSS content type, and uses a supported content encoding.
    """
    if not (self.valid and self.headers):
        return False
    # Bug fix: the old check compared gettype()[:9] against
    # "test/stylesheet" — a 9-char slice can never equal a 15-char
    # string (and "test" was a typo for "text"), so CSS responses were
    # never accepted.  Compare the full MIME type instead; text/css is
    # the standard stylesheet content type.
    if self.headers.gettype() not in ("text/html", "text/css"):
        return False
    encoding = self.headers.get("Content-Encoding")
    # "identity" means no transformation and is always acceptable.
    if encoding and encoding not in _supported_encodings and \
       encoding != 'identity':
        self.setWarning(i18n._('Unsupported content encoding %s.') %
                        repr(encoding))
        return False
    return True
|
||||
|
||||
|
||||
def getRobotsTxtUrl (self):
    """Build the robots.txt URL from this URL's scheme and network
    location parts."""
    scheme, netloc = self.urlparts[0], self.urlparts[1]
    return "%s://%s/robots.txt" % (scheme, netloc)
|
||||
|
||||
|
|
|
|||
|
|
@ -212,6 +212,10 @@ class UrlData (object):
|
|||
self.validString = i18n._("Valid")+": "+s
|
||||
|
||||
|
||||
def isParseable (self):
    """Base-class default: a generic URL is never parsed for links.
    Subclasses for HTML/CSS-capable schemes override this."""
    return False
|
||||
|
||||
|
||||
def isHtml (self):
    """Base-class default: a generic URL is not considered HTML.
    Scheme-specific subclasses override this with real detection."""
    return False
|
||||
|
||||
|
|
@ -342,8 +346,8 @@ class UrlData (object):
|
|||
self.checkConnection()
|
||||
if self.cached:
|
||||
return
|
||||
if self.anchor and self.config["anchors"]:
|
||||
self.checkAnchors(self.anchor)
|
||||
if self.config["anchors"]:
|
||||
self.checkAnchors()
|
||||
except tuple(ExcList):
|
||||
type, value, tb = sys.exc_info()
|
||||
debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
|
||||
|
|
@ -417,10 +421,9 @@ class UrlData (object):
|
|||
|
||||
|
||||
def allowsRecursion (self):
|
||||
# note: isHtml() might not be working if valid is false, so be
|
||||
# sure to test it first.
|
||||
# note: test self.valid before self.isParseable()
|
||||
return self.valid and \
|
||||
self.isHtml() and \
|
||||
self.isParseable() and \
|
||||
self.hasContent() and \
|
||||
not self.cached and \
|
||||
(self.config["recursionlevel"] >= 0 and
|
||||
|
|
@ -428,15 +431,17 @@ class UrlData (object):
|
|||
not self.extern[0]
|
||||
|
||||
|
||||
def checkAnchors (self, anchor):
|
||||
debug(HURT_ME_PLENTY, "checking anchor", anchor)
|
||||
if not (self.valid and anchor and self.isHtml() and self.hasContent()):
|
||||
def checkAnchors (self):
|
||||
if not (self.valid and self.anchor and self.isHtml() and \
|
||||
self.hasContent()):
|
||||
# do not bother
|
||||
return
|
||||
debug(HURT_ME_PLENTY, "checking anchor", self.anchor)
|
||||
h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']})
|
||||
for cur_anchor,line,column,name,base in h.urls:
|
||||
if cur_anchor == anchor:
|
||||
if cur_anchor == self.anchor:
|
||||
return
|
||||
self.setWarning(i18n._("anchor #%s not found") % anchor)
|
||||
self.setWarning(i18n._("anchor #%s not found") % self.anchor)
|
||||
|
||||
|
||||
def _getExtern (self):
|
||||
|
|
@ -581,6 +586,15 @@ class UrlData (object):
|
|||
self.config, self.url, None, lineno, ""))
|
||||
|
||||
|
||||
def parse_css (self):
    """Parse a CSS file for url() patterns.

    Currently a placeholder: the content is walked line by line, but
    the actual url() extraction is still to be implemented.
    """
    line_number = 0
    for css_line in self.getContent().splitlines():
        line_number += 1
        # XXX todo: css url pattern matching
|
||||
|
||||
|
||||
def __str__ (self):
|
||||
return ("%s link\n"
|
||||
"urlname=%s\n"
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ def getLinkPat (arg, strict=False):
|
|||
# Filename patterns used to guess a resource's content type by extension.
extensions = {
    "html": re.compile(r'(?i)\.s?html?$'),   # .htm .html .shtm .shtml
    # Bug fix: the dot in "opera.adr" was unescaped, so the pattern also
    # matched names like "operaXadr"; the inline (?i) flag is moved to
    # the start of the pattern, the only placement modern `re` allows.
    "opera": re.compile(r'(?i)^opera\.adr$'),  # opera bookmark file
    "css": re.compile(r'(?i)\.css$'),        # CSS stylesheet
    # "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'),
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue