mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-09 00:50:58 +00:00
full css parsing
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1300 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
f4802fd467
commit
e78a8ea539
4 changed files with 32 additions and 16 deletions
|
|
@ -135,10 +135,10 @@ class FileUrlData (UrlData):
|
|||
|
||||
|
||||
def parseUrl (self):
|
||||
for key,ro in extensions.items():
|
||||
for key, ro in extensions.items():
|
||||
if ro.search(self.url):
|
||||
return getattr(self, "parse_"+key)()
|
||||
for key,ro in contents.items():
|
||||
for key, ro in contents.items():
|
||||
if ro.search(self.getContent()[:20]):
|
||||
return getattr(self, "parse_"+key)()
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -393,13 +393,17 @@ class HttpUrlData (ProxyUrlData):
|
|||
return True
|
||||
|
||||
|
||||
def getContentType (self):
|
||||
ptype = self.headers.get('Content-Type', 'application/octet-stream')
|
||||
if ";" in ptype:
|
||||
ptype = ptype.split(';')[0]
|
||||
return ptype
|
||||
|
||||
|
||||
def isParseable (self):
|
||||
if not (self.valid and self.headers):
|
||||
return False
|
||||
ptype = self.headers.gettype()
|
||||
if ";" in ptype:
|
||||
ptype = ptype.split(';')[0]
|
||||
if ptype not in ("text/html", "text/stylesheet"):
|
||||
if self.getContentType() not in ("text/html", "text/css"):
|
||||
return False
|
||||
encoding = self.headers.get("Content-Encoding")
|
||||
if encoding and encoding not in _supported_encodings and \
|
||||
|
|
@ -409,6 +413,15 @@ class HttpUrlData (ProxyUrlData):
|
|||
return True
|
||||
|
||||
|
||||
def parseUrl (self):
|
||||
ptype = self.getContentType()
|
||||
if ptype=="text/html":
|
||||
self.parse_html()
|
||||
elif ptype=="text/css":
|
||||
self.parse_css()
|
||||
return None
|
||||
|
||||
|
||||
def getRobotsTxtUrl (self):
|
||||
return "%s://%s/robots.txt"%tuple(self.urlparts[0:2])
|
||||
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ from linkcheck.parser import htmlsax
|
|||
DNS.DiscoverNameServers()
|
||||
|
||||
import Config, StringUtil, test_support
|
||||
from linkparse import LinkFinder, MetaRobotsFinder
|
||||
from linkparse import LinkFinder, MetaRobotsFinder, css_url_re
|
||||
from debug import *
|
||||
|
||||
ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search
|
||||
|
|
@ -634,22 +634,24 @@ class UrlData (object):
|
|||
UNUSED and UNTESTED, just use linkchecker `cat file.txt`
|
||||
"""
|
||||
lineno = 0
|
||||
lines = self.getContent().splitlines()
|
||||
for line in lines:
|
||||
for line in self.getContent().splitlines():
|
||||
lineno += 1
|
||||
line = line.strip()
|
||||
if not line or line.startswith('#'): continue
|
||||
self.config.appendUrl(GetUrlDataFrom(line, self.recursionLevel+1,
|
||||
self.config, self.url, None, lineno, ""))
|
||||
self.config, parentName=self.url, line=lineno))
|
||||
|
||||
|
||||
def parse_css (self):
|
||||
"""parse a CSS file for url() patterns"""
|
||||
lineno = 0
|
||||
lines = self.getContent().splitlines()
|
||||
for line in lines:
|
||||
for line in self.getContent().splitlines():
|
||||
lineno += 1
|
||||
# XXX todo: css url pattern matching
|
||||
for mo in css_url_re.finditer(line):
|
||||
column = mo.start("url")
|
||||
self.config.appendUrl(GetUrlDataFrom(mo.group("url"),
|
||||
self.recursionLevel+1, self.config,
|
||||
parentName=self.url, line=lineno, column=column))
|
||||
|
||||
|
||||
def __str__ (self):
|
||||
|
|
|
|||
|
|
@ -53,10 +53,10 @@ LinkTags = {
|
|||
|
||||
# matcher for <meta http-equiv=refresh> tags
|
||||
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
|
||||
_css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
|
||||
css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
|
||||
|
||||
class TagFinder (object):
|
||||
"""base class putting message in list"""
|
||||
"""base class storing parse messages in a list"""
|
||||
def __init__ (self, content):
|
||||
self.content = content
|
||||
# warnings and errors during parsing
|
||||
|
|
@ -151,7 +151,7 @@ class LinkFinder (TagFinder):
|
|||
if mo:
|
||||
urls.append(mo.group("url"))
|
||||
elif attr=='style':
|
||||
for mo in _css_url_re.finditer(url):
|
||||
for mo in css_url_re.finditer(url):
|
||||
urls.append(mo.group("url"))
|
||||
else:
|
||||
urls.append(url)
|
||||
|
|
@ -163,3 +163,4 @@ class LinkFinder (TagFinder):
|
|||
self.urls.append((u, self.parser.last_lineno(),
|
||||
self.parser.last_column(), name, base))
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue