Full CSS parsing

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1300 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-04-04 09:30:10 +00:00
parent f4802fd467
commit e78a8ea539
4 changed files with 32 additions and 16 deletions

View file

@ -135,10 +135,10 @@ class FileUrlData (UrlData):
def parseUrl (self):
    """Dispatch to the matching parse_<key> handler.

    First the URL itself is matched against the known extension
    patterns; failing that, the first 20 bytes of the content are
    sniffed against the known content patterns.  Returns whatever the
    handler returns, or None when nothing matched.
    """
    for key, ro in extensions.items():
        if ro.search(self.url):
            return getattr(self, "parse_"+key)()
    for key, ro in contents.items():
        # content sniffing: only look at the very start of the data
        if ro.search(self.getContent()[:20]):
            return getattr(self, "parse_"+key)()
    return None

View file

@ -393,13 +393,17 @@ class HttpUrlData (ProxyUrlData):
return True
def getContentType (self):
    """Return the MIME type from the Content-Type response header.

    Any parameters following a semicolon (e.g. "; charset=...") are
    stripped; when the header is missing, the generic
    application/octet-stream type is returned.
    """
    content_type = self.headers.get('Content-Type',
                                    'application/octet-stream')
    # keep only the media type, dropping ";charset=..." parameters
    return content_type.partition(';')[0]
def isParseable (self):
if not (self.valid and self.headers):
return False
ptype = self.headers.gettype()
if ";" in ptype:
ptype = ptype.split(';')[0]
if ptype not in ("text/html", "text/stylesheet"):
if self.getContentType() not in ("text/html", "text/css"):
return False
encoding = self.headers.get("Content-Encoding")
if encoding and encoding not in _supported_encodings and \
@ -409,6 +413,15 @@ class HttpUrlData (ProxyUrlData):
return True
def parseUrl (self):
    """Parse the fetched document according to its MIME type.

    HTML and CSS documents are handed to the matching parser method;
    any other type is silently ignored.  Always returns None.
    """
    # dispatch table keyed by media type
    parsers = {
        "text/html": self.parse_html,
        "text/css": self.parse_css,
    }
    parser = parsers.get(self.getContentType())
    if parser is not None:
        parser()
    return None
def getRobotsTxtUrl (self):
    """Return the robots.txt URL built from this URL's scheme and host."""
    scheme = self.urlparts[0]
    host = self.urlparts[1]
    return "%s://%s/robots.txt" % (scheme, host)

View file

@ -23,7 +23,7 @@ from linkcheck.parser import htmlsax
DNS.DiscoverNameServers()
import Config, StringUtil, test_support
from linkparse import LinkFinder, MetaRobotsFinder
from linkparse import LinkFinder, MetaRobotsFinder, css_url_re
from debug import *
ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search
@ -634,22 +634,24 @@ class UrlData (object):
UNUSED and UNTESTED, just use linkchecker `cat file.txt`
"""
lineno = 0
lines = self.getContent().splitlines()
for line in lines:
for line in self.getContent().splitlines():
lineno += 1
line = line.strip()
if not line or line.startswith('#'): continue
self.config.appendUrl(GetUrlDataFrom(line, self.recursionLevel+1,
self.config, self.url, None, lineno, ""))
self.config, parentName=self.url, line=lineno))
def parse_css (self):
    """Parse a CSS file for url(...) patterns.

    Every URL referenced from the stylesheet is queued for checking
    via self.config.appendUrl, recording the line number and the
    column of the url inside that line for error reporting.
    """
    lineno = 0
    for line in self.getContent().splitlines():
        lineno += 1
        for mo in css_url_re.finditer(line):
            # column where the url text starts, for precise reporting
            column = mo.start("url")
            self.config.appendUrl(GetUrlDataFrom(mo.group("url"),
                self.recursionLevel+1, self.config,
                parentName=self.url, line=lineno, column=column))
def __str__ (self):

View file

@ -53,10 +53,10 @@ LinkTags = {
# matcher for <meta http-equiv=refresh> tags
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
_css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
class TagFinder (object):
"""base class putting message in list"""
"""base class storing parse messages in a list"""
def __init__ (self, content):
self.content = content
# warnings and errors during parsing
@ -151,7 +151,7 @@ class LinkFinder (TagFinder):
if mo:
urls.append(mo.group("url"))
elif attr=='style':
for mo in _css_url_re.finditer(url):
for mo in css_url_re.finditer(url):
urls.append(mo.group("url"))
else:
urls.append(url)
@ -163,3 +163,4 @@ class LinkFinder (TagFinder):
self.urls.append((u, self.parser.last_lineno(),
self.parser.last_column(), name, base))