diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py
index 38069cc1..2fe0f4a4 100644
--- a/linkcheck/FileUrlData.py
+++ b/linkcheck/FileUrlData.py
@@ -135,10 +135,10 @@ class FileUrlData (UrlData):
def parseUrl (self):
- for key,ro in extensions.items():
+ for key, ro in extensions.items():
if ro.search(self.url):
return getattr(self, "parse_"+key)()
- for key,ro in contents.items():
+ for key, ro in contents.items():
if ro.search(self.getContent()[:20]):
return getattr(self, "parse_"+key)()
return None
diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py
index 04798a0c..e1a76ee1 100644
--- a/linkcheck/HttpUrlData.py
+++ b/linkcheck/HttpUrlData.py
@@ -393,13 +393,17 @@ class HttpUrlData (ProxyUrlData):
return True
+ def getContentType (self):
+ ptype = self.headers.get('Content-Type', 'application/octet-stream')
+ if ";" in ptype:
+ ptype = ptype.split(';')[0]
+ return ptype
+
+
def isParseable (self):
if not (self.valid and self.headers):
return False
- ptype = self.headers.gettype()
- if ";" in ptype:
- ptype = ptype.split(';')[0]
- if ptype not in ("text/html", "text/stylesheet"):
+ if self.getContentType() not in ("text/html", "text/css"):
return False
encoding = self.headers.get("Content-Encoding")
if encoding and encoding not in _supported_encodings and \
@@ -409,6 +413,15 @@ class HttpUrlData (ProxyUrlData):
return True
+ def parseUrl (self):
+ ptype = self.getContentType()
+ if ptype=="text/html":
+ self.parse_html()
+ elif ptype=="text/css":
+ self.parse_css()
+ return None
+
+
def getRobotsTxtUrl (self):
return "%s://%s/robots.txt"%tuple(self.urlparts[0:2])
diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py
index e555c504..a1030d12 100644
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -23,7 +23,7 @@ from linkcheck.parser import htmlsax
DNS.DiscoverNameServers()
import Config, StringUtil, test_support
-from linkparse import LinkFinder, MetaRobotsFinder
+from linkparse import LinkFinder, MetaRobotsFinder, css_url_re
from debug import *
ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search
@@ -634,22 +634,24 @@ class UrlData (object):
UNUSED and UNTESTED, just use linkchecker `cat file.txt`
"""
lineno = 0
- lines = self.getContent().splitlines()
- for line in lines:
+ for line in self.getContent().splitlines():
lineno += 1
line = line.strip()
if not line or line.startswith('#'): continue
self.config.appendUrl(GetUrlDataFrom(line, self.recursionLevel+1,
- self.config, self.url, None, lineno, ""))
+ self.config, parentName=self.url, line=lineno))
def parse_css (self):
"""parse a CSS file for url() patterns"""
lineno = 0
- lines = self.getContent().splitlines()
- for line in lines:
+ for line in self.getContent().splitlines():
lineno += 1
- # XXX todo: css url pattern matching
+ for mo in css_url_re.finditer(line):
+ column = mo.start("url")
+ self.config.appendUrl(GetUrlDataFrom(mo.group("url"),
+ self.recursionLevel+1, self.config,
+ parentName=self.url, line=lineno, column=column))
def __str__ (self):
diff --git a/linkcheck/linkparse.py b/linkcheck/linkparse.py
index b4413324..7a601dca 100644
--- a/linkcheck/linkparse.py
+++ b/linkcheck/linkparse.py
@@ -53,10 +53,10 @@ LinkTags = {
# matcher for tags
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
-_css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
+css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
class TagFinder (object):
- """base class putting message in list"""
+ """base class storing parse messages in a list"""
def __init__ (self, content):
self.content = content
# warnings and errors during parsing
@@ -151,7 +151,7 @@ class LinkFinder (TagFinder):
if mo:
urls.append(mo.group("url"))
elif attr=='style':
- for mo in _css_url_re.finditer(url):
+ for mo in css_url_re.finditer(url):
urls.append(mo.group("url"))
else:
urls.append(url)
@@ -163,3 +163,4 @@ class LinkFinder (TagFinder):
self.urls.append((u, self.parser.last_lineno(),
self.parser.last_column(), name, base))
+