diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py index 38069cc1..2fe0f4a4 100644 --- a/linkcheck/FileUrlData.py +++ b/linkcheck/FileUrlData.py @@ -135,10 +135,10 @@ class FileUrlData (UrlData): def parseUrl (self): - for key,ro in extensions.items(): + for key, ro in extensions.items(): if ro.search(self.url): return getattr(self, "parse_"+key)() - for key,ro in contents.items(): + for key, ro in contents.items(): if ro.search(self.getContent()[:20]): return getattr(self, "parse_"+key)() return None diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py index 04798a0c..e1a76ee1 100644 --- a/linkcheck/HttpUrlData.py +++ b/linkcheck/HttpUrlData.py @@ -393,13 +393,17 @@ class HttpUrlData (ProxyUrlData): return True + def getContentType (self): + ptype = self.headers.get('Content-Type', 'application/octet-stream') + if ";" in ptype: + ptype = ptype.split(';')[0] + return ptype + + def isParseable (self): if not (self.valid and self.headers): return False - ptype = self.headers.gettype() - if ";" in ptype: - ptype = ptype.split(';')[0] - if ptype not in ("text/html", "text/stylesheet"): + if self.getContentType() not in ("text/html", "text/css"): return False encoding = self.headers.get("Content-Encoding") if encoding and encoding not in _supported_encodings and \ @@ -409,6 +413,15 @@ class HttpUrlData (ProxyUrlData): return True + def parseUrl (self): + ptype = self.getContentType() + if ptype=="text/html": + self.parse_html() + elif ptype=="text/css": + self.parse_css() + return None + + def getRobotsTxtUrl (self): return "%s://%s/robots.txt"%tuple(self.urlparts[0:2]) diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index e555c504..a1030d12 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -23,7 +23,7 @@ from linkcheck.parser import htmlsax DNS.DiscoverNameServers() import Config, StringUtil, test_support -from linkparse import LinkFinder, MetaRobotsFinder +from linkparse import LinkFinder, MetaRobotsFinder, css_url_re 
from debug import * ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search @@ -634,22 +634,24 @@ class UrlData (object): UNUSED and UNTESTED, just use linkchecker `cat file.txt` """ lineno = 0 - lines = self.getContent().splitlines() - for line in lines: + for line in self.getContent().splitlines(): lineno += 1 line = line.strip() if not line or line.startswith('#'): continue self.config.appendUrl(GetUrlDataFrom(line, self.recursionLevel+1, - self.config, self.url, None, lineno, "")) + self.config, parentName=self.url, line=lineno)) def parse_css (self): """parse a CSS file for url() patterns""" lineno = 0 - lines = self.getContent().splitlines() - for line in lines: + for line in self.getContent().splitlines(): lineno += 1 - # XXX todo: css url pattern matching + for mo in css_url_re.finditer(line): + column = mo.start("url") + self.config.appendUrl(GetUrlDataFrom(mo.group("url"), + self.recursionLevel+1, self.config, + parentName=self.url, line=lineno, column=column)) def __str__ (self): diff --git a/linkcheck/linkparse.py b/linkcheck/linkparse.py index b4413324..7a601dca 100644 --- a/linkcheck/linkparse.py +++ b/linkcheck/linkparse.py @@ -53,10 +53,10 @@ LinkTags = { # matcher for tags _refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$") -_css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)") +css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)") class TagFinder (object): - """base class putting message in list""" + """base class storing parse messages in a list""" def __init__ (self, content): self.content = content # warnings and errors during parsing @@ -151,7 +151,7 @@ class LinkFinder (TagFinder): if mo: urls.append(mo.group("url")) elif attr=='style': - for mo in _css_url_re.finditer(url): + for mo in css_url_re.finditer(url): urls.append(mo.group("url")) else: urls.append(url) @@ -163,3 +163,4 @@ class LinkFinder (TagFinder): self.urls.append((u, self.parser.last_lineno(), self.parser.last_column(), name, base)) +