diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index 52f00e4e..f8712833 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -432,7 +432,7 @@ class UrlData (object): debug(HURT_ME_PLENTY, "checking anchor", anchor) if not (self.valid and anchor and self.isHtml() and self.hasContent()): return - h = LinkParser(self.getContent(), {'a': ['name'], None: ['id']}) + h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']}) for cur_anchor,line,column,name,base in h.urls: if cur_anchor == anchor: return @@ -526,7 +526,7 @@ class UrlData (object): def parse_html (self): # search for a possible base reference - h = LinkParser(self.getContent(), {'base': ['href']}) + h = LinkParser(self.getContent(), tags={'base': ['href']}) baseRef = None if len(h.urls)>=1: baseRef = h.urls[0][0] diff --git a/linkcheck/linkparse.py b/linkcheck/linkparse.py index 6be9ce05..6d539a36 100644 --- a/linkcheck/linkparse.py +++ b/linkcheck/linkparse.py @@ -49,10 +49,12 @@ LinkTags = { 'th': ['background'], 'tr': ['background'], 'xmp': ['href'], + None: ['style'], } # matcher for tags _refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$") +_style_background_re = re.compile(r"background-image:\s*url\((?P<url>.+?)\)") class LinkParser (HtmlParser): """Parse the content for a list of links. 
After parsing, the urls @@ -76,8 +78,9 @@ class LinkParser (HtmlParser): debug(NIGHTMARE, "LinkParser tag", tag, "attrs", attrs) debug(NIGHTMARE, "line", self.lineno(), "col", self.column(), "old line", self.last_lineno(), "old col", self.last_column()) - tags = self.tags.get(tag, self.tags.get(None, [])) - for attr in tags: + tagattrs = list(self.tags.get(tag, [])) # copy: never mutate the shared tag table + tagattrs.extend(self.tags.get(None, [])) + for attr in tagattrs: if attr in attrs: # name of this link if tag=='a' and attr=='href': @@ -86,6 +89,8 @@ class LinkParser (HtmlParser): name = linkname.href_name(self.content[self.pos():]) elif tag=='img': name = StringUtil.unquote(attrs.get('alt', '')) + if not name: + name = StringUtil.unquote(attrs.get('title', '')) else: name = "" # possible codebase @@ -93,21 +98,27 @@ class LinkParser (HtmlParser): base = StringUtil.unquote(attrs.get('codebase')) else: base = "" - # add link to url list value = StringUtil.unquote(attrs[attr]) + # add link to url list self.addLink(tag, attr, value, name, base) def addLink (self, tag, attr, url, name, base): - debug(NIGHTMARE, "LinkParser add link", tag, attr, url, name, base) # look for meta refresh if tag=='meta': - metamatch = _refresh_re.match(url) - if metamatch: - url = metamatch.group("url") + mo = _refresh_re.match(url) + if mo: + url = mo.group("url") else: # only meta refresh has an url, so return return + elif attr=='style': + mo = _style_background_re.search(url) + if mo: + url = mo.group("url") + else: + return + debug(NIGHTMARE, "LinkParser add link", tag, attr, url, name, base) self.urls.append((url, self.last_lineno(), self.last_column(), name, base))