diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index adb4f347..4b680873 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -187,16 +187,18 @@ class LinkFinder (TagFinder): """Find HTML links, and apply them to the callback function with the format (url, lineno, column, name, codebase).""" - def __init__ (self, callback, tags=None): + def __init__ (self, callback, tags): """Store content in buffer and initialize URL list.""" super(LinkFinder, self).__init__() self.callback = callback - if tags is None: - self.tags = LinkTags - else: - self.tags = tags + # set universal tag attributes using tagname None + self.universal_attrs = set(tags.get(None, [])) + self.tags = dict() + for tag, attrs in tags.items(): + self.tags[tag] = set(attrs) + # add universal tag attributes + self.tags[tag].update(self.universal_attrs) self.base_ref = u'' - log.debug(LOG_CHECK, "link finder") def start_element (self, tag, attrs): """Search for links and store found URLs in a list.""" @@ -204,15 +206,9 @@ class LinkFinder (TagFinder): log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column()) if tag == "base" and not self.base_ref: self.base_ref = unquote(attrs.get_true("href", u'')) - tagattrs = self.tags.get(tag, []) - # add universal tag attributes using tagname None - tagattrs.extend(self.tags.get(None, [])) - # eliminate duplicate tag attributes - tagattrs = set(tagattrs) + tagattrs = self.tags.get(tag, self.universal_attrs) # parse URLs in tag (possibly multiple URLs in CSS styles) - for attr in tagattrs: - if attr not in attrs: - continue + for attr in tagattrs.intersection(attrs): if tag == "meta" and not is_meta_url(attr, attrs): continue if tag == "form" and not is_form_get(attr, attrs): @@ -252,31 +248,31 @@ class LinkFinder (TagFinder): name = u"" return name - def parse_tag (self, tag, attr, url, name, base): + def parse_tag (self, tag, attr, value, name, base): """Add given url data to url list.""" assert isinstance(tag, unicode), repr(tag) assert isinstance(attr, unicode), repr(attr) assert isinstance(name, unicode), repr(name) assert isinstance(base, unicode), repr(base) - assert isinstance(url, unicode) or url is None, repr(url) - urls = [] + assert isinstance(value, unicode) or value is None, repr(value) # look for meta refresh - if tag == u'meta' and url: - mo = refresh_re.match(url) + if tag == u'meta' and value: + mo = refresh_re.match(value) if mo: - urls.append(mo.group("url")) + self.found_url(mo.group("url"), name, base) elif attr != 'content': - urls.append(url) - elif attr == u'style' and url: - for mo in css_url_re.finditer(url): - u = mo.group("url") - urls.append(unquote(u, matching=True)) + self.found_url(value, name, base) + elif attr == u'style' and value: + for mo in css_url_re.finditer(value): + url = unquote(mo.group("url"), matching=True) + self.found_url(url, name, base) elif attr == u'archive': - urls.extend(url.split(u',')) + for url in value.split(u','): + self.found_url(url, name, base) else: - urls.append(url) - for u in urls: - assert isinstance(u, unicode) or u is None, repr(u) - log.debug(LOG_CHECK, u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base) - self.callback(u, self.parser.last_lineno(), - self.parser.last_column(), name, base) + self.found_url(value, name, base) + + def found_url(self, url, name, base): + assert isinstance(url, unicode) or url is None, repr(url) + self.callback(url, self.parser.last_lineno(), + self.parser.last_column(), name, base) diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index fff2a9c2..d521f100 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -40,7 +40,7 @@ def parse_html (url_data): """Parse into HTML content and search for URLs to check. Found URLs are added to the URL queue. """ - find_links(url_data, url_data.add_url) + find_links(url_data, url_data.add_url, linkparse.LinkTags) def parse_opera (url_data): @@ -126,7 +126,7 @@ def parse_wml (url_data): """Parse into WML content and search for URLs to check. Found URLs are added to the URL queue. """ - find_links(url_data, url_data.add_url, tags=linkparse.WmlTags) + find_links(url_data, url_data.add_url, linkparse.WmlTags) def get_temp_filename (content): @@ -141,12 +141,12 @@ def get_temp_filename (content): return filename -def find_links (url_data, callback, tags=None): +def find_links (url_data, callback, tags): """Parse into content and search for URLs to check. Found URLs are added to the URL queue. """ # construct parser object - handler = linkparse.LinkFinder(callback, tags=tags) + handler = linkparse.LinkFinder(callback, tags) parser = htmlsax.parser(handler) if url_data.charset: parser.encoding = url_data.charset diff --git a/linkcheck/plugins/anchorcheck.py b/linkcheck/plugins/anchorcheck.py index 5d3c25ad..01f4461a 100644 --- a/linkcheck/plugins/anchorcheck.py +++ b/linkcheck/plugins/anchorcheck.py @@ -37,7 +37,7 @@ class AnchorCheck(_ContentPlugin): log.debug(LOG_PLUGIN, "checking content for invalid anchors") # list of parsed anchors self.anchors = [] - find_links(url_data, self.add_anchor, tags=linkparse.AnchorTags) + find_links(url_data, self.add_anchor, linkparse.AnchorTags) self.check_anchor(url_data) def add_anchor (self, url, line, column, name, base): diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py index 099e66a5..415857f2 100644 --- a/tests/test_linkparser.py +++ b/tests/test_linkparser.py @@ -30,7 +30,7 @@ class TestLinkparser (unittest.TestCase): def _test_one_link (self, content, url): self.count_url = 0 - h = linkparse.LinkFinder(self._test_one_url(url)) + h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags) p = linkcheck.HtmlParser.htmlsax.parser(h) h.parser = p try: @@ -52,7 +52,7 @@ class TestLinkparser (unittest.TestCase): def _test_no_link (self, content): def callback (url, line, column, name, base): self.assertTrue(False, 'URL %r found' % url) - h = linkparse.LinkFinder(callback) + h = linkparse.LinkFinder(callback, linkparse.LinkTags) p = linkcheck.HtmlParser.htmlsax.parser(h) h.parser = p try: