Simpler link parsing routine.

This commit is contained in:
Bastian Kleineidam 2014-03-27 19:49:17 +01:00
parent f180592cc4
commit b6b5c7a12e
4 changed files with 35 additions and 39 deletions

View file

@@ -187,16 +187,18 @@ class LinkFinder (TagFinder):
"""Find HTML links, and apply them to the callback function with the
format (url, lineno, column, name, codebase)."""
def __init__ (self, callback, tags=None):
def __init__ (self, callback, tags):
"""Store content in buffer and initialize URL list."""
super(LinkFinder, self).__init__()
self.callback = callback
if tags is None:
self.tags = LinkTags
else:
self.tags = tags
# set universal tag attributes using tagname None
self.universal_attrs = set(tags.get(None, []))
self.tags = dict()
for tag, attrs in tags.items():
self.tags[tag] = set(attrs)
# add universal tag attributes
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
log.debug(LOG_CHECK, "link finder")
def start_element (self, tag, attrs):
"""Search for links and store found URLs in a list."""
@@ -204,15 +206,9 @@ class LinkFinder (TagFinder):
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
if tag == "base" and not self.base_ref:
self.base_ref = unquote(attrs.get_true("href", u''))
tagattrs = self.tags.get(tag, [])
# add universal tag attributes using tagname None
tagattrs.extend(self.tags.get(None, []))
# eliminate duplicate tag attributes
tagattrs = set(tagattrs)
tagattrs = self.tags.get(tag, self.universal_attrs)
# parse URLs in tag (possibly multiple URLs in CSS styles)
for attr in tagattrs:
if attr not in attrs:
continue
for attr in tagattrs.intersection(attrs):
if tag == "meta" and not is_meta_url(attr, attrs):
continue
if tag == "form" and not is_form_get(attr, attrs):
@@ -252,31 +248,31 @@ class LinkFinder (TagFinder):
name = u""
return name
def parse_tag (self, tag, attr, url, name, base):
def parse_tag (self, tag, attr, value, name, base):
"""Add given url data to url list."""
assert isinstance(tag, unicode), repr(tag)
assert isinstance(attr, unicode), repr(attr)
assert isinstance(name, unicode), repr(name)
assert isinstance(base, unicode), repr(base)
assert isinstance(url, unicode) or url is None, repr(url)
urls = []
assert isinstance(value, unicode) or value is None, repr(value)
# look for meta refresh
if tag == u'meta' and url:
mo = refresh_re.match(url)
if tag == u'meta' and value:
mo = refresh_re.match(value)
if mo:
urls.append(mo.group("url"))
self.found_url(mo.group("url"), name, base)
elif attr != 'content':
urls.append(url)
elif attr == u'style' and url:
for mo in css_url_re.finditer(url):
u = mo.group("url")
urls.append(unquote(u, matching=True))
self.found_url(value, name, base)
elif attr == u'style' and value:
for mo in css_url_re.finditer(value):
url = unquote(mo.group("url"), matching=True)
self.found_url(url, name, base)
elif attr == u'archive':
urls.extend(url.split(u','))
for url in value.split(u','):
self.found_url(url, name, base)
else:
urls.append(url)
for u in urls:
assert isinstance(u, unicode) or u is None, repr(u)
log.debug(LOG_CHECK, u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
self.callback(u, self.parser.last_lineno(),
self.parser.last_column(), name, base)
self.found_url(value, name, base)
def found_url(self, url, name, base):
assert isinstance(url, unicode) or url is None, repr(url)
self.callback(url, self.parser.last_lineno(),
self.parser.last_column(), name, base)

View file

@@ -40,7 +40,7 @@ def parse_html (url_data):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url)
find_links(url_data, url_data.add_url, linkparse.LinkTags)
def parse_opera (url_data):
@@ -126,7 +126,7 @@ def parse_wml (url_data):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, tags=linkparse.WmlTags)
find_links(url_data, url_data.add_url, linkparse.WmlTags)
def get_temp_filename (content):
@@ -141,12 +141,12 @@ def get_temp_filename (content):
return filename
def find_links (url_data, callback, tags=None):
def find_links (url_data, callback, tags):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
# construct parser object
handler = linkparse.LinkFinder(callback, tags=tags)
handler = linkparse.LinkFinder(callback, tags)
parser = htmlsax.parser(handler)
if url_data.charset:
parser.encoding = url_data.charset

View file

@@ -37,7 +37,7 @@ class AnchorCheck(_ContentPlugin):
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
# list of parsed anchors
self.anchors = []
find_links(url_data, self.add_anchor, tags=linkparse.AnchorTags)
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
self.check_anchor(url_data)
def add_anchor (self, url, line, column, name, base):

View file

@@ -30,7 +30,7 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url))
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
p = linkcheck.HtmlParser.htmlsax.parser(h)
h.parser = p
try:
@@ -52,7 +52,7 @@ class TestLinkparser (unittest.TestCase):
def _test_no_link (self, content):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
p = linkcheck.HtmlParser.htmlsax.parser(h)
h.parser = p
try: