Use BeautifulSoup element attrs directly

2026-05-08 22:54:51 +00:00 · 2020-04-03 19:24:08 +01:00 · 2020-04-03 19:24:08 +01:00 · 3ff3d72492
commit 3ff3d72492
parent a7e1e20172
7 changed files with 19 additions and 176 deletions
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -26,8 +26,6 @@ filterwarnings("ignore",

 from bs4 import BeautifulSoup, Tag

-from ..containers import ListDict
-

 class Parser(object):
    handler = None
@@ -57,26 +55,16 @@ class Parser(object):
    def parse_contents(self, contents):
        for content in contents:
            if isinstance(content, Tag):
-                attrs = ListDict()
-                for k, v_list in sorted(content.attrs.items()):
-                    if not isinstance(v_list, list):
-                        v_list = [v_list]
-                    for v in v_list:
-                        # empty parameters returned by BS4
-                        # are sometimes in bytes:
-                        if v == b'':
-                            v = u''
-                        attrs[k] = v
                self.tag_lineno = content.sourceline
                self.tag_column = None if content.sourcepos is None \
                    else content.sourcepos + 1
                if content.is_empty_element:
                    self.handler.start_end_element(
-                        content.name, attrs, content.text.strip(),
+                        content.name, content.attrs, content.text.strip(),
                    )
                else:
                    self.handler.start_element(
-                        content.name, attrs, content.text.strip(),
+                        content.name, content.attrs, content.text.strip(),
                    )
                    if hasattr(content, 'contents'):  # recursion
                        self.parse_contents(content.contents)
@@ -85,7 +73,8 @@ class Parser(object):

    def flush(self):
        if self.soup is None:
-            self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
+            self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
+                                      multi_valued_attributes=None)
        if hasattr(self.soup, 'contents'):
            self.parse_contents(self.soup.contents)
        self.encoding = self.soup.original_encoding
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -310,6 +310,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
        if self.text is None:
            self.get_raw_content()
            self.soup = BeautifulSoup(self.data, "html.parser",
+                                      multi_valued_attributes=None,
                                      from_encoding=self.encoding)
            self.text = self.data.decode(self.soup.original_encoding)
        return self.text
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -657,7 +657,8 @@ class UrlBase (object):
    def get_content (self):
        if self.text is None:
            self.get_raw_content()
-            self.soup = BeautifulSoup(self.data, "html.parser")
+            self.soup = BeautifulSoup(self.data, "html.parser",
+                                      multi_valued_attributes=None)
            self.text = self.data.decode(self.soup.original_encoding)
            self.encoding = self.soup.original_encoding
        return self.text
--- a/linkcheck/containers.py
+++ b/linkcheck/containers.py
@@ -30,90 +30,6 @@ class AttrDict (dict):
        return self[name]


-class ListDict (dict):
-    """A dictionary whose iterators reflect the order in which elements
-    were added.
-    """
-
-    def __init__ (self):
-        """Initialize sorted key list."""
-        super(ListDict, self).__init__()
-        # sorted list of keys
-        self._keys = []
-
-    def setdefault (self, key, *args):
-        """Remember key order if key not found."""
-        if key not in self:
-            self._keys.append(key)
-        return super(ListDict, self).setdefault(key, *args)
-
-    def __setitem__ (self, key, value):
-        """Add key,value to dict, append key to sorted list."""
-        if key not in self:
-            self._keys.append(key)
-        super(ListDict, self).__setitem__(key, value)
-
-    def __delitem__ (self, key):
-        """Remove key from dict."""
-        self._keys.remove(key)
-        super(ListDict, self).__delitem__(key)
-
-    def pop (self, key):
-        """Remove key from dict and return value."""
-        if key in self._keys:
-            self._keys.remove(key)
-        super(ListDict, self).pop(key)
-
-    def popitem (self):
-        """Remove oldest key from dict and return item."""
-        if self._keys:
-            k = self._keys[0]
-            v = self[k]
-            del self[k]
-            return (k, v)
-        raise KeyError("popitem() on empty dictionary")
-
-    def values (self):
-        """Return sorted list of values."""
-        return [self[k] for k in self._keys]
-
-    def items (self):
-        """Return sorted list of items."""
-        return [(k, self[k]) for k in self._keys]
-
-    def keys (self):
-        """Return sorted list of keys."""
-        return self._keys[:]
-
-    def itervalues (self):
-        """Return iterator over sorted values."""
-        for k in self._keys:
-            yield self[k]
-
-    def iteritems (self):
-        """Return iterator over sorted items."""
-        for k in self._keys:
-            yield (k, self[k])
-
-    def iterkeys (self):
-        """Return iterator over sorted keys."""
-        return iter(self._keys)
-
-    def clear (self):
-        """Remove all dict entries."""
-        self._keys = []
-        super(ListDict, self).clear()
-
-    def get_true (self, key, default):
-        """Return default element if key is not in the dict, or if self[key]
-        evaluates to False. Useful for example if value is None, but
-        default value should be an empty string.
-        """
-        if key not in self or not self[key]:
-            return default
-        return self[key]
-
-
 class CaselessDict (dict):
    """A dictionary ignoring the case of keys (which must be strings)."""

--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -130,7 +130,7 @@ class MetaRobotsFinder (TagFinder):
    def start_element (self, tag, attrs, element_text=None):
        """Search for meta robots.txt "nofollow" and "noindex" flags."""
        if tag == 'meta' and attrs.get('name') == 'robots':
-            val = attrs.get_true('content', u'').lower().split(u',')
+            val = attrs.get('content', u'').lower().split(u',')
            self.follow = u'nofollow' not in val
            self.index = u'noindex' not in val
            raise StopParse("found <meta name=robots> tag")
@@ -142,11 +142,11 @@ def is_meta_url (attr, attrs):
    """Check if the meta attributes contain a URL."""
    res = False
    if attr == "content":
-        equiv = attrs.get_true('http-equiv', u'').lower()
-        scheme = attrs.get_true('scheme', u'').lower()
+        equiv = attrs.get('http-equiv', u'').lower()
+        scheme = attrs.get('scheme', u'').lower()
        res = equiv in (u'refresh',) or scheme in (u'dcterms.uri',)
    if attr == "href":
-        rel = attrs.get_true('rel', u'').lower()
+        rel = attrs.get('rel', u'').lower()
        res = rel in (u'shortcut icon', u'icon')
    return res

@@ -155,7 +155,7 @@ def is_form_get(attr, attrs):
    """Check if this is a GET form action URL."""
    res = False
    if attr == "action":
-        method = attrs.get_true('method', u'').lower()
+        method = attrs.get('method', u'').lower()
        res = method != 'post'
    return res

@@ -182,7 +182,7 @@ class LinkFinder (TagFinder):
        log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
        log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column())
        if tag == "base" and not self.base_ref:
-            self.base_ref = attrs.get_true("href", u'')
+            self.base_ref = attrs.get("href", u'')
        tagattrs = self.tags.get(tag, self.universal_attrs)
        # parse URLs in tag (possibly multiple URLs in CSS styles)
        for attr in sorted(tagattrs.intersection(attrs)):
@@ -195,7 +195,7 @@ class LinkFinder (TagFinder):
            # possible codebase
            base = u''
            if tag  == 'applet':
-                base = attrs.get_true('codebase', u'')
+                base = attrs.get('codebase', u'')
            if not base:
                base = self.base_ref
            # note: value can be None
@@ -212,11 +212,11 @@ class LinkFinder (TagFinder):
        """Parse attrs for link name. Return name of link."""
        if tag == 'a' and attr == 'href':
            if not name:
-                name = attrs.get_true('title', u'')
+                name = attrs.get('title', u'')
        elif tag == 'img':
-            name = attrs.get_true('alt', u'')
+            name = attrs.get('alt', u'')
            if not name:
-                name = attrs.get_true('title', u'')
+                name = attrs.get('title', u'')
        else:
            name = u""
        return name
--- a/tests/htmllib.py
+++ b/tests/htmllib.py
@@ -114,7 +114,7 @@ class HtmlPrettyPrinter:
        @return: None
        """
        self.fd.write("<%s" % tag.replace("/", ""))
-        for key, val in attrs.items():
+        for key, val in sorted(attrs.items()):
            if val is None:
                self.fd.write(" %s" % key)
            else:
--- a/tests/test_containers.py
+++ b/tests/test_containers.py
@@ -19,7 +19,7 @@ Test container routines.
 """

 import unittest
-import random
+
 import linkcheck.containers

 from builtins import range
@@ -39,70 +39,6 @@ class TestAttrDict (unittest.TestCase):
        self.assertTrue(isinstance(self.d.get, type({}.get)))


-class TestListDict (unittest.TestCase):
-    """Test list dictionary routines."""
-
-    def setUp (self):
-        """Set up self.d as empty listdict."""
-        self.d = linkcheck.containers.ListDict()
-
-    def test_insertion_order (self):
-        self.assertTrue(not self.d)
-        self.d[2] = 1
-        self.d[1] = 2
-        self.assertTrue(2 in self.d)
-        self.assertTrue(1 in self.d)
-
-    def test_deletion_order (self):
-        self.assertTrue(not self.d)
-        self.d[2] = 1
-        self.d[1] = 2
-        del self.d[1]
-        self.assertTrue(2 in self.d)
-        self.assertTrue(1 not in self.d)
-
-    def test_update_order (self):
-        self.assertTrue(not self.d)
-        self.d[2] = 1
-        self.d[1] = 2
-        self.d[1] = 1
-        self.assertEqual(self.d[1], 1)
-
-    def test_sorting (self):
-        self.assertTrue(not self.d)
-        toinsert = random.sample(range(10000000), 60)
-        for x in toinsert:
-            self.d[x] = x
-        for i, k in enumerate(self.d.keys()):
-            self.assertEqual(self.d[k], toinsert[i])
-        for i, k in enumerate(self.d.iterkeys()):
-            self.assertEqual(self.d[k], toinsert[i])
-        for x in self.d.values():
-            self.assertTrue(x in toinsert)
-        for x in self.d.itervalues():
-            self.assertTrue(x in toinsert)
-        for x, y in self.d.items():
-            self.assertTrue(x in toinsert)
-            self.assertTrue(y in toinsert)
-        for x, y in self.d.iteritems():
-            self.assertTrue(x in toinsert)
-            self.assertTrue(y in toinsert)
-
-    def test_clear (self):
-        self.assertTrue(not self.d)
-        self.d[2] = 1
-        self.d[1] = 3
-        self.d.clear()
-        self.assertTrue(not self.d)
-
-    def test_get_true (self):
-        self.assertTrue(not self.d)
-        self.d["a"] = 0
-        self.d["b"] = 1
-        self.assertEqual(self.d.get_true("a", 2), 2)
-        self.assertEqual(self.d.get_true("b", 2), 1)
-
-
 class TestCaselessDict (unittest.TestCase):
    """Test caseless dictionary routines."""