diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index 1aa5c265..6b3f76ad 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -26,8 +26,6 @@ filterwarnings("ignore",
from bs4 import BeautifulSoup, Tag
-from ..containers import ListDict
-
class Parser(object):
handler = None
@@ -57,26 +55,16 @@ class Parser(object):
def parse_contents(self, contents):
for content in contents:
if isinstance(content, Tag):
- attrs = ListDict()
- for k, v_list in sorted(content.attrs.items()):
- if not isinstance(v_list, list):
- v_list = [v_list]
- for v in v_list:
- # empty parameters returned by BS4
- # are sometimes in bytes:
- if v == b'':
- v = u''
- attrs[k] = v
self.tag_lineno = content.sourceline
self.tag_column = None if content.sourcepos is None \
else content.sourcepos + 1
if content.is_empty_element:
self.handler.start_end_element(
- content.name, attrs, content.text.strip(),
+ content.name, content.attrs, content.text.strip(),
)
else:
self.handler.start_element(
- content.name, attrs, content.text.strip(),
+ content.name, content.attrs, content.text.strip(),
)
if hasattr(content, 'contents'): # recursion
self.parse_contents(content.contents)
@@ -85,7 +73,8 @@ class Parser(object):
def flush(self):
if self.soup is None:
- self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
+ self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
+ multi_valued_attributes=None)
if hasattr(self.soup, 'contents'):
self.parse_contents(self.soup.contents)
self.encoding = self.soup.original_encoding
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 167133ed..9e6459ef 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -310,6 +310,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
if self.text is None:
self.get_raw_content()
self.soup = BeautifulSoup(self.data, "html.parser",
+ multi_valued_attributes=None,
from_encoding=self.encoding)
self.text = self.data.decode(self.soup.original_encoding)
return self.text
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index 8268453d..ca924ad3 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -657,7 +657,8 @@ class UrlBase (object):
def get_content (self):
if self.text is None:
self.get_raw_content()
- self.soup = BeautifulSoup(self.data, "html.parser")
+ self.soup = BeautifulSoup(self.data, "html.parser",
+ multi_valued_attributes=None)
self.text = self.data.decode(self.soup.original_encoding)
self.encoding = self.soup.original_encoding
return self.text
diff --git a/linkcheck/containers.py b/linkcheck/containers.py
index 2cbcf253..2e8706b7 100644
--- a/linkcheck/containers.py
+++ b/linkcheck/containers.py
@@ -30,90 +30,6 @@ class AttrDict (dict):
return self[name]
-class ListDict (dict):
- """A dictionary whose iterators reflect the order in which elements
- were added.
- """
-
- def __init__ (self):
- """Initialize sorted key list."""
- super(ListDict, self).__init__()
- # sorted list of keys
- self._keys = []
-
- def setdefault (self, key, *args):
- """Remember key order if key not found."""
- if key not in self:
- self._keys.append(key)
- return super(ListDict, self).setdefault(key, *args)
-
- def __setitem__ (self, key, value):
- """Add key,value to dict, append key to sorted list."""
- if key not in self:
- self._keys.append(key)
- super(ListDict, self).__setitem__(key, value)
-
- def __delitem__ (self, key):
- """Remove key from dict."""
- self._keys.remove(key)
- super(ListDict, self).__delitem__(key)
-
- def pop (self, key):
- """Remove key from dict and return value."""
- if key in self._keys:
- self._keys.remove(key)
- super(ListDict, self).pop(key)
-
- def popitem (self):
- """Remove oldest key from dict and return item."""
- if self._keys:
- k = self._keys[0]
- v = self[k]
- del self[k]
- return (k, v)
- raise KeyError("popitem() on empty dictionary")
-
- def values (self):
- """Return sorted list of values."""
- return [self[k] for k in self._keys]
-
- def items (self):
- """Return sorted list of items."""
- return [(k, self[k]) for k in self._keys]
-
- def keys (self):
- """Return sorted list of keys."""
- return self._keys[:]
-
- def itervalues (self):
- """Return iterator over sorted values."""
- for k in self._keys:
- yield self[k]
-
- def iteritems (self):
- """Return iterator over sorted items."""
- for k in self._keys:
- yield (k, self[k])
-
- def iterkeys (self):
- """Return iterator over sorted keys."""
- return iter(self._keys)
-
- def clear (self):
- """Remove all dict entries."""
- self._keys = []
- super(ListDict, self).clear()
-
- def get_true (self, key, default):
- """Return default element if key is not in the dict, or if self[key]
- evaluates to False. Useful for example if value is None, but
- default value should be an empty string.
- """
- if key not in self or not self[key]:
- return default
- return self[key]
-
-
class CaselessDict (dict):
"""A dictionary ignoring the case of keys (which must be strings)."""
diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py
index b60bfa14..e5295817 100644
--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -130,7 +130,7 @@ class MetaRobotsFinder (TagFinder):
def start_element (self, tag, attrs, element_text=None):
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
if tag == 'meta' and attrs.get('name') == 'robots':
- val = attrs.get_true('content', u'').lower().split(u',')
+ val = attrs.get('content', u'').lower().split(u',')
self.follow = u'nofollow' not in val
self.index = u'noindex' not in val
raise StopParse("found tag")
@@ -142,11 +142,11 @@ def is_meta_url (attr, attrs):
"""Check if the meta attributes contain a URL."""
res = False
if attr == "content":
- equiv = attrs.get_true('http-equiv', u'').lower()
- scheme = attrs.get_true('scheme', u'').lower()
+ equiv = attrs.get('http-equiv', u'').lower()
+ scheme = attrs.get('scheme', u'').lower()
res = equiv in (u'refresh',) or scheme in (u'dcterms.uri',)
if attr == "href":
- rel = attrs.get_true('rel', u'').lower()
+ rel = attrs.get('rel', u'').lower()
res = rel in (u'shortcut icon', u'icon')
return res
@@ -155,7 +155,7 @@ def is_form_get(attr, attrs):
"""Check if this is a GET form action URL."""
res = False
if attr == "action":
- method = attrs.get_true('method', u'').lower()
+ method = attrs.get('method', u'').lower()
res = method != 'post'
return res
@@ -182,7 +182,7 @@ class LinkFinder (TagFinder):
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column())
if tag == "base" and not self.base_ref:
- self.base_ref = attrs.get_true("href", u'')
+ self.base_ref = attrs.get("href", u'')
tagattrs = self.tags.get(tag, self.universal_attrs)
# parse URLs in tag (possibly multiple URLs in CSS styles)
for attr in sorted(tagattrs.intersection(attrs)):
@@ -195,7 +195,7 @@ class LinkFinder (TagFinder):
# possible codebase
base = u''
if tag == 'applet':
- base = attrs.get_true('codebase', u'')
+ base = attrs.get('codebase', u'')
if not base:
base = self.base_ref
# note: value can be None
@@ -212,11 +212,11 @@ class LinkFinder (TagFinder):
"""Parse attrs for link name. Return name of link."""
if tag == 'a' and attr == 'href':
if not name:
- name = attrs.get_true('title', u'')
+ name = attrs.get('title', u'')
elif tag == 'img':
- name = attrs.get_true('alt', u'')
+ name = attrs.get('alt', u'')
if not name:
- name = attrs.get_true('title', u'')
+ name = attrs.get('title', u'')
else:
name = u""
return name
diff --git a/tests/htmllib.py b/tests/htmllib.py
index ecf988d4..6f1c5b19 100644
--- a/tests/htmllib.py
+++ b/tests/htmllib.py
@@ -114,7 +114,7 @@ class HtmlPrettyPrinter:
@return: None
"""
self.fd.write("<%s" % tag.replace("/", ""))
- for key, val in attrs.items():
+ for key, val in sorted(attrs.items()):
if val is None:
self.fd.write(" %s" % key)
else:
diff --git a/tests/test_containers.py b/tests/test_containers.py
index 6f830c6c..a90c8f10 100644
--- a/tests/test_containers.py
+++ b/tests/test_containers.py
@@ -19,7 +19,7 @@ Test container routines.
"""
import unittest
-import random
+
import linkcheck.containers
from builtins import range
@@ -39,70 +39,6 @@ class TestAttrDict (unittest.TestCase):
self.assertTrue(isinstance(self.d.get, type({}.get)))
-class TestListDict (unittest.TestCase):
- """Test list dictionary routines."""
-
- def setUp (self):
- """Set up self.d as empty listdict."""
- self.d = linkcheck.containers.ListDict()
-
- def test_insertion_order (self):
- self.assertTrue(not self.d)
- self.d[2] = 1
- self.d[1] = 2
- self.assertTrue(2 in self.d)
- self.assertTrue(1 in self.d)
-
- def test_deletion_order (self):
- self.assertTrue(not self.d)
- self.d[2] = 1
- self.d[1] = 2
- del self.d[1]
- self.assertTrue(2 in self.d)
- self.assertTrue(1 not in self.d)
-
- def test_update_order (self):
- self.assertTrue(not self.d)
- self.d[2] = 1
- self.d[1] = 2
- self.d[1] = 1
- self.assertEqual(self.d[1], 1)
-
- def test_sorting (self):
- self.assertTrue(not self.d)
- toinsert = random.sample(range(10000000), 60)
- for x in toinsert:
- self.d[x] = x
- for i, k in enumerate(self.d.keys()):
- self.assertEqual(self.d[k], toinsert[i])
- for i, k in enumerate(self.d.iterkeys()):
- self.assertEqual(self.d[k], toinsert[i])
- for x in self.d.values():
- self.assertTrue(x in toinsert)
- for x in self.d.itervalues():
- self.assertTrue(x in toinsert)
- for x, y in self.d.items():
- self.assertTrue(x in toinsert)
- self.assertTrue(y in toinsert)
- for x, y in self.d.iteritems():
- self.assertTrue(x in toinsert)
- self.assertTrue(y in toinsert)
-
- def test_clear (self):
- self.assertTrue(not self.d)
- self.d[2] = 1
- self.d[1] = 3
- self.d.clear()
- self.assertTrue(not self.d)
-
- def test_get_true (self):
- self.assertTrue(not self.d)
- self.d["a"] = 0
- self.d["b"] = 1
- self.assertEqual(self.d.get_true("a", 2), 2)
- self.assertEqual(self.d.get_true("b", 2), 1)
-
-
class TestCaselessDict (unittest.TestCase):
"""Test caseless dictionary routines."""