diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 1aa5c265..6b3f76ad 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -26,8 +26,6 @@ filterwarnings("ignore", from bs4 import BeautifulSoup, Tag -from ..containers import ListDict - class Parser(object): handler = None @@ -57,26 +55,16 @@ class Parser(object): def parse_contents(self, contents): for content in contents: if isinstance(content, Tag): - attrs = ListDict() - for k, v_list in sorted(content.attrs.items()): - if not isinstance(v_list, list): - v_list = [v_list] - for v in v_list: - # empty parameters returned by BS4 - # are sometimes in bytes: - if v == b'': - v = u'' - attrs[k] = v self.tag_lineno = content.sourceline self.tag_column = None if content.sourcepos is None \ else content.sourcepos + 1 if content.is_empty_element: self.handler.start_end_element( - content.name, attrs, content.text.strip(), + content.name, content.attrs, content.text.strip(), ) else: self.handler.start_element( - content.name, attrs, content.text.strip(), + content.name, content.attrs, content.text.strip(), ) if hasattr(content, 'contents'): # recursion self.parse_contents(content.contents) @@ -85,7 +73,8 @@ class Parser(object): def flush(self): if self.soup is None: - self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser') + self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser', + multi_valued_attributes=None) if hasattr(self.soup, 'contents'): self.parse_contents(self.soup.contents) self.encoding = self.soup.original_encoding diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 167133ed..9e6459ef 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -310,6 +310,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): if self.text is None: self.get_raw_content() self.soup = BeautifulSoup(self.data, "html.parser", + multi_valued_attributes=None, from_encoding=self.encoding) self.text = self.data.decode(self.soup.original_encoding) return self.text diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 8268453d..ca924ad3 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -657,7 +657,8 @@ class UrlBase (object): def get_content (self): if self.text is None: self.get_raw_content() - self.soup = BeautifulSoup(self.data, "html.parser") + self.soup = BeautifulSoup(self.data, "html.parser", + multi_valued_attributes=None) self.text = self.data.decode(self.soup.original_encoding) self.encoding = self.soup.original_encoding return self.text diff --git a/linkcheck/containers.py b/linkcheck/containers.py index 2cbcf253..2e8706b7 100644 --- a/linkcheck/containers.py +++ b/linkcheck/containers.py @@ -30,90 +30,6 @@ class AttrDict (dict): return self[name] -class ListDict (dict): - """A dictionary whose iterators reflect the order in which elements - were added. - """ - - def __init__ (self): - """Initialize sorted key list.""" - super(ListDict, self).__init__() - # sorted list of keys - self._keys = [] - - def setdefault (self, key, *args): - """Remember key order if key not found.""" - if key not in self: - self._keys.append(key) - return super(ListDict, self).setdefault(key, *args) - - def __setitem__ (self, key, value): - """Add key,value to dict, append key to sorted list.""" - if key not in self: - self._keys.append(key) - super(ListDict, self).__setitem__(key, value) - - def __delitem__ (self, key): - """Remove key from dict.""" - self._keys.remove(key) - super(ListDict, self).__delitem__(key) - - def pop (self, key): - """Remove key from dict and return value.""" - if key in self._keys: - self._keys.remove(key) - super(ListDict, self).pop(key) - - def popitem (self): - """Remove oldest key from dict and return item.""" - if self._keys: - k = self._keys[0] - v = self[k] - del self[k] - return (k, v) - raise KeyError("popitem() on empty dictionary") - - def values (self): - """Return sorted list of values.""" - return [self[k] for k in self._keys] - - def items (self): - """Return sorted list of items.""" - return [(k, self[k]) for k in self._keys] - - def keys (self): - """Return sorted list of keys.""" - return self._keys[:] - - def itervalues (self): - """Return iterator over sorted values.""" - for k in self._keys: - yield self[k] - - def iteritems (self): - """Return iterator over sorted items.""" - for k in self._keys: - yield (k, self[k]) - - def iterkeys (self): - """Return iterator over sorted keys.""" - return iter(self._keys) - - def clear (self): - """Remove all dict entries.""" - self._keys = [] - super(ListDict, self).clear() - - def get_true (self, key, default): - """Return default element if key is not in the dict, or if self[key] - evaluates to False. Useful for example if value is None, but - default value should be an empty string. - """ - if key not in self or not self[key]: - return default - return self[key] - - class CaselessDict (dict): """A dictionary ignoring the case of keys (which must be strings).""" diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index b60bfa14..e5295817 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -130,7 +130,7 @@ class MetaRobotsFinder (TagFinder): def start_element (self, tag, attrs, element_text=None): """Search for meta robots.txt "nofollow" and "noindex" flags.""" if tag == 'meta' and attrs.get('name') == 'robots': - val = attrs.get_true('content', u'').lower().split(u',') + val = attrs.get('content', u'').lower().split(u',') self.follow = u'nofollow' not in val self.index = u'noindex' not in val raise StopParse("found tag") @@ -142,11 +142,11 @@ def is_meta_url (attr, attrs): """Check if the meta attributes contain a URL.""" res = False if attr == "content": - equiv = attrs.get_true('http-equiv', u'').lower() - scheme = attrs.get_true('scheme', u'').lower() + equiv = attrs.get('http-equiv', u'').lower() + scheme = attrs.get('scheme', u'').lower() res = equiv in (u'refresh',) or scheme in (u'dcterms.uri',) if attr == "href": - rel = attrs.get_true('rel', u'').lower() + rel = attrs.get('rel', u'').lower() res = rel in (u'shortcut icon', u'icon') return res @@ -155,7 +155,7 @@ def is_form_get(attr, attrs): """Check if this is a GET form action URL.""" res = False if attr == "action": - method = attrs.get_true('method', u'').lower() + method = attrs.get('method', u'').lower() res = method != 'post' return res @@ -182,7 +182,7 @@ class LinkFinder (TagFinder): log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column()) if tag == "base" and not self.base_ref: - self.base_ref = attrs.get_true("href", u'') + self.base_ref = attrs.get("href", u'') tagattrs = self.tags.get(tag, self.universal_attrs) # parse URLs in tag (possibly multiple URLs in CSS styles) for attr in sorted(tagattrs.intersection(attrs)): @@ -195,7 +195,7 @@ class LinkFinder (TagFinder): # possible codebase base = u'' if tag == 'applet': - base = attrs.get_true('codebase', u'') + base = attrs.get('codebase', u'') if not base: base = self.base_ref # note: value can be None @@ -212,11 +212,11 @@ class LinkFinder (TagFinder): """Parse attrs for link name. Return name of link.""" if tag == 'a' and attr == 'href': if not name: - name = attrs.get_true('title', u'') + name = attrs.get('title', u'') elif tag == 'img': - name = attrs.get_true('alt', u'') + name = attrs.get('alt', u'') if not name: - name = attrs.get_true('title', u'') + name = attrs.get('title', u'') else: name = u"" return name diff --git a/tests/htmllib.py b/tests/htmllib.py index ecf988d4..6f1c5b19 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -114,7 +114,7 @@ class HtmlPrettyPrinter: @return: None """ self.fd.write("<%s" % tag.replace("/", "")) - for key, val in attrs.items(): + for key, val in sorted(attrs.items()): if val is None: self.fd.write(" %s" % key) else: diff --git a/tests/test_containers.py b/tests/test_containers.py index 6f830c6c..a90c8f10 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -19,7 +19,7 @@ Test container routines. """ import unittest -import random + import linkcheck.containers from builtins import range @@ -39,70 +39,6 @@ class TestAttrDict (unittest.TestCase): self.assertTrue(isinstance(self.d.get, type({}.get))) -class TestListDict (unittest.TestCase): - """Test list dictionary routines.""" - - def setUp (self): - """Set up self.d as empty listdict.""" - self.d = linkcheck.containers.ListDict() - - def test_insertion_order (self): - self.assertTrue(not self.d) - self.d[2] = 1 - self.d[1] = 2 - self.assertTrue(2 in self.d) - self.assertTrue(1 in self.d) - - def test_deletion_order (self): - self.assertTrue(not self.d) - self.d[2] = 1 - self.d[1] = 2 - del self.d[1] - self.assertTrue(2 in self.d) - self.assertTrue(1 not in self.d) - - def test_update_order (self): - self.assertTrue(not self.d) - self.d[2] = 1 - self.d[1] = 2 - self.d[1] = 1 - self.assertEqual(self.d[1], 1) - - def test_sorting (self): - self.assertTrue(not self.d) - toinsert = random.sample(range(10000000), 60) - for x in toinsert: - self.d[x] = x - for i, k in enumerate(self.d.keys()): - self.assertEqual(self.d[k], toinsert[i]) - for i, k in enumerate(self.d.iterkeys()): - self.assertEqual(self.d[k], toinsert[i]) - for x in self.d.values(): - self.assertTrue(x in toinsert) - for x in self.d.itervalues(): - self.assertTrue(x in toinsert) - for x, y in self.d.items(): - self.assertTrue(x in toinsert) - self.assertTrue(y in toinsert) - for x, y in self.d.iteritems(): - self.assertTrue(x in toinsert) - self.assertTrue(y in toinsert) - - def test_clear (self): - self.assertTrue(not self.d) - self.d[2] = 1 - self.d[1] = 3 - self.d.clear() - self.assertTrue(not self.d) - - def test_get_true (self): - self.assertTrue(not self.d) - self.d["a"] = 0 - self.d["b"] = 1 - self.assertEqual(self.d.get_true("a", 2), 2) - self.assertEqual(self.d.get_true("b", 2), 1) - - class TestCaselessDict (unittest.TestCase): """Test caseless dictionary routines."""