From a7e1e20172b32271cbf79c489bfd7d8652d1cbeb Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Fri, 3 Apr 2020 19:24:08 +0100 Subject: [PATCH 1/5] Remove last line and column from Parser Only used for debug log message and not very useful. --- linkcheck/HtmlParser/htmlsax.py | 10 ---------- linkcheck/htmlutil/linkparse.py | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index dba7806b..1aa5c265 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -53,8 +53,6 @@ class Parser(object): self.html_doc = None self.tag_lineno = None self.tag_column = None - self.last_tag_lineno = None - self.last_tag_column = None def parse_contents(self, contents): for content in contents: @@ -69,8 +67,6 @@ class Parser(object): if v == b'': v = u'' attrs[k] = v - self.last_tag_lineno = self.tag_lineno - self.last_tag_column = self.tag_column self.tag_lineno = content.sourceline self.tag_column = None if content.sourcepos is None \ else content.sourcepos + 1 @@ -100,15 +96,9 @@ class Parser(object): def lineno(self): return self.tag_lineno - def last_lineno(self): - return self.last_tag_lineno - def column(self): return self.tag_column - def last_column(self): - return self.last_tag_column - def parser(handler=None): return Parser(handler) diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index 33aa4321..b60bfa14 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -180,7 +180,7 @@ class LinkFinder (TagFinder): def start_element (self, tag, attrs, element_text=None): """Search for links and store found URLs in a list.""" log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) - log.debug(LOG_CHECK, "line %d col %d old line %s old col %s", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column()) + log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column()) if tag == "base" and not self.base_ref: self.base_ref = attrs.get_true("href", u'') tagattrs = self.tags.get(tag, self.universal_attrs) From 3ff3d7249255bae9e5dcb93ef1358aec741a304d Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Fri, 3 Apr 2020 19:24:08 +0100 Subject: [PATCH 2/5] Use BeautifulSoup element attrs directly --- linkcheck/HtmlParser/htmlsax.py | 19 ++------ linkcheck/checker/httpurl.py | 1 + linkcheck/checker/urlbase.py | 3 +- linkcheck/containers.py | 84 --------------------------------- linkcheck/htmlutil/linkparse.py | 20 ++++---- tests/htmllib.py | 2 +- tests/test_containers.py | 66 +------------------------- 7 files changed, 19 insertions(+), 176 deletions(-) diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 1aa5c265..6b3f76ad 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -26,8 +26,6 @@ filterwarnings("ignore", from bs4 import BeautifulSoup, Tag -from ..containers import ListDict - class Parser(object): handler = None @@ -57,26 +55,16 @@ class Parser(object): def parse_contents(self, contents): for content in contents: if isinstance(content, Tag): - attrs = ListDict() - for k, v_list in sorted(content.attrs.items()): - if not isinstance(v_list, list): - v_list = [v_list] - for v in v_list: - # empty parameters returned by BS4 - # are sometimes in bytes: - if v == b'': - v = u'' - attrs[k] = v self.tag_lineno = content.sourceline self.tag_column = None if content.sourcepos is None \ else content.sourcepos + 1 if content.is_empty_element: self.handler.start_end_element( - content.name, attrs, content.text.strip(), + content.name, content.attrs, content.text.strip(), ) else: self.handler.start_element( - content.name, attrs, content.text.strip(), + content.name, content.attrs, content.text.strip(), ) if hasattr(content, 'contents'): # recursion self.parse_contents(content.contents) @@ -85,7 +73,8 @@ class Parser(object): def flush(self): if self.soup is None: - self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser') + self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser', + multi_valued_attributes=None) if hasattr(self.soup, 'contents'): self.parse_contents(self.soup.contents) self.encoding = self.soup.original_encoding diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 167133ed..9e6459ef 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -310,6 +310,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): if self.text is None: self.get_raw_content() self.soup = BeautifulSoup(self.data, "html.parser", + multi_valued_attributes=None, from_encoding=self.encoding) self.text = self.data.decode(self.soup.original_encoding) return self.text diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 8268453d..ca924ad3 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -657,7 +657,8 @@ class UrlBase (object): def get_content (self): if self.text is None: self.get_raw_content() - self.soup = BeautifulSoup(self.data, "html.parser") + self.soup = BeautifulSoup(self.data, "html.parser", + multi_valued_attributes=None) self.text = self.data.decode(self.soup.original_encoding) self.encoding = self.soup.original_encoding return self.text diff --git a/linkcheck/containers.py b/linkcheck/containers.py index 2cbcf253..2e8706b7 100644 --- a/linkcheck/containers.py +++ b/linkcheck/containers.py @@ -30,90 +30,6 @@ class AttrDict (dict): return self[name] -class ListDict (dict): - """A dictionary whose iterators reflect the order in which elements - were added. - """ - - def __init__ (self): - """Initialize sorted key list.""" - super(ListDict, self).__init__() - # sorted list of keys - self._keys = [] - - def setdefault (self, key, *args): - """Remember key order if key not found.""" - if key not in self: - self._keys.append(key) - return super(ListDict, self).setdefault(key, *args) - - def __setitem__ (self, key, value): - """Add key,value to dict, append key to sorted list.""" - if key not in self: - self._keys.append(key) - super(ListDict, self).__setitem__(key, value) - - def __delitem__ (self, key): - """Remove key from dict.""" - self._keys.remove(key) - super(ListDict, self).__delitem__(key) - - def pop (self, key): - """Remove key from dict and return value.""" - if key in self._keys: - self._keys.remove(key) - super(ListDict, self).pop(key) - - def popitem (self): - """Remove oldest key from dict and return item.""" - if self._keys: - k = self._keys[0] - v = self[k] - del self[k] - return (k, v) - raise KeyError("popitem() on empty dictionary") - - def values (self): - """Return sorted list of values.""" - return [self[k] for k in self._keys] - - def items (self): - """Return sorted list of items.""" - return [(k, self[k]) for k in self._keys] - - def keys (self): - """Return sorted list of keys.""" - return self._keys[:] - - def itervalues (self): - """Return iterator over sorted values.""" - for k in self._keys: - yield self[k] - - def iteritems (self): - """Return iterator over sorted items.""" - for k in self._keys: - yield (k, self[k]) - - def iterkeys (self): - """Return iterator over sorted keys.""" - return iter(self._keys) - - def clear (self): - """Remove all dict entries.""" - self._keys = [] - super(ListDict, self).clear() - - def get_true (self, key, default): - """Return default element if key is not in the dict, or if self[key] - evaluates to False. Useful for example if value is None, but - default value should be an empty string. - """ - if key not in self or not self[key]: - return default - return self[key] - - class CaselessDict (dict): """A dictionary ignoring the case of keys (which must be strings).""" diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index b60bfa14..e5295817 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -130,7 +130,7 @@ class MetaRobotsFinder (TagFinder): def start_element (self, tag, attrs, element_text=None): """Search for meta robots.txt "nofollow" and "noindex" flags.""" if tag == 'meta' and attrs.get('name') == 'robots': - val = attrs.get_true('content', u'').lower().split(u',') + val = attrs.get('content', u'').lower().split(u',') self.follow = u'nofollow' not in val self.index = u'noindex' not in val raise StopParse("found tag") @@ -142,11 +142,11 @@ def is_meta_url (attr, attrs): """Check if the meta attributes contain a URL.""" res = False if attr == "content": - equiv = attrs.get_true('http-equiv', u'').lower() - scheme = attrs.get_true('scheme', u'').lower() + equiv = attrs.get('http-equiv', u'').lower() + scheme = attrs.get('scheme', u'').lower() res = equiv in (u'refresh',) or scheme in (u'dcterms.uri',) if attr == "href": - rel = attrs.get_true('rel', u'').lower() + rel = attrs.get('rel', u'').lower() res = rel in (u'shortcut icon', u'icon') return res @@ -155,7 +155,7 @@ def is_form_get(attr, attrs): """Check if this is a GET form action URL.""" res = False if attr == "action": - method = attrs.get_true('method', u'').lower() + method = attrs.get('method', u'').lower() res = method != 'post' return res @@ -182,7 +182,7 @@ class LinkFinder (TagFinder): log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column()) if tag == "base" and not self.base_ref: - self.base_ref = attrs.get_true("href", u'') + self.base_ref = attrs.get("href", u'') tagattrs = self.tags.get(tag, self.universal_attrs) # parse URLs in tag (possibly multiple URLs in CSS styles) for attr in sorted(tagattrs.intersection(attrs)): @@ -195,7 +195,7 @@ class LinkFinder (TagFinder): # possible codebase base = u'' if tag == 'applet': - base = attrs.get_true('codebase', u'') + base = attrs.get('codebase', u'') if not base: base = self.base_ref # note: value can be None @@ -212,11 +212,11 @@ class LinkFinder (TagFinder): """Parse attrs for link name. Return name of link.""" if tag == 'a' and attr == 'href': if not name: - name = attrs.get_true('title', u'') + name = attrs.get('title', u'') elif tag == 'img': - name = attrs.get_true('alt', u'') + name = attrs.get('alt', u'') if not name: - name = attrs.get_true('title', u'') + name = attrs.get('title', u'') else: name = u"" return name diff --git a/tests/htmllib.py b/tests/htmllib.py index ecf988d4..6f1c5b19 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -114,7 +114,7 @@ class HtmlPrettyPrinter: @return: None """ self.fd.write("<%s" % tag.replace("/", "")) - for key, val in attrs.items(): + for key, val in sorted(attrs.items()): if val is None: self.fd.write(" %s" % key) else: diff --git a/tests/test_containers.py b/tests/test_containers.py index 6f830c6c..a90c8f10 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -19,7 +19,7 @@ Test container routines. """ import unittest -import random + import linkcheck.containers from builtins import range @@ -39,70 +39,6 @@ class TestAttrDict (unittest.TestCase): self.assertTrue(isinstance(self.d.get, type({}.get))) -class TestListDict (unittest.TestCase): - """Test list dictionary routines.""" - - def setUp (self): - """Set up self.d as empty listdict.""" - self.d = linkcheck.containers.ListDict() - - def test_insertion_order (self): - self.assertTrue(not self.d) - self.d[2] = 1 - self.d[1] = 2 - self.assertTrue(2 in self.d) - self.assertTrue(1 in self.d) - - def test_deletion_order (self): - self.assertTrue(not self.d) - self.d[2] = 1 - self.d[1] = 2 - del self.d[1] - self.assertTrue(2 in self.d) - self.assertTrue(1 not in self.d) - - def test_update_order (self): - self.assertTrue(not self.d) - self.d[2] = 1 - self.d[1] = 2 - self.d[1] = 1 - self.assertEqual(self.d[1], 1) - - def test_sorting (self): - self.assertTrue(not self.d) - toinsert = random.sample(range(10000000), 60) - for x in toinsert: - self.d[x] = x - for i, k in enumerate(self.d.keys()): - self.assertEqual(self.d[k], toinsert[i]) - for i, k in enumerate(self.d.iterkeys()): - self.assertEqual(self.d[k], toinsert[i]) - for x in self.d.values(): - self.assertTrue(x in toinsert) - for x in self.d.itervalues(): - self.assertTrue(x in toinsert) - for x, y in self.d.items(): - self.assertTrue(x in toinsert) - self.assertTrue(y in toinsert) - for x, y in self.d.iteritems(): - self.assertTrue(x in toinsert) - self.assertTrue(y in toinsert) - - def test_clear (self): - self.assertTrue(not self.d) - self.d[2] = 1 - self.d[1] = 3 - self.d.clear() - self.assertTrue(not self.d) - - def test_get_true (self): - self.assertTrue(not self.d) - self.d["a"] = 0 - self.d["b"] = 1 - self.assertEqual(self.d.get_true("a", 2), 2) - self.assertEqual(self.d.get_true("b", 2), 1) - - class TestCaselessDict (unittest.TestCase): """Test caseless dictionary routines.""" From 036b900ffc3b4099e63574059c8036c92a3a8f1a Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Fri, 3 Apr 2020 19:24:08 +0100 Subject: [PATCH 3/5] Remove unused linkcheck.containers classes --- linkcheck/containers.py | 95 --------------------------- tests/test_containers.py | 134 --------------------------------------- 2 files changed, 229 deletions(-) diff --git a/linkcheck/containers.py b/linkcheck/containers.py index 2e8706b7..e17d0290 100644 --- a/linkcheck/containers.py +++ b/linkcheck/containers.py @@ -18,86 +18,6 @@ Special container classes. """ -from collections import namedtuple -from past.builtins import basestring - -class AttrDict (dict): - """Dictionary allowing attribute access to its elements if they - are valid attribute names and not already existing methods.""" - - def __getattr__ (self, name): - """Return attribute name from dict.""" - return self[name] - - -class CaselessDict (dict): - """A dictionary ignoring the case of keys (which must be strings).""" - - def __getitem__ (self, key): - """Return lowercase key item.""" - assert isinstance(key, basestring) - return dict.__getitem__(self, key.lower()) - - def __delitem__ (self, key): - """Remove lowercase key item.""" - assert isinstance(key, basestring) - return dict.__delitem__(self, key.lower()) - - def __setitem__ (self, key, value): - """Set lowercase key item.""" - assert isinstance(key, basestring) - dict.__setitem__(self, key.lower(), value) - - def __contains__ (self, key): - """Check lowercase key item.""" - assert isinstance(key, basestring) - return dict.__contains__(self, key.lower()) - - def get (self, key, def_val=None): - """Return lowercase key value.""" - assert isinstance(key, basestring) - return dict.get(self, key.lower(), def_val) - - def setdefault (self, key, *args): - """Set lowercase key value and return.""" - assert isinstance(key, basestring) - return dict.setdefault(self, key.lower(), *args) - - def update (self, other): - """Update this dict with lowercase key from other dict""" - for k, v in other.items(): - dict.__setitem__(self, k.lower(), v) - - def fromkeys (cls, iterable, value=None): - """Construct new caseless dict from given data.""" - d = cls() - for k in iterable: - dict.__setitem__(d, k.lower(), value) - return d - fromkeys = classmethod(fromkeys) - - def pop (self, key, *args): - """Remove lowercase key from dict and return value.""" - assert isinstance(key, basestring) - return dict.pop(self, key.lower(), *args) - - -class CaselessSortedDict (CaselessDict): - """Caseless dictionary with sorted keys.""" - - def keys (self): - """Return sorted key list.""" - return sorted(super(CaselessSortedDict, self).keys()) - - def items (self): - """Return sorted item list.""" - return [(x, self[x]) for x in self.keys()] - - def iteritems (self): - """Return sorted item iterator.""" - return ((x, self[x]) for x in self.keys()) - - class LFUCache (dict): """Limited cache which purges least frequently used items.""" @@ -182,18 +102,3 @@ class LFUCache (dict): """Remove and return a value.""" value = super(LFUCache, self).pop() return value[1] - - -def enum (*names): - """Return an enum datatype instance from given list of keyword names. - The enum values are zero-based integers. - - >>> Status = enum('open', 'pending', 'closed') - >>> Status.open - 0 - >>> Status.pending - 1 - >>> Status.closed - 2 - """ - return namedtuple('Enum', ' '.join(names))(*range(len(names))) diff --git a/tests/test_containers.py b/tests/test_containers.py index a90c8f10..0d119b48 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -24,130 +24,6 @@ import linkcheck.containers from builtins import range -class TestAttrDict (unittest.TestCase): - - def setUp (self): - self.d = linkcheck.containers.AttrDict() - - def test_access (self): - self.d["test"] = 1 - self.assertEqual(self.d.test, self.d["test"]) - self.assertEqual(self.d.test, 1) - - def test_method (self): - self.d["get"] = 1 - self.assertTrue(isinstance(self.d.get, type({}.get))) - - -class TestCaselessDict (unittest.TestCase): - """Test caseless dictionary routines.""" - - def setUp (self): - """Set up self.d as empty caseless dict.""" - self.d = linkcheck.containers.CaselessDict() - - def test_insert (self): - self.assertTrue(not self.d) - self.d["a"] = 1 - self.assertTrue("a" in self.d) - self.assertTrue("A" in self.d) - self.d["aBcD"] = 2 - self.assertTrue("abcd" in self.d) - self.assertTrue("Abcd" in self.d) - self.assertTrue("ABCD" in self.d) - - def test_delete (self): - self.assertTrue(not self.d) - self.d["a"] = 1 - del self.d["A"] - self.assertTrue("a" not in self.d) - self.assertTrue("A" not in self.d) - - def test_update (self): - self.assertTrue(not self.d) - self.d["a"] = 1 - self.d["A"] = 2 - self.assertEqual(self.d["a"], 2) - - def test_clear (self): - self.assertTrue(not self.d) - self.d["a"] = 5 - self.d["b"] = 6 - self.d.clear() - self.assertTrue(not self.d) - - def test_containment (self): - self.assertTrue(not self.d) - self.assertTrue("A" not in self.d) - self.assertTrue("a" not in self.d) - self.d["a"] = 5 - self.assertTrue("A" in self.d) - self.assertTrue("a" in self.d) - - def test_setdefault (self): - self.assertTrue(not self.d) - self.d["a"] = 5 - self.assertEqual(self.d.setdefault("A", 6), 5) - self.assertEqual(self.d.setdefault("b", 7), 7) - - def test_get (self): - self.assertTrue(not self.d) - self.d["a"] = 42 - self.assertEqual(self.d.get("A"), 42) - self.assertTrue(self.d.get("B") is None) - - def test_update2 (self): - self.assertTrue(not self.d) - self.d["a"] = 42 - self.d.update({"A": 43}) - self.assertEqual(self.d["a"], 43) - - def test_fromkeys (self): - self.assertTrue(not self.d) - keys = ["a", "A", "b", "C"] - d1 = self.d.fromkeys(keys, 42) - for key in keys: - self.assertEqual(d1[key], 42) - - def test_pop (self): - self.assertTrue(not self.d) - self.d["a"] = 42 - self.assertEqual(self.d.pop("A"), 42) - self.assertTrue(not self.d) - self.assertRaises(KeyError, self.d.pop, "A") - - def test_popitem (self): - self.assertTrue(not self.d) - self.d["a"] = 42 - self.assertEqual(self.d.popitem(), ("a", 42)) - self.assertTrue(not self.d) - self.assertRaises(KeyError, self.d.popitem) - - -class TestCaselessSortedDict (unittest.TestCase): - """Test caseless sorted dictionary routines.""" - - def setUp (self): - """Set up self.d as empty caseless sorted dict.""" - self.d = linkcheck.containers.CaselessSortedDict() - - def test_sorted (self): - self.assertTrue(not self.d) - self.d["b"] = 6 - self.d["a"] = 7 - self.d["C"] = 8 - prev = None - for key in self.d.keys(): - if prev is not None: - self.assertTrue(key > prev) - prev = key - prev = None - for key, value in self.d.items(): - self.assertEqual(value, self.d[key]) - if prev is not None: - self.assertTrue(key > prev) - prev = key - class TestLFUCache (unittest.TestCase): """Test LFU cache implementation.""" @@ -185,13 +61,3 @@ class TestLFUCache (unittest.TestCase): self.d[i] = i self.d[1001] = 1001 self.assertTrue(950 <= len(self.d) <= self.size) - - -class TestEnum (unittest.TestCase): - - def test_enum (self): - e = linkcheck.containers.enum("a", "b", "c") - self.assertEqual(e.a, 0) - self.assertEqual(e.b, 1) - self.assertEqual(e.c, 2) - self.assertEqual(e, (0, 1, 2)) From 0c5e3bb403cdfe296df090af6913287e6b97aeca Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Fri, 3 Apr 2020 19:24:08 +0100 Subject: [PATCH 4/5] Remove old HtmlParser .gitignore htmlparse.output was a product of the built-in parser. --- linkcheck/HtmlParser/.gitignore | 1 - 1 file changed, 1 deletion(-) delete mode 100644 linkcheck/HtmlParser/.gitignore diff --git a/linkcheck/HtmlParser/.gitignore b/linkcheck/HtmlParser/.gitignore deleted file mode 100644 index 6a9aa715..00000000 --- a/linkcheck/HtmlParser/.gitignore +++ /dev/null @@ -1 +0,0 @@ -htmlparse.output From fe024fb0c80b77602af40635dbad6f5022333e27 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Fri, 3 Apr 2020 19:24:08 +0100 Subject: [PATCH 5/5] Remove unused Parser.debug() method --- linkcheck/HtmlParser/htmlsax.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 6b3f76ad..7975b6e7 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -79,9 +79,6 @@ class Parser(object): self.parse_contents(self.soup.contents) self.encoding = self.soup.original_encoding - def debug(self, text): - raise NotImplementedError("debug is not implemented") - def lineno(self): return self.tag_lineno