diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py index ce411977..4e7174b9 100644 --- a/linkcheck/HtmlParser/__init__.py +++ b/linkcheck/HtmlParser/__init__.py @@ -26,11 +26,6 @@ in the ListDict (ie. "" with lead to a {href: None} dict entry). Used callbacks of a handler are: -- Comments: - def comment (data) - @param data: - @type data: Unicode string - - Start tag: def start_element (tag, attrs) @param tag: tag name @@ -50,28 +45,6 @@ Used callbacks of a handler are: @param tag: tag name @type tag: Unicode string -- Document type: - def doctype (data) - @param data: doctype string data - @type data: Unicode string - -- Processing instruction (PI): - def pi (name, data=None) - @param name: instruction name - @type name: Unicode string - @param data: instruction data - @type data: Unicode string - -- Character data: - def cdata (data) - @param data: character data - @type data: Unicode string - -- Characters: data - def characters(data): data - @param data: data - @type data: Unicode string - Additionally, there are error and warning callbacks: - Parser warning. diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 564bd69b..dba7806b 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -24,8 +24,7 @@ filterwarnings("ignore", message="The soupsieve package is not installed. CSS selectors cannot be used.", category=UserWarning, module="bs4") -from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction, - Tag) +from bs4 import BeautifulSoup, Tag from ..containers import ListDict @@ -87,28 +86,6 @@ class Parser(object): self.parse_contents(content.contents) if hasattr(self.handler, 'end_element'): self.handler.end_element(content.name) - if content.comments: - for comment in content.comments: - if hasattr(self.handler, 'comment'): - self.handler.comment(comment) - elif isinstance(content, Doctype): - if hasattr(self.handler, 'doctype'): - self.handler.doctype( - content[len('DOCTYPE '):] - if content.upper().startswith('DOCTYPE ') - else content) - elif isinstance(content, Comment): - if hasattr(self.handler, 'comment'): - self.handler.comment(content.strip()) - elif isinstance(content, CData): - if hasattr(self.handler, 'cdata'): - self.handler.cdata(content) - elif isinstance(content, ProcessingInstruction): - if hasattr(self.handler, 'pi'): - self.handler.pi(content.strip("? ")) - else: - if hasattr(self.handler, 'characters'): - self.handler.characters(content) def flush(self): if self.soup is None: diff --git a/tests/htmllib.py b/tests/htmllib.py index 0eb96073..ecf988d4 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -19,11 +19,9 @@ Default HTML parser handler classes. """ import sys -from builtins import bytes, str as str_text -from builtins import chr -class HtmlPrinter (object): +class HtmlPrinter: """ Handles all functions by printing the function name and attributes. """ @@ -46,7 +44,7 @@ class HtmlPrinter (object): @return: None """ self.fd.write(self.mem) - self.fd.write(str_text(attrs)) + self.fd.write(str(attrs)) def __getattr__ (self, name): """ @@ -61,7 +59,7 @@ class HtmlPrinter (object): return self._print -class HtmlPrettyPrinter (object): +class HtmlPrettyPrinter: """ Print out all parsed HTML data in encoded form. Also stores error and warnings messages. @@ -79,16 +77,6 @@ class HtmlPrettyPrinter (object): self.fd = fd self.encoding = encoding - def comment (self, data): - """ - Print HTML comment. - - @param data: the comment - @type data: string - @return: None - """ - self.fd.write("" % data) - def start_element (self, tag, attrs, element_text=None): """ Print HTML start element. @@ -99,7 +87,7 @@ class HtmlPrettyPrinter (object): @type attrs: dict @return: None """ - self._start_element(tag, attrs, u">") + self._start_element(tag, attrs, ">", element_text) def start_end_element (self, tag, attrs, element_text=None): """ @@ -111,9 +99,9 @@ class HtmlPrettyPrinter (object): @type attrs: dict @return: None """ - self._start_element(tag, attrs, u"/>") + self._start_element(tag, attrs, "/>", element_text) - def _start_element (self, tag, attrs, end): + def _start_element (self, tag, attrs, end, element_text=None): """ Print HTML element with end string. @@ -125,13 +113,15 @@ class HtmlPrettyPrinter (object): @type end: string @return: None """ - self.fd.write(u"<%s" % tag.replace("/", "")) + self.fd.write("<%s" % tag.replace("/", "")) for key, val in attrs.items(): if val is None: - self.fd.write(u" %s" % key) + self.fd.write(" %s" % key) else: - self.fd.write(u' %s="%s"' % (key, quote_attrval(val))) + self.fd.write(' %s="%s"' % (key, quote_attrval(val))) self.fd.write(end) + if element_text: + self.fd.write(element_text) def end_element (self, tag): """ @@ -143,46 +133,6 @@ class HtmlPrettyPrinter (object): """ self.fd.write("" % tag) - def doctype (self, data): - """ - Print HTML document type. - - @param data: the document type - @type data: string - @return: None - """ - self.fd.write("" % data) - - def pi (self, data): - """ - Print HTML pi. - - @param data: the tag data - @type data: string - @return: None - """ - self.fd.write("" % data) - - def cdata (self, data): - """ - Print HTML cdata. - - @param data: the character data - @type data: string - @return: None - """ - self.fd.write("" % data) - - def characters (self, data): - """ - Print characters. - - @param data: the character data - @type data: string - @return: None - """ - self.fd.write(data) - def quote_attrval (s): """ @@ -195,18 +145,14 @@ def quote_attrval (s): """ res = [] for c in s: - try: # Python 2 - ord_c = ord(c) - except TypeError: - ord_c = c - if ord_c <= 127: + if ord(c) <= 127: # ASCII - if c == u'&': - res.append(u"&") - elif c == u'"': - res.append(u""") + if c == '&': + res.append("&") + elif c == '"': + res.append(""") else: - res.append(chr(ord_c)) + res.append(c) else: - res.append(u"&#%d;" % ord_c) - return u"".join(res) + res.append("&#%d;" % ord(c)) + return "".join(res) diff --git a/tests/test_parser.py b/tests/test_parser.py index 9a058f44..fc831361 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -20,10 +20,7 @@ Test html parsing. import linkcheck.HtmlParser.htmlsax -try: - from cStringIO import StringIO -except ImportError: - from io import StringIO +from io import StringIO import unittest from parameterized import parameterized @@ -38,7 +35,6 @@ parsetests = [ ("""""", """"""), ("""""", """"""), ("""""", """"""), - ("""""", """"""), ("""""", """"""), ("""""", """"""), @@ -51,10 +47,8 @@ parsetests = [ ("""""", """"""), ("""""", """"""), ("""""", """"""), - ("""< a>""", """< a>"""), - ("""< a >""", """< a >"""), - ("""<>""", """<>"""), - ("""< >""", """< >"""), + ("""<>""", """"""), + ("""< >""", """"""), ("""""", u""""""), ("""""", u""""""), ("""""", u""""""), @@ -64,7 +58,6 @@ parsetests = [ ("""""", """"""), # reduce test ("""<""", """<"""), - ("""d>""", """d>"""), # numbers in tag ("""

bla

""", """

bla

"""), # more start tags @@ -72,49 +65,10 @@ parsetests = [ ("""
""", """
"""), ("""
""", """
"""), ("""
""", """

"""), - # comments - ("""< 1>""", """< 1>"""), - ("""< 2>""", """< 2>"""), - ("""< 3>""", """< 3>"""), - ("""< 4>""", """< 4>"""), - ("""< 5>""", """< 5>"""), - ("""< 7>""", """< 7>"""), - ("""""", """"""), - ("""< 9>""", """< 9>"""), - ("""< 10>""", """< 10>"""), - ("""""", """"""), # empty comment - # invalid comments - ("""< 8>"""), - ("""< 6>"""), - ("""""", """"""), - ("""""", """"""), - (""""""), - ("""< a>""", """< a>"""), - ("""< a>""", """< a>"""), - # end tags - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""< / a>""", """< / a>"""), - ("""< /a>""", """< /a>"""), - ("""
""", """"""), # start and end tag (HTML doctype assumed) ("""""", """"""), ("""""", """"""), ("""""", """"""), - # declaration tags - ("""""", - """"""), - # misc - ("""""", - """"""), - # javascript - ("""""", """"""), - ("""""", - """"""), - ("""""", - """"""), # line continuation (Dr. Fun webpage) ("""""", """"""), ("""""", """"""), @@ -144,15 +98,9 @@ parsetests = [ ("""""", """"""), # note that \u8156 is not valid encoding and therefore gets removed ("""""", """"""), - # non-ascii characters - ("""<Üzgür> fahr żżżżżż{""", - u"""<Üzgür> fahr żżżżżż{"""), # mailto link ("""1""", """1"""), - # doctype XHTML - ("""""", - """"""), # meta tag with charset encoding ("""""", """"""), @@ -164,22 +112,13 @@ parsetests = [ """"""), ("""""", """"""), - # CDATA - ("""hallo]]>""", """hallo]]>"""), # missing > in end tag ("""""", """"""), ("""""", """"""), # missing beginning quote ("""""", """"""), # stray < before start tag - ("""<0.""", """<0."""), - # stray < before end tag - ("""<0.""", """<0."""), - # missing end quote (XXX TODO) - #("""\n"""), - #("""\na"""), - #("""\n""", """\n"""), - #("""\n""", """\n"""), + ("""<0.""", """"""), # HTML5 tags ("""""", """"""), ] -flushtests = [ - ("<", "<"), - ("' self.encoding_test(html, "utf-8")