diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py
index ce411977..4e7174b9 100644
--- a/linkcheck/HtmlParser/__init__.py
+++ b/linkcheck/HtmlParser/__init__.py
@@ -26,11 +26,6 @@ in the ListDict (ie. "" with lead to a {href: None} dict entry).
Used callbacks of a handler are:
-- Comments:
- def comment (data)
- @param data:
- @type data: Unicode string
-
- Start tag:
def start_element (tag, attrs)
@param tag: tag name
@@ -50,28 +45,6 @@ Used callbacks of a handler are:
@param tag: tag name
@type tag: Unicode string
-- Document type:
- def doctype (data)
- @param data: doctype string data
- @type data: Unicode string
-
-- Processing instruction (PI):
- def pi (name, data=None)
- @param name: instruction name
- @type name: Unicode string
- @param data: instruction data
- @type data: Unicode string
-
-- Character data:
- def cdata (data)
- @param data: character data
- @type data: Unicode string
-
-- Characters: data
- def characters(data): data
- @param data: data
- @type data: Unicode string
-
Additionally, there are error and warning callbacks:
- Parser warning.
diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py
index 564bd69b..dba7806b 100644
--- a/linkcheck/HtmlParser/htmlsax.py
+++ b/linkcheck/HtmlParser/htmlsax.py
@@ -24,8 +24,7 @@ filterwarnings("ignore",
message="The soupsieve package is not installed. CSS selectors cannot be used.",
category=UserWarning, module="bs4")
-from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction,
- Tag)
+from bs4 import BeautifulSoup, Tag
from ..containers import ListDict
@@ -87,28 +86,6 @@ class Parser(object):
self.parse_contents(content.contents)
if hasattr(self.handler, 'end_element'):
self.handler.end_element(content.name)
- if content.comments:
- for comment in content.comments:
- if hasattr(self.handler, 'comment'):
- self.handler.comment(comment)
- elif isinstance(content, Doctype):
- if hasattr(self.handler, 'doctype'):
- self.handler.doctype(
- content[len('DOCTYPE '):]
- if content.upper().startswith('DOCTYPE ')
- else content)
- elif isinstance(content, Comment):
- if hasattr(self.handler, 'comment'):
- self.handler.comment(content.strip())
- elif isinstance(content, CData):
- if hasattr(self.handler, 'cdata'):
- self.handler.cdata(content)
- elif isinstance(content, ProcessingInstruction):
- if hasattr(self.handler, 'pi'):
- self.handler.pi(content.strip("? "))
- else:
- if hasattr(self.handler, 'characters'):
- self.handler.characters(content)
def flush(self):
if self.soup is None:
diff --git a/tests/htmllib.py b/tests/htmllib.py
index 0eb96073..ecf988d4 100644
--- a/tests/htmllib.py
+++ b/tests/htmllib.py
@@ -19,11 +19,9 @@ Default HTML parser handler classes.
"""
import sys
-from builtins import bytes, str as str_text
-from builtins import chr
-class HtmlPrinter (object):
+class HtmlPrinter:
"""
Handles all functions by printing the function name and attributes.
"""
@@ -46,7 +44,7 @@ class HtmlPrinter (object):
@return: None
"""
self.fd.write(self.mem)
- self.fd.write(str_text(attrs))
+ self.fd.write(str(attrs))
def __getattr__ (self, name):
"""
@@ -61,7 +59,7 @@ class HtmlPrinter (object):
return self._print
-class HtmlPrettyPrinter (object):
+class HtmlPrettyPrinter:
"""
Print out all parsed HTML data in encoded form.
Also stores error and warnings messages.
@@ -79,16 +77,6 @@ class HtmlPrettyPrinter (object):
self.fd = fd
self.encoding = encoding
- def comment (self, data):
- """
- Print HTML comment.
-
- @param data: the comment
- @type data: string
- @return: None
- """
- self.fd.write("" % data)
-
def start_element (self, tag, attrs, element_text=None):
"""
Print HTML start element.
@@ -99,7 +87,7 @@ class HtmlPrettyPrinter (object):
@type attrs: dict
@return: None
"""
- self._start_element(tag, attrs, u">")
+ self._start_element(tag, attrs, ">", element_text)
def start_end_element (self, tag, attrs, element_text=None):
"""
@@ -111,9 +99,9 @@ class HtmlPrettyPrinter (object):
@type attrs: dict
@return: None
"""
- self._start_element(tag, attrs, u"/>")
+ self._start_element(tag, attrs, "/>", element_text)
- def _start_element (self, tag, attrs, end):
+ def _start_element (self, tag, attrs, end, element_text=None):
"""
Print HTML element with end string.
@@ -125,13 +113,15 @@ class HtmlPrettyPrinter (object):
@type end: string
@return: None
"""
- self.fd.write(u"<%s" % tag.replace("/", ""))
+ self.fd.write("<%s" % tag.replace("/", ""))
for key, val in attrs.items():
if val is None:
- self.fd.write(u" %s" % key)
+ self.fd.write(" %s" % key)
else:
- self.fd.write(u' %s="%s"' % (key, quote_attrval(val)))
+ self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
self.fd.write(end)
+ if element_text:
+ self.fd.write(element_text)
def end_element (self, tag):
"""
@@ -143,46 +133,6 @@ class HtmlPrettyPrinter (object):
"""
self.fd.write("%s>" % tag)
- def doctype (self, data):
- """
- Print HTML document type.
-
- @param data: the document type
- @type data: string
- @return: None
- """
- self.fd.write("" % data)
-
- def pi (self, data):
- """
- Print HTML pi.
-
- @param data: the tag data
- @type data: string
- @return: None
- """
- self.fd.write("%s?>" % data)
-
- def cdata (self, data):
- """
- Print HTML cdata.
-
- @param data: the character data
- @type data: string
- @return: None
- """
- self.fd.write("" % data)
-
- def characters (self, data):
- """
- Print characters.
-
- @param data: the character data
- @type data: string
- @return: None
- """
- self.fd.write(data)
-
def quote_attrval (s):
"""
@@ -195,18 +145,14 @@ def quote_attrval (s):
"""
res = []
for c in s:
- try: # Python 2
- ord_c = ord(c)
- except TypeError:
- ord_c = c
- if ord_c <= 127:
+ if ord(c) <= 127:
# ASCII
- if c == u'&':
- res.append(u"&")
- elif c == u'"':
- res.append(u""")
+ if c == '&':
+ res.append("&")
+ elif c == '"':
+ res.append(""")
else:
- res.append(chr(ord_c))
+ res.append(c)
else:
- res.append(u"%d;" % ord_c)
- return u"".join(res)
+ res.append("%d;" % ord(c))
+ return "".join(res)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 9a058f44..fc831361 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -20,10 +20,7 @@ Test html parsing.
import linkcheck.HtmlParser.htmlsax
-try:
- from cStringIO import StringIO
-except ImportError:
- from io import StringIO
+from io import StringIO
import unittest
from parameterized import parameterized
@@ -38,7 +35,6 @@ parsetests = [
("""""", """"""),
("""""", """"""),
("""""", """"""),
- ("""""", """"""),
("""""", """"""),
("""""", """"""),
@@ -51,10 +47,8 @@ parsetests = [
("""""", """"""),
("""""", """"""),
("""""", """"""),
- ("""< a>""", """< a>"""),
- ("""< a >""", """< a >"""),
- ("""<>""", """<>"""),
- ("""< >""", """< >"""),
+ ("""<>""", """"""),
+ ("""< >""", """"""),
("""""", u""""""),
("""""", u""""""),
("""""", u""""""),
@@ -64,7 +58,6 @@ parsetests = [
("""""", """"""),
# reduce test
("""<""", """<"""),
- ("""d>""", """d>"""),
# numbers in tag
("""bla
""", """bla
"""),
# more start tags
@@ -72,49 +65,10 @@ parsetests = [
("""
""", """
"""),
("""
""", """
"""),
("""
""", """
"""),
- # comments
- ("""< 1>""", """< 1>"""),
- ("""< 2>""", """< 2>"""),
- ("""< 3>""", """< 3>"""),
- ("""< 4>""", """< 4>"""),
- ("""< 5>""", """< 5>"""),
- ("""< 7>""", """< 7>"""),
- ("""""", """"""),
- ("""< 9>""", """< 9>"""),
- ("""< 10>""", """< 10>"""),
- ("""""", """"""), # empty comment
- # invalid comments
- ("""< 8>"""),
- ("""< 6>"""),
- ("""""", """"""),
- ("""""", """"""),
- (""""""),
- ("""< a>""", """< a>"""),
- ("""< a>""", """< a>"""),
- # end tags
- ("""""", """"""),
- (""" a>""", """"""),
- (""" a >""", """"""),
- ("""""", """"""),
- ("""< / a>""", """< / a>"""),
- ("""< /a>""", """< /a>"""),
- ("""""", """"""),
# start and end tag (HTML doctype assumed)
("""""", """"""),
("""""", """"""),
("""""", """"""),
- # declaration tags
- ("""""",
- """"""),
- # misc
- ("""""",
- """"""),
- # javascript
- ("""""", """"""),
- ("""""",
- """"""),
- ("""""",
- """"""),
# line continuation (Dr. Fun webpage)
("""
""", """
"""),
("""
""", """
"""),
@@ -144,15 +98,9 @@ parsetests = [
("""""", """"""),
# note that \u8156 is not valid encoding and therefore gets removed
("""""", """"""),
- # non-ascii characters
- ("""<Üzgür> fahr żżżżżż{""",
- u"""<Üzgür> fahr żżżżżż{"""),
# mailto link
("""1""",
"""1"""),
- # doctype XHTML
- ("""""",
- """"""),
# meta tag with charset encoding
("""""",
""""""),
@@ -164,22 +112,13 @@ parsetests = [
""""""),
("""""",
""""""),
- # CDATA
- ("""hallo]]>""", """hallo]]>"""),
# missing > in end tag
("""""", """"""),
("""""", """"""),
# missing beginning quote
("""
""", """ | | """),
# stray < before start tag
- ("""<0.""", """<0. | | """),
- # stray < before end tag
- ("""<0.""", """<0."""),
- # missing end quote (XXX TODO)
- #("""\n"""),
- #(""" | | \na"""),
- #("""\n""", """\n"""),
- #(""" | \n""", """ | \n"""),
+ ("""<0.""", """ | | """),
# HTML5 tags
("""