diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py index 3fca9f95..3fc837a8 100644 --- a/linkcheck/HtmlParser/__init__.py +++ b/linkcheck/HtmlParser/__init__.py @@ -33,13 +33,6 @@ Used callbacks of a handler are: @param attrs: tag attributes @type attrs: ListDict -- Start-end tag: - def start_end_element(tag, attrs): - @param tag: tag name - @type tag: Unicode string - @param attrs: tag attributes - @type attrs: ListDict - Additionally, there are error and warning callbacks: - Parser warning. diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index 6f524683..d6f8cc4f 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -42,20 +42,14 @@ class Parser(object): def parse_contents(self, contents): for content in contents: if isinstance(content, Tag): - tag_column = None if content.sourcepos is None \ + self.handler.start_element( + content.name, content.attrs, content.text.strip(), + content.sourceline, + None if content.sourcepos is None else content.sourcepos + 1 - if content.is_empty_element: - self.handler.start_end_element( - content.name, content.attrs, content.text.strip(), - content.sourceline, tag_column - ) - else: - self.handler.start_element( - content.name, content.attrs, content.text.strip(), - content.sourceline, tag_column - ) - if hasattr(content, 'contents'): # recursion - self.parse_contents(content.contents) + ) + if hasattr(content, 'contents'): # recursion + self.parse_contents(content.contents) def parser(handler=None): diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index b2ed61e6..c455cffa 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -109,11 +109,6 @@ class TagFinder (object): """Does nothing, override in a subclass.""" pass - def start_end_element (self, tag, attrs, element_text, lineno, column): - """Delegate a combined start/end element (eg.
) to - the start_element method. Ignore the end element part.""" - self.start_element(tag, attrs, element_text, lineno, column) - class MetaRobotsFinder (TagFinder): """Class for finding robots.txt meta values in HTML.""" diff --git a/tests/htmllib.py b/tests/htmllib.py index 06cebbc4..08318704 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -49,42 +49,16 @@ class HtmlPrettyPrinter: @type attrs: dict @return: None """ - self._start_element(tag, attrs, ">", element_text) - self.fd.write("" % tag) - - def start_end_element (self, tag, attrs, element_text, lineno, column): - """ - Print HTML start-end element. - - @param tag: tag name - @type tag: string - @param attrs: tag attributes - @type attrs: dict - @return: None - """ - self._start_element(tag, attrs, "/>", element_text) - - def _start_element (self, tag, attrs, end, element_text): - """ - Print HTML element with end string. - - @param tag: tag name - @type tag: string - @param attrs: tag attributes - @type attrs: dict - @param end: either > or /> - @type end: string - @return: None - """ self.fd.write("<%s" % tag.replace("/", "")) for key, val in sorted(attrs.items()): if val is None: self.fd.write(" %s" % key) else: self.fd.write(' %s="%s"' % (key, quote_attrval(val))) - self.fd.write(end) if element_text: - self.fd.write(element_text) + self.fd.write(">%s" % (element_text, tag)) + else: + self.fd.write("/>") def quote_attrval (s): diff --git a/tests/test_parser.py b/tests/test_parser.py index 7e087082..206f8ce4 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -31,42 +31,42 @@ from .htmllib import HtmlPrettyPrinter # (, ) parsetests = [ # start tags - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), ("""<>""", """"""), ("""< >""", """"""), - ("""""", u""""""), - ("""""", u""""""), - ("""""", u""""""), + ("""""", u""""""), + ("""""", u""""""), + ("""""", u""""""), # multiple attribute names should be ignored... - ("""""", """"""), + ("""""", """"""), # ... but which one wins - in our implementation the last one - ("""""", """"""), + ("""""", """"""), # reduce test ("""<""", """<"""), # numbers in tag ("""

bla

""", """

bla

"""), # more start tags - ("""""", """"""), - ("""
""", """
"""), + ("""""", """"""), + ("""
""", """
"""), ("""
""", """
"""), - ("""

""", """

"""), + ("""
""", """

"""), # start and end tag (HTML doctype assumed) - ("""
""", """"""), + ("""""", """"""), ("""""", """"""), ("""""", """"""), # line continuation (Dr. Fun webpage) @@ -74,30 +74,30 @@ parsetests = [ ("""""", """"""), ("""""", """"""), # href with $ - ("""""", """"""), + ("""""", """"""), # quoting - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), ("""""", """"""), + """'/images/nav.gif',1);move(this);"/>"""), ("""""", - """"""), + """"""), # entity resolving - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), - ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), + ("""""", """"""), # note that \u8156 is not valid encoding and therefore gets removed - ("""""", """"""), + ("""""", """"""), # mailto link ("""1""", """1"""), @@ -116,19 +116,20 @@ parsetests = [ ("""""", """"""), ("""""", """"""), # missing beginning quote - ("""""", """"""), + ("""""", """"""), # stray < before start tag - ("""<0.""", """"""), + ("""<0.""", """"""), # HTML5 tags - ("""