mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-23 01:10:27 +00:00
Merge pull request #351 from cjmayo/tagsonly
Remove support for non-Tag elements from Parser
This commit is contained in:
commit
cf4e6bb235
4 changed files with 24 additions and 205 deletions
|
|
@ -26,11 +26,6 @@ in the ListDict (ie. "<a href>" will lead to a {href: None} dict entry).
|
|||
|
||||
Used callbacks of a handler are:
|
||||
|
||||
- Comments: <!--data-->
|
||||
def comment (data)
|
||||
@param data:
|
||||
@type data: Unicode string
|
||||
|
||||
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
|
||||
def start_element (tag, attrs)
|
||||
@param tag: tag name
|
||||
|
|
@ -50,28 +45,6 @@ Used callbacks of a handler are:
|
|||
@param tag: tag name
|
||||
@type tag: Unicode string
|
||||
|
||||
- Document type: <!DOCTYPE data>
|
||||
def doctype (data)
|
||||
@param data: doctype string data
|
||||
@type data: Unicode string
|
||||
|
||||
- Processing instruction (PI): <?name data?>
|
||||
def pi (name, data=None)
|
||||
@param name: instruction name
|
||||
@type name: Unicode string
|
||||
@param data: instruction data
|
||||
@type data: Unicode string
|
||||
|
||||
- Character data: <![CDATA[data]]>
|
||||
def cdata (data)
|
||||
@param data: character data
|
||||
@type data: Unicode string
|
||||
|
||||
- Characters: data
|
||||
def characters(data): data
|
||||
@param data: data
|
||||
@type data: Unicode string
|
||||
|
||||
Additionally, there are error and warning callbacks:
|
||||
|
||||
- Parser warning.
|
||||
|
|
|
|||
|
|
@ -24,8 +24,7 @@ filterwarnings("ignore",
|
|||
message="The soupsieve package is not installed. CSS selectors cannot be used.",
|
||||
category=UserWarning, module="bs4")
|
||||
|
||||
from bs4 import (BeautifulSoup, CData, Comment, Doctype, ProcessingInstruction,
|
||||
Tag)
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from ..containers import ListDict
|
||||
|
||||
|
|
@ -87,28 +86,6 @@ class Parser(object):
|
|||
self.parse_contents(content.contents)
|
||||
if hasattr(self.handler, 'end_element'):
|
||||
self.handler.end_element(content.name)
|
||||
if content.comments:
|
||||
for comment in content.comments:
|
||||
if hasattr(self.handler, 'comment'):
|
||||
self.handler.comment(comment)
|
||||
elif isinstance(content, Doctype):
|
||||
if hasattr(self.handler, 'doctype'):
|
||||
self.handler.doctype(
|
||||
content[len('DOCTYPE '):]
|
||||
if content.upper().startswith('DOCTYPE ')
|
||||
else content)
|
||||
elif isinstance(content, Comment):
|
||||
if hasattr(self.handler, 'comment'):
|
||||
self.handler.comment(content.strip())
|
||||
elif isinstance(content, CData):
|
||||
if hasattr(self.handler, 'cdata'):
|
||||
self.handler.cdata(content)
|
||||
elif isinstance(content, ProcessingInstruction):
|
||||
if hasattr(self.handler, 'pi'):
|
||||
self.handler.pi(content.strip("? "))
|
||||
else:
|
||||
if hasattr(self.handler, 'characters'):
|
||||
self.handler.characters(content)
|
||||
|
||||
def flush(self):
|
||||
if self.soup is None:
|
||||
|
|
|
|||
|
|
@ -19,11 +19,9 @@ Default HTML parser handler classes.
|
|||
"""
|
||||
|
||||
import sys
|
||||
from builtins import bytes, str as str_text
|
||||
from builtins import chr
|
||||
|
||||
|
||||
class HtmlPrinter (object):
|
||||
class HtmlPrinter:
|
||||
"""
|
||||
Handles all functions by printing the function name and attributes.
|
||||
"""
|
||||
|
|
@ -46,7 +44,7 @@ class HtmlPrinter (object):
|
|||
@return: None
|
||||
"""
|
||||
self.fd.write(self.mem)
|
||||
self.fd.write(str_text(attrs))
|
||||
self.fd.write(str(attrs))
|
||||
|
||||
def __getattr__ (self, name):
|
||||
"""
|
||||
|
|
@ -61,7 +59,7 @@ class HtmlPrinter (object):
|
|||
return self._print
|
||||
|
||||
|
||||
class HtmlPrettyPrinter (object):
|
||||
class HtmlPrettyPrinter:
|
||||
"""
|
||||
Print out all parsed HTML data in encoded form.
|
||||
Also stores error and warnings messages.
|
||||
|
|
@ -79,16 +77,6 @@ class HtmlPrettyPrinter (object):
|
|||
self.fd = fd
|
||||
self.encoding = encoding
|
||||
|
||||
def comment (self, data):
|
||||
"""
|
||||
Print HTML comment.
|
||||
|
||||
@param data: the comment
|
||||
@type data: string
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write("<!-- %s -->" % data)
|
||||
|
||||
def start_element (self, tag, attrs, element_text=None):
|
||||
"""
|
||||
Print HTML start element.
|
||||
|
|
@ -99,7 +87,7 @@ class HtmlPrettyPrinter (object):
|
|||
@type attrs: dict
|
||||
@return: None
|
||||
"""
|
||||
self._start_element(tag, attrs, u">")
|
||||
self._start_element(tag, attrs, ">", element_text)
|
||||
|
||||
def start_end_element (self, tag, attrs, element_text=None):
|
||||
"""
|
||||
|
|
@ -111,9 +99,9 @@ class HtmlPrettyPrinter (object):
|
|||
@type attrs: dict
|
||||
@return: None
|
||||
"""
|
||||
self._start_element(tag, attrs, u"/>")
|
||||
self._start_element(tag, attrs, "/>", element_text)
|
||||
|
||||
def _start_element (self, tag, attrs, end):
|
||||
def _start_element (self, tag, attrs, end, element_text=None):
|
||||
"""
|
||||
Print HTML element with end string.
|
||||
|
||||
|
|
@ -125,13 +113,15 @@ class HtmlPrettyPrinter (object):
|
|||
@type end: string
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write(u"<%s" % tag.replace("/", ""))
|
||||
self.fd.write("<%s" % tag.replace("/", ""))
|
||||
for key, val in attrs.items():
|
||||
if val is None:
|
||||
self.fd.write(u" %s" % key)
|
||||
self.fd.write(" %s" % key)
|
||||
else:
|
||||
self.fd.write(u' %s="%s"' % (key, quote_attrval(val)))
|
||||
self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
|
||||
self.fd.write(end)
|
||||
if element_text:
|
||||
self.fd.write(element_text)
|
||||
|
||||
def end_element (self, tag):
|
||||
"""
|
||||
|
|
@ -143,46 +133,6 @@ class HtmlPrettyPrinter (object):
|
|||
"""
|
||||
self.fd.write("</%s>" % tag)
|
||||
|
||||
def doctype (self, data):
|
||||
"""
|
||||
Print HTML document type.
|
||||
|
||||
@param data: the document type
|
||||
@type data: string
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write("<!DOCTYPE %s>" % data)
|
||||
|
||||
def pi (self, data):
|
||||
"""
|
||||
Print HTML pi.
|
||||
|
||||
@param data: the tag data
|
||||
@type data: string
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write("<?%s?>" % data)
|
||||
|
||||
def cdata (self, data):
|
||||
"""
|
||||
Print HTML cdata.
|
||||
|
||||
@param data: the character data
|
||||
@type data: string
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write("<![CDATA[%s]]>" % data)
|
||||
|
||||
def characters (self, data):
|
||||
"""
|
||||
Print characters.
|
||||
|
||||
@param data: the character data
|
||||
@type data: string
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write(data)
|
||||
|
||||
|
||||
def quote_attrval (s):
|
||||
"""
|
||||
|
|
@ -195,18 +145,14 @@ def quote_attrval (s):
|
|||
"""
|
||||
res = []
|
||||
for c in s:
|
||||
try: # Python 2
|
||||
ord_c = ord(c)
|
||||
except TypeError:
|
||||
ord_c = c
|
||||
if ord_c <= 127:
|
||||
if ord(c) <= 127:
|
||||
# ASCII
|
||||
if c == u'&':
|
||||
res.append(u"&amp;")
|
||||
elif c == u'"':
|
||||
res.append(u"&quot;")
|
||||
if c == '&':
|
||||
res.append("&amp;")
|
||||
elif c == '"':
|
||||
res.append("&quot;")
|
||||
else:
|
||||
res.append(chr(ord_c))
|
||||
res.append(c)
|
||||
else:
|
||||
res.append(u"&#%d;" % ord_c)
|
||||
return u"".join(res)
|
||||
res.append("&#%d;" % ord(c))
|
||||
return "".join(res)
|
||||
|
|
|
|||
|
|
@ -20,10 +20,7 @@ Test html parsing.
|
|||
|
||||
import linkcheck.HtmlParser.htmlsax
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from io import StringIO
|
||||
from io import StringIO
|
||||
import unittest
|
||||
|
||||
from parameterized import parameterized
|
||||
|
|
@ -38,7 +35,6 @@ parsetests = [
|
|||
("""<a b='c' >""", """<a b="c"></a>"""),
|
||||
("""<a b=c" >""", """<a b="c&quot;"></a>"""),
|
||||
("""<a b=c' >""", """<a b="c'"></a>"""),
|
||||
("""<a b="c >""", """<a b="c >"""),
|
||||
("""<a b="" >""", """<a b=""></a>"""),
|
||||
("""<a b='' >""", """<a b=""></a>"""),
|
||||
("""<a b=>""", """<a b=""></a>"""),
|
||||
|
|
@ -51,10 +47,8 @@ parsetests = [
|
|||
("""<a b ="c" >""", """<a b="c"></a>"""),
|
||||
("""<a b = "c" >""", """<a b="c"></a>"""),
|
||||
("""<a >""", """<a></a>"""),
|
||||
("""< a>""", """< a>"""),
|
||||
("""< a >""", """< a >"""),
|
||||
("""<>""", """<>"""),
|
||||
("""< >""", """< >"""),
|
||||
("""<>""", """"""),
|
||||
("""< >""", """"""),
|
||||
("""<aä>""", u"""<aä></aä>"""),
|
||||
("""<a aä="b">""", u"""<a aä="b"></a>"""),
|
||||
("""<a a="bä">""", u"""<a a="bä"></a>"""),
|
||||
|
|
@ -64,7 +58,6 @@ parsetests = [
|
|||
("""<a b="c" b="d" >""", """<a b="d"></a>"""),
|
||||
# reduce test
|
||||
("""<a b="c"><""", """<a b="c"><</a>"""),
|
||||
("""d>""", """d>"""),
|
||||
# numbers in tag
|
||||
("""<h1>bla</h1>""", """<h1>bla</h1>"""),
|
||||
# more start tags
|
||||
|
|
@ -72,49 +65,10 @@ parsetests = [
|
|||
("""<a b=/c/></a><br>""", """<a b="/c/"></a><br/>"""),
|
||||
("""<br/>""", """<br/>"""),
|
||||
("""<a b="50%"><br>""", """<a b="50%"><br/></a>"""),
|
||||
# comments
|
||||
("""<!---->< 1>""", """<!-- -->< 1>"""),
|
||||
("""<!-- a - b -->< 2>""", """<!-- a - b -->< 2>"""),
|
||||
("""<!----->< 3>""", """<!-- - -->< 3>"""),
|
||||
("""<!------>< 4>""", """<!-- -- -->< 4>"""),
|
||||
("""<!------->< 5>""", """<!-- --- -->< 5>"""),
|
||||
("""<!-- -->< 7>""", """<!-- -->< 7>"""),
|
||||
("""<!---- />-->""", """<!-- -- /> -->"""),
|
||||
("""<!-- a-2 -->< 9>""", """<!-- a-2 -->< 9>"""),
|
||||
("""<!-- --- -->< 10>""", """<!-- --- -->< 10>"""),
|
||||
("""<!>""", """<!-- -->"""), # empty comment
|
||||
# invalid comments
|
||||
("""<!-- -- >< 8>""", """<!-- -->< 8>"""),
|
||||
("""<!---- >< 6>""", """<!-- -->< 6>"""),
|
||||
("""<!- blubb ->""", """<!-- - blubb - -->"""),
|
||||
("""<! -- blubb -->""", """<!-- -- blubb -- -->"""),
|
||||
("""<!-- blubb -- >""", """<!-- blubb -->"""),
|
||||
("""<! blubb !>< a>""", """<!-- blubb ! -->< a>"""),
|
||||
("""<! blubb >< a>""", """<!-- blubb -->< a>"""),
|
||||
# end tags
|
||||
("""</a>""", """"""),
|
||||
("""</ a>""", """"""),
|
||||
("""</ a >""", """"""),
|
||||
("""</a >""", """"""),
|
||||
("""< / a>""", """< / a>"""),
|
||||
("""< /a>""", """< /a>"""),
|
||||
("""</aä>""", """"""),
|
||||
# start and end tag (HTML doctype assumed)
|
||||
("""<a/>""", """<a></a>"""),
|
||||
("""<meta/>""", """<meta/>"""),
|
||||
("""<MetA/>""", """<meta/>"""),
|
||||
# declaration tags
|
||||
("""<!DOCtype adrbook SYSTEM "adrbook.dtd">""",
|
||||
"""<!DOCTYPE adrbook SYSTEM "adrbook.dtd">"""),
|
||||
# misc
|
||||
("""<?xmL version="1.0" encoding="latin1"?>""",
|
||||
"""<?xmL version="1.0" encoding="latin1"?>"""),
|
||||
# javascript
|
||||
("""<script >\n</script>""", """<script>\n</script>"""),
|
||||
("""<sCrIpt lang="a">bla </a> fasel</scripT>""",
|
||||
"""<script lang="a">bla </a> fasel</script>"""),
|
||||
("""<script ><!--bla//-->// </script >""",
|
||||
"""<script><!--bla//-->// </script>"""),
|
||||
# line continuation (Dr. Fun webpage)
|
||||
("""<img bo\\\nrder=0 >""", """<img bo\\="" rder="0"/>"""),
|
||||
("""<img align="mid\\\ndle">""", """<img align="mid\\\ndle"/>"""),
|
||||
|
|
@ -144,15 +98,9 @@ parsetests = [
|
|||
("""<a href="mailto:" >""", """<a href="mailto:"></a>"""),
|
||||
# note that \u8156 is not valid encoding and therefore gets removed
|
||||
("""<a href="῜ailto:" >""", """<a href="῜ailto:"></a>"""),
|
||||
# non-ascii characters
|
||||
("""<Üzgür> fahr </langsamer> żżżżżż{""",
|
||||
u"""<Üzgür> fahr żżżżżż{"""),
|
||||
# mailto link
|
||||
("""<a href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>""",
|
||||
"""<a href="mailto:calvin@LocalHost?subject=Hallo&to=michi">1</a>"""),
|
||||
# doctype XHTML
|
||||
("""<!DOCTYPe html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><MeTa a="b"/>""",
|
||||
"""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><meta a="b"/>"""),
|
||||
# meta tag with charset encoding
|
||||
("""<meta http-equiv="content-type" content>""",
|
||||
"""<meta content="" http-equiv="content-type"/>"""),
|
||||
|
|
@ -164,22 +112,13 @@ parsetests = [
|
|||
"""<meta content="text/html; charset=iso8859-1" http-equiv="content-type"/>"""),
|
||||
("""<meta http-equiv="content-type" content="text/html; charset=hulla">""",
|
||||
"""<meta content="text/html; charset=hulla" http-equiv="content-type"/>"""),
|
||||
# CDATA
|
||||
("""<![CDATA[<a>hallo</a>]]>""", """<![CDATA[<a>hallo</a>]]>"""),
|
||||
# missing > in end tag
|
||||
("""</td <td a="b" >""", """"""),
|
||||
("""</td<td a="b" >""", """"""),
|
||||
# missing beginning quote
|
||||
("""<td a=b">""", """<td a="b&quot;"></td>"""),
|
||||
# stray < before start tag
|
||||
("""<0.<td a="b" >""", """<0.<td a="b"></td>"""),
|
||||
# stray < before end tag
|
||||
("""<0.</td >""", """<0."""),
|
||||
# missing end quote (XXX TODO)
|
||||
#("""<td a="b>\n""", """<td a="b">\n"""),
|
||||
#("""<td a="b></td>\na""", """<td a="b"></td>\na"""),
|
||||
#("""<a b="c><a b="c>\n""", """<a b="c"><a b="c">\n"""),
|
||||
#("""<td a="b c="d"></td>\n""", """<td a="b" c="d"></td>\n"""),
|
||||
("""<0.<td a="b" >""", """<td a="b"></td>"""),
|
||||
# HTML5 tags
|
||||
("""<audio src=bla>""", """<audio src="bla"></audio>"""),
|
||||
("""<button formaction=bla>""", """<button formaction="bla"></button>"""),
|
||||
|
|
@ -192,13 +131,6 @@ parsetests = [
|
|||
("""<a></a><b></b>""", """<a></a><b></b>"""),
|
||||
]
|
||||
|
||||
flushtests = [
|
||||
("<", "<"),
|
||||
("<a", "<a"),
|
||||
("<!a", "<!a"),
|
||||
("<?a", "<?a"),
|
||||
]
|
||||
|
||||
|
||||
class TestParser (unittest.TestCase):
|
||||
"""
|
||||
|
|
@ -270,15 +202,6 @@ class TestParser (unittest.TestCase):
|
|||
self.htmlparser2.feed(c)
|
||||
self.assertEqual(out.getvalue(), out2.getvalue())
|
||||
|
||||
@parameterized.expand(flushtests)
|
||||
def test_flush (self, _in, _out):
|
||||
# Test parser flushing.
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
self.htmlparser.feed(_in)
|
||||
self.check_results(self.htmlparser, _in, _out, out)
|
||||
|
||||
def test_encoding_detection_utf_content (self):
|
||||
html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
|
||||
self.encoding_test(html, "utf-8")
|
||||
|
|
|
|||
Loading…
Reference in a new issue