mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Markup is not being passed in pieces to the parser, so simplify the interface and reduce the state further.
192 lines
7.9 KiB
Python
192 lines
7.9 KiB
Python
# -*- coding: utf8 -*-
|
|
# Copyright (C) 2004-2012 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
"""
|
|
Test html parsing.
|
|
"""
|
|
|
|
from linkcheck.HtmlParser import htmlsax
|
|
|
|
from io import StringIO
|
|
import unittest
|
|
|
|
from parameterized import parameterized
|
|
|
|
from .htmllib import HtmlPrettyPrinter
|
|
|
|
# list of tuples
|
|
# (<test pattern>, <expected parse output>)
|
|
parsetests = [
    # Each entry is (<markup fed to the parser>, <expected pretty-printed output>).
    # start tags
    ("""<a b="c" >""", """<a b="c"></a>"""),
    ("""<a b='c' >""", """<a b="c"></a>"""),
    ("""<a b=c" >""", """<a b="c""></a>"""),
    ("""<a b=c' >""", """<a b="c'"></a>"""),
    ("""<a b="" >""", """<a b=""></a>"""),
    ("""<a b='' >""", """<a b=""></a>"""),
    ("""<a b=>""", """<a b=""></a>"""),
    ("""<a b= >""", """<a b=""></a>"""),
    ("""<a =c>""", """<a =c=""></a>"""),
    ("""<a =c >""", """<a =c=""></a>"""),
    ("""<a =>""", """<a ==""></a>"""),
    ("""<a = >""", """<a ==""></a>"""),
    ("""<a b= "c" >""", """<a b="c"></a>"""),
    ("""<a b ="c" >""", """<a b="c"></a>"""),
    ("""<a b = "c" >""", """<a b="c"></a>"""),
    ("""<a >""", """<a></a>"""),
    ("""<>""", """"""),
    ("""< >""", """"""),
    ("""<aä>""", u"""<aä></aä>"""),
    ("""<a aä="b">""", u"""<a aä="b"></a>"""),
    ("""<a a="bä">""", u"""<a a="bä"></a>"""),
    # multiple attribute names should be ignored...
    ("""<a b="c" b="c" >""", """<a b="c"></a>"""),
    # ... but which one wins - in our implementation the last one
    ("""<a b="c" b="d" >""", """<a b="d"></a>"""),
    # reduce test
    ("""<a b="c"><""", """<a b="c"><</a>"""),
    # numbers in tag
    ("""<h1>bla</h1>""", """<h1>bla</h1>"""),
    # more start tags
    ("""<a b=c"><a b="c">""", """<a b="c""><a b="c"></a></a>"""),
    ("""<a b=/c/></a><br>""", """<a b="/c/"></a><br/>"""),
    ("""<br/>""", """<br/>"""),
    ("""<a b="50%"><br>""", """<a b="50%"><br/></a>"""),
    # start and end tag (HTML doctype assumed)
    ("""<a/>""", """<a></a>"""),
    ("""<meta/>""", """<meta/>"""),
    ("""<MetA/>""", """<meta/>"""),
    # line continuation (Dr. Fun webpage)
    ("""<img bo\\\nrder=0 >""", """<img bo\\="" rder="0"/>"""),
    ("""<img align="mid\\\ndle">""", """<img align="mid\\\ndle"/>"""),
    ("""<img align='mid\\\ndle'>""", """<img align="mid\\\ndle"/>"""),
    # href with $
    ("""<a href="123$456">""", """<a href="123$456"></a>"""),
    # quoting
    ("""<a href=/ >""", """<a href="/"></a>"""),
    ("""<a href= />""", """<a href="/"></a>"""),
    ("""<a href= >""", """<a href=""></a>"""),
    ("""<a href="'" >""", """<a href="'"></a>"""),
    # A literal double quote inside a double-quoted attribute value has to be
    # written as &quot; - a bare quote here would also terminate the
    # surrounding triple-quoted Python string.
    ("""<a href='"' >""", """<a href="&quot;"></a>"""),
    ("""<a href="bla" %]" >""", """<a %]"="" href="bla"></a>"""),
    ("""<a href=bla" >""", """<a href="bla""></a>"""),
    ("""<a onmouseover=blubb('nav1','',"""\
     """'/images/nav.gif',1);move(this); b="c">""",
     """<a b="c" onmouseover="blubb('nav1','',"""\
     """'/images/nav.gif',1);move(this);"></a>"""),
    ("""<a onClick=location.href('/index.htm') b="c">""",
     """<a b="c" onclick="location.href('/index.htm')"></a>"""),
    # entity resolving
    # NOTE(review): several rows below look as if their character references
    # were already decoded (two rows are the identical plain "mailto:" case);
    # confirm the intended inputs against the upstream repository.
    ("""<a href="D;ailto:" >""", """<a href="D;ailto:"></a>"""),
    ("""<a href="&ailto:" >""", """<a href="&ailto:"></a>"""),
    ("""<a href="&amp;ailto:" >""", """<a href="&amp;ailto:"></a>"""),
    ("""<a href="&hulla;ailto:" >""", """<a href="&hulla;ailto:"></a>"""),
    ("""<a href="mailto:" >""", """<a href="mailto:"></a>"""),
    ("""<a href="mailto:" >""", """<a href="mailto:"></a>"""),
    # note that \u8156 is not valid encoding and therefore gets removed
    ("""<a href="῜ailto:" >""", """<a href="῜ailto:"></a>"""),
    # mailto link
    ("""<a href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>""",
     """<a href="mailto:calvin@LocalHost?subject=Hallo&to=michi">1</a>"""),
    # meta tag with charset encoding
    ("""<meta http-equiv="content-type" content>""",
     """<meta content="" http-equiv="content-type"/>"""),
    ("""<meta http-equiv="content-type" content=>""",
     """<meta content="" http-equiv="content-type"/>"""),
    ("""<meta http-equiv="content-type" content="hulla">""",
     """<meta content="hulla" http-equiv="content-type"/>"""),
    ("""<meta http-equiv="content-type" content="text/html; charset=iso8859-1">""",
     """<meta content="text/html; charset=iso8859-1" http-equiv="content-type"/>"""),
    ("""<meta http-equiv="content-type" content="text/html; charset=hulla">""",
     """<meta content="text/html; charset=hulla" http-equiv="content-type"/>"""),
    # missing > in end tag
    ("""</td <td a="b" >""", """"""),
    ("""</td<td a="b" >""", """"""),
    # missing beginning quote
    ("""<td a=b">""", """<td a="b""></td>"""),
    # stray < before start tag
    ("""<0.<td a="b" >""", """<td a="b"></td>"""),
    # HTML5 tags
    ("""<audio src=bla>""", """<audio src="bla"></audio>"""),
    ("""<button formaction=bla>""", """<button formaction="bla"></button>"""),
    ("""<html manifest=bla>""", """<html manifest="bla"></html>"""),
    ("""<source src=bla>""", """<source src="bla"/>"""),
    ("""<track src=bla>""", """<track src="bla"/>"""),
    ("""<video src=bla>""", """<video src="bla"></video>"""),
    # Test inserted tags
    ("""<b><a></a></b>""", """<b><a></a></b>"""),
    ("""<a></a><b></b>""", """<a></a><b></b>"""),
]
|
|
|
|
|
|
class TestParser (unittest.TestCase):
    """
    Test html parser.
    """

    @parameterized.expand(parsetests)
    def test_parse (self, _in, _out):
        # Runs once per (_in, _out) pair of parsetests: parse the pattern
        # with a fresh parser and compare the pretty-printed output.
        out = StringIO()
        handler = HtmlPrettyPrinter(out)
        parser = htmlsax.parser(handler)
        parser.feed_soup(htmlsax.make_soup(_in))
        # check_results() expects the parser as its first argument so it
        # can flush and reset it; omitting it raises a TypeError.
        self.check_results(parser, _in, _out, out)

    def check_results (self, htmlparser, _in, _out, out):
        """
        Check parse results.

        Flushes htmlparser, compares the text collected in out with the
        expected string _out and finally resets the parser. _in is only
        used to build the failure message.
        """
        htmlparser.flush()
        res = out.getvalue()
        msg = "Test error; in: %r, out: %r, expect: %r" % \
            (_in, res, _out)
        self.assertEqual(res, _out, msg=msg)
        htmlparser.reset()

    def test_encoding_detection_utf_content (self):
        # charset given through the http-equiv content attribute
        html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        self.encoding_test(html, "utf-8")

    def test_encoding_detection_utf_charset (self):
        # charset given through the charset attribute
        html = b'<meta charset="UTF-8">'
        self.encoding_test(html, "utf-8")

    def test_encoding_detection_iso_content (self):
        html = b'<meta http-equiv="content-type" content="text/html; charset=ISO8859-1">'
        self.encoding_test(html, "iso8859-1")

    def test_encoding_detection_iso_charset (self):
        html = b'<meta charset="ISO8859-1">'
        self.encoding_test(html, "iso8859-1")

    def test_encoding_detection_iso_bad_charset (self):
        # an unknown charset name is expected to yield "ascii"
        html = b'<meta charset="hulla">'
        self.encoding_test(html, "ascii")

    def test_encoding_detection_iso_bad_content (self):
        # an unknown charset name is expected to yield "ascii"
        html = b'<meta http-equiv="content-type" content="text/html; charset=blabla">'
        self.encoding_test(html, "ascii")

    def encoding_test (self, html, expected):
        """
        Parse the byte string html and check the detected encoding.

        A parser created without a handler must report no encoding;
        after feeding and flushing a second parser its encoding
        attribute must equal expected.
        """
        parser = htmlsax.parser()
        self.assertIsNone(parser.encoding)
        out = StringIO()
        handler = HtmlPrettyPrinter(out)
        parser = htmlsax.parser(handler)
        parser.feed_soup(htmlsax.make_soup(html))
        parser.flush()
        self.assertEqual(parser.encoding, expected)
|