linkchecker/tests/test_parser.py

202 lines
7.8 KiB
Python
Raw Permalink Normal View History

2012-03-30 20:24:10 +00:00
# Copyright (C) 2004-2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test html parsing.
"""
from linkcheck.htmlutil import htmlsoup
from io import StringIO
import pytest
from .htmllib import pretty_print_html
# Table of parser test cases: a list of 2-tuples
# (<test pattern>, <expected parse output>)
# Each pattern is HTML source text; the expected value is the string the
# test compares against the pretty-printed parse result.
parsetests = [
    # start tags
    ("""<a b="c" >""", """<a b="c"/>"""),
    ("""<a b='c' >""", """<a b="c"/>"""),
    ("""<a b=c" >""", """<a b="c&quot;"/>"""),
    ("""<a b=c' >""", """<a b="c'"/>"""),
    ("""<a b="" >""", """<a b=""/>"""),
    ("""<a b='' >""", """<a b=""/>"""),
    ("""<a b=>""", """<a b=""/>"""),
    ("""<a b= >""", """<a b=""/>"""),
    ("""<a =c>""", """<a =c=""/>"""),
    ("""<a =c >""", """<a =c=""/>"""),
    ("""<a =>""", """<a ==""/>"""),
    ("""<a = >""", """<a ==""/>"""),
    ("""<a b= "c" >""", """<a b="c"/>"""),
    ("""<a b ="c" >""", """<a b="c"/>"""),
    ("""<a b = "c" >""", """<a b="c"/>"""),
    ("""<a >""", """<a/>"""),
    ("""<>""", """"""),
    ("""< >""", """"""),
    # non-ASCII characters in tag names, attribute names and values
    ("""<aä>""", """<aä/>"""),
    ("""<a aä="b">""", """<a aä="b"/>"""),
    # the attribute value must contain "bä" for the expected "b&#228;"
    ("""<a a="bä">""", """<a a="b&#228;"/>"""),
    # multiple attribute names should be ignored...
    ("""<a b="c" b="c" >""", """<a b="c"/>"""),
    # ... but which one wins - in our implementation the last one
    ("""<a b="c" b="d" >""", """<a b="d"/>"""),
    # reduce test
    ("""<a b="c"><""", """<a b="c"><</a>"""),
    # numbers in tag
    ("""<h1>bla</h1>""", """<h1>bla</h1>"""),
    # more start tags
    ("""<a b=c"><a b="c">""", """<a b="c&quot;"/><a b="c"/>"""),
    ("""<a b=/c/></a><br>""", """<a b="/c/"/><br/>"""),
    ("""<br/>""", """<br/>"""),
    ("""<a b="50%"><br>""", """<a b="50%"/><br/>"""),
    # start and end tag (HTML doctype assumed)
    ("""<a/>""", """<a/>"""),
    ("""<meta/>""", """<meta/>"""),
    ("""<MetA/>""", """<meta/>"""),
    # line continuation (Dr. Fun webpage)
    ("""<img bo\\\nrder=0 >""", """<img bo\\="" rder="0"/>"""),
    ("""<img align="mid\\\ndle">""", """<img align="mid\\\ndle"/>"""),
    ("""<img align='mid\\\ndle'>""", """<img align="mid\\\ndle"/>"""),
    # href with $
    ("""<a href="123$456">""", """<a href="123$456"/>"""),
    # quoting
    ("""<a href=/ >""", """<a href="/"/>"""),
    ("""<a href= />""", """<a href="/"/>"""),
    ("""<a href= >""", """<a href=""/>"""),
    ("""<a href="'" >""", """<a href="'"/>"""),
    ("""<a href='"' >""", """<a href="&quot;"/>"""),
    ("""<a href="bla" %]" >""", """<a %]"="" href="bla"/>"""),
    ("""<a href=bla" >""", """<a href="bla&quot;"/>"""),
    (
        """<a onmouseover=blubb('nav1','',"""
        """'/images/nav.gif',1);move(this); b="c">""",
        """<a b="c" onmouseover="blubb('nav1','',"""
        """'/images/nav.gif',1);move(this);"/>""",
    ),
    (
        """<a onClick=location.href('/index.htm') b="c">""",
        """<a b="c" onclick="location.href('/index.htm')"/>""",
    ),
    # entity resolving
    ("""<a href="&#6D;ailto:" >""", """<a href="D;ailto:"/>"""),
    ("""<a href="&amp;ailto:" >""", """<a href="&amp;ailto:"/>"""),
    ("""<a href="&amp;amp;ailto:" >""", """<a href="&amp;amp;ailto:"/>"""),
    ("""<a href="&hulla;ailto:" >""", """<a href="&amp;hulla;ailto:"/>"""),
    ("""<a href="&#109;ailto:" >""", """<a href="mailto:"/>"""),
    ("""<a href="&#x6D;ailto:" >""", """<a href="mailto:"/>"""),
    # the expected output keeps &#8156; as a numeric character reference
    # (it is not removed)
    ("""<a href="&#8156;ailto:" >""", """<a href="&#8156;ailto:"/>"""),
    # mailto link
    (
        """<a href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>""",
        """<a href="mailto:calvin@LocalHost?subject=Hallo&amp;to=michi">1</a>""",
    ),
    # meta tag with charset encoding
    (
        """<meta http-equiv="content-type" content>""",
        """<meta content="" http-equiv="content-type"/>""",
    ),
    (
        """<meta http-equiv="content-type" content=>""",
        """<meta content="" http-equiv="content-type"/>""",
    ),
    (
        """<meta http-equiv="content-type" content="hulla">""",
        """<meta content="hulla" http-equiv="content-type"/>""",
    ),
    (
        """<meta http-equiv="content-type" content="text/html; charset=iso8859-1">""",
        """<meta content="text/html; charset=iso8859-1" http-equiv="content-type"/>""",
    ),
    (
        """<meta http-equiv="content-type" content="text/html; charset=hulla">""",
        """<meta content="text/html; charset=hulla" http-equiv="content-type"/>""",
    ),
    # missing > in end tag
    ("""</td <td a="b" >""", """"""),
    ("""</td<td a="b" >""", """"""),
    # missing beginning quote
    ("""<td a=b">""", """<td a="b&quot;"/>"""),
    # stray < before start tag
    ("""<0.<td a="b" >""", """<td a="b"/>"""),
    # HTML5 tags
    ("""<audio src=bla>""", """<audio src="bla"/>"""),
    ("""<button formaction=bla>""", """<button formaction="bla"/>"""),
    ("""<html manifest=bla>""", """<html manifest="bla"/>"""),
    ("""<source src=bla>""", """<source src="bla"/>"""),
    ("""<track src=bla>""", """<track src="bla"/>"""),
    ("""<video src=bla>""", """<video src="bla"/>"""),
    # Test inserted tags
    ("""<a></a><b></b>""", """<a/><b/>"""),
    # This is not correct result for an HTML parser, but it is for us
    ("""<b><a></a></b>""", """<b/><a/>"""),
]
class TestParser:
    """
    Test html parser.
    """

    @pytest.mark.parametrize("_in, _out", parsetests)
    def test_parse(self, _in, _out):
        """
        Parse one test pattern and compare its pretty-printed form.

        The method is parametrized over ``parsetests``, so each call
        handles a single (pattern, expected output) pair.
        """
        out = StringIO()
        pretty_print_html(out, htmlsoup.make_soup(_in))
        self.check_results(_in, _out, out)

    def check_results(self, _in, _out, out):
        """
        Check parse results.

        Asserts that the text collected in ``out`` equals ``_out``; the
        failure message reports the input, the actual and the expected text.
        """
        res = out.getvalue()
        msg = f"Test error; in: {_in!r}, out: {res!r}, expect: {_out!r}"
        assert res == _out, msg

    def test_encoding_detection_utf_content(self):
        """UTF-8 declared via a http-equiv content-type meta tag."""
        html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        self.encoding_test(html, "utf-8")

    def test_encoding_detection_utf_charset(self):
        """UTF-8 declared via a meta charset attribute."""
        html = b'<meta charset="UTF-8">'
        self.encoding_test(html, "utf-8")

    def test_encoding_detection_iso_content(self):
        """ISO8859-1 declared via a http-equiv content-type meta tag."""
        html = (
            b'<meta http-equiv="content-type" content="text/html; charset=ISO8859-1">'
        )
        self.encoding_test(html, "iso8859-1")

    def test_encoding_detection_iso_charset(self):
        """ISO8859-1 declared via a meta charset attribute."""
        html = b'<meta charset="ISO8859-1">'
        self.encoding_test(html, "iso8859-1")

    def test_encoding_detection_iso_bad_charset(self):
        """An unknown meta charset name is expected to yield "ascii"."""
        html = b'<meta charset="hulla">'
        self.encoding_test(html, "ascii")

    def test_encoding_detection_iso_bad_content(self):
        """An unknown charset in a content-type meta tag is expected to yield "ascii"."""
        html = b'<meta http-equiv="content-type" content="text/html; charset=blabla">'
        self.encoding_test(html, "ascii")

    def encoding_test(self, html, expected):
        """
        Build a soup from the bytes ``html`` and assert that its
        ``original_encoding`` attribute equals ``expected``.
        """
        # For encoding detection Beautiful Soup uses if available in order
        # of preference cchardet then chardet then charset-normalizer.
        # Results for html without a valid charset may differ
        # based on cchardet/chardet/charset-normalizer availability.
        soup = htmlsoup.make_soup(html)
        assert soup.original_encoding == expected