Merge pull request #364 from cjmayo/parser5

Stop using HTML handlers and improve login form error handling
This commit is contained in:
anarcat 2020-04-30 09:28:48 -04:00 committed by GitHub
commit ab476fa4bf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 90 additions and 153 deletions

View file

@ -26,11 +26,12 @@ import warnings
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
from io import BytesIO
import re
from .. import (log, LOG_CHECK, strformat, mimeutil,
url as urlutil, LinkCheckerError, httputil)
from . import (internpaturl, proxysupport)
from ..htmlutil import htmlsoup, linkparse
from ..htmlutil import htmlsoup
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
from requests.sessions import REDIRECT_STATI
@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://')
# helper alias
unicode_safe = strformat.unicode_safe
# match for robots meta element content attribute
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with http scheme.
@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if not self.is_html():
return True
# construct handler object
handler = linkparse.MetaRobotsFinder()
# parse
try:
htmlsoup.process_soup(handler, self.get_soup())
except linkparse.StopParse as msg:
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
pass
return handler.follow
soup = self.get_soup()
return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re})
def add_size_info (self):
"""Get size of URL content from HTTP header."""

View file

@ -15,35 +15,6 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser implemented using Beautiful Soup and html.parser.
USAGE
Two functions are provided, one to make a BeautifulSoup object from markup and
another to call a handler's callbacks for each element in a BeautifulSoup
object it can process.
The used callback of a handler is:
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
def start_element (tag, attrs, text, line, column)
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@param text: element text
@type text: string
@param line: tag line number
@type line: integer
@param column: tag column number
@type column: integer
EXAMPLE
# Create a new BeautifulSoup object.
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
# Process the soup with the chosen handler as a parameter.
htmlutil.htmlsoup.process_soup(handler, soup)
"""
from warnings import filterwarnings
@ -58,10 +29,3 @@ from bs4 import BeautifulSoup
def make_soup(markup, from_encoding=None):
    """Build a BeautifulSoup object from *markup* using the html.parser backend.

    @param markup: HTML markup to parse
    @param from_encoding: optional encoding of the markup, passed through
        to BeautifulSoup
    @return: parsed BeautifulSoup document

    Note: multi_valued_attributes=None keeps attributes such as ``class``
    as plain strings instead of lists.
    """
    options = {
        "from_encoding": from_encoding,
        "multi_valued_attributes": None,
    }
    return BeautifulSoup(markup, "html.parser", **options)
def process_soup(handler, soup):
for element in soup.find_all(True):
handler.start_element(
element.name, element.attrs, element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)

View file

@ -30,17 +30,17 @@ LinkTags = {
'a': [u'href'],
'applet': [u'archive', u'src'],
'area': [u'href'],
'audio': [u'src'], # HTML5
'audio': [u'src'], # HTML5
'bgsound': [u'src'],
'blockquote': [u'cite'],
'body': [u'background'],
'button': [u'formaction'], # HTML5
'button': [u'formaction'], # HTML5
'del': [u'cite'],
'embed': [u'pluginspage', u'src'],
'form': [u'action'],
'frame': [u'src', u'longdesc'],
'head': [u'profile'],
'html': [u'manifest'], # HTML5
'html': [u'manifest'], # HTML5
'iframe': [u'src', u'longdesc'],
'ilayer': [u'background'],
'img': [u'src', u'lowsrc', u'longdesc', u'usemap', u'srcset'],
@ -53,13 +53,13 @@ LinkTags = {
'object': [u'classid', u'data', u'archive', u'usemap', u'codebase'],
'q': [u'cite'],
'script': [u'src'],
'source': [u'src'], # HTML5
'source': [u'src'], # HTML5
'table': [u'background'],
'td': [u'background'],
'th': [u'background'],
'tr': [u'background'],
'track': [u'src'], # HTML5
'video': [u'src'], # HTML5
'track': [u'src'], # HTML5
'video': [u'src'], # HTML5
'xmp': [u'href'],
None: [u'style', u'itemtype'],
}
@ -98,44 +98,6 @@ def strip_c_comments (text):
return c_comment_re.sub('', text)
class StopParse(Exception):
    """Raised when parsing should stop early (e.g. once the sought
    element has been found or can no longer occur)."""
    # NOTE: the redundant `pass` after the docstring was removed; the
    # docstring alone is a complete class body.
class TagFinder(object):
    """Base class handling HTML start elements.

    TagFinder instances are used as HTML parser handlers: subclasses
    override start_element() to react to each parsed tag."""

    def __init__(self):
        """Initialize local variables."""
        # Use the zero-argument py3 super() form instead of the legacy
        # super(TagFinder, self) spelling.
        super().__init__()

    def start_element(self, tag, attrs, element_text, lineno, column):
        """Does nothing, override in a subclass.

        @param tag: tag name
        @param attrs: tag attributes (dict)
        @param element_text: element text
        @param lineno: tag line number
        @param column: tag column number (may be None)
        """
        pass
class MetaRobotsFinder(TagFinder):
    """Class for finding robots meta directives in HTML.

    After parsing, self.follow / self.index reflect the absence of the
    "nofollow" / "noindex" directives in <meta name="robots">."""

    def __init__(self):
        """Initialize follow and index flags."""
        super(MetaRobotsFinder, self).__init__()
        log.debug(LOG_CHECK, "meta robots finder")
        self.follow = self.index = True

    def start_element(self, tag, attrs, element_text, lineno, column):
        """Search for meta robots "nofollow" and "noindex" flags."""
        if tag == 'meta' and attrs.get('name') == 'robots':
            # The content attribute is a comma-separated list of directives
            # whose surrounding whitespace is insignificant, so each token
            # must be stripped.  Without the strip, a common value like
            # "noindex, nofollow" split to [' nofollow'] and the nofollow
            # directive was silently missed.
            val = [v.strip() for v in
                   attrs.get('content', u'').lower().split(u',')]
            self.follow = u'nofollow' not in val
            self.index = u'noindex' not in val
            raise StopParse("found <meta name=robots> tag")
        elif tag == 'body':
            # Meta elements only occur in <head>; stop once <body> starts.
            raise StopParse("found <body> tag")
def is_meta_url (attr, attrs):
"""Check if the meta attributes contain a URL."""
res = False
@ -158,24 +120,23 @@ def is_form_get(attr, attrs):
return res
class LinkFinder (TagFinder):
class LinkFinder:
"""Find HTML links, and apply them to the callback function with the
format (url, lineno, column, name, codebase)."""
def __init__ (self, callback, tags):
"""Store content in buffer and initialize URL list."""
super(LinkFinder, self).__init__()
self.callback = callback
# set universal tag attributes using tagname None
self.universal_attrs = set(tags.get(None, []))
self.tags = dict()
for tag, attrs in tags.items():
for tag, attrs in tags.items():
self.tags[tag] = set(attrs)
# add universal tag attributes
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
def start_element (self, tag, attrs, element_text, lineno, column):
def html_element (self, tag, attrs, element_text, lineno, column):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d", lineno, column)
@ -192,7 +153,7 @@ class LinkFinder (TagFinder):
name = self.get_link_name(tag, attrs, attr, element_text)
# possible codebase
base = u''
if tag == 'applet':
if tag == 'applet':
base = attrs.get('codebase', u'')
if not base:
base = self.base_ref
@ -251,3 +212,15 @@ class LinkFinder (TagFinder):
"""Add newly found URL to queue."""
assert isinstance(url, str_text) or url is None, repr(url)
self.callback(url, line=lineno, column=column, name=name, base=base)
def find_links(soup, callback, tags):
    """Parse content and search for URLs to check.

    When a URL is found it is passed to the supplied callback.

    @param soup: BeautifulSoup document to scan
    @param callback: function called as
        callback(url, line=..., column=..., name=..., base=...)
    @param tags: mapping of tag name -> link attribute names
    """
    finder = LinkFinder(callback, tags)
    for tag in soup.find_all(True):
        # sourcepos is zero-based; report one-based columns when known.
        pos = tag.sourcepos
        column = pos + 1 if pos is not None else None
        finder.html_element(
            tag.name, tag.attrs, tag.text.strip(), tag.sourceline, column)

View file

@ -18,7 +18,7 @@
Main functions for link parsing
"""
from .. import log, LOG_CHECK, strformat, url as urlutil
from ..htmlutil import htmlsoup, linkparse
from ..htmlutil import linkparse
from ..bookmarks import firefox
@ -46,7 +46,7 @@ def parse_html (url_data):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, linkparse.LinkTags)
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.LinkTags)
def parse_opera (url_data):
@ -112,15 +112,7 @@ def parse_wml (url_data):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, linkparse.WmlTags)
def find_links (url_data, callback, tags):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
handler = linkparse.LinkFinder(callback, tags)
htmlsoup.process_soup(handler, url_data.get_soup())
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.WmlTags)
def parse_firefox (url_data):

View file

@ -22,7 +22,6 @@ from urllib import parse
from . import _ContentPlugin
from .. import log, LOG_PLUGIN
from ..htmlutil import linkparse
from ..parser import find_links
class AnchorCheck(_ContentPlugin):
@ -37,7 +36,8 @@ class AnchorCheck(_ContentPlugin):
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
# list of parsed anchors
self.anchors = []
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
linkparse.find_links(url_data.get_soup(), self.add_anchor,
linkparse.AnchorTags)
self.check_anchor(url_data)
def add_anchor (self, url, line, column, name, base):

View file

@ -1,2 +1,2 @@
<meta name="robots" content="nofollow">
<meta name="robots" content="noindex, Nofollow">
<a href="do_not_check.html">bla</a>

View file

@ -16,10 +16,19 @@
"""
Test that <meta name="robots" content="nofollow"> is respected when using http
and ignored when checking a local file.
Also test different values of the content attribute are correctly matched.
"""
import unittest
import linkcheck.configuration
import linkcheck.director
from linkcheck.htmlutil.htmlsoup import make_soup
from . import get_url_from
from . import LinkCheckTest
from .httpserver import HttpServerTest
class TestHttpMetaRobots(HttpServerTest):
"""Test <meta name="robots" content="nofollow"> using http."""
@ -33,6 +42,7 @@ class TestHttpMetaRobots(HttpServerTest):
]
self.direct(url, resultlines, recursionlevel=1)
class TestFileMetaRobots(LinkCheckTest):
"""Test <meta name="robots" content="nofollow"> from a file."""
@ -52,3 +62,23 @@ class TestFileMetaRobots(LinkCheckTest):
"error"
]
self.direct(url, resultlines, recursionlevel=1)
class TestMetaRobotsVariants(unittest.TestCase):
    """Test different values of the robots meta directive content attribute."""

    def test_nofollow_variants(self):
        # Build a minimal url_data object and feed it hand-made soups so
        # content_allows_robots() can be exercised without any network I/O.
        config = linkcheck.configuration.Configuration()
        aggregate = linkcheck.director.get_aggregate(config)
        url_data = get_url_from("http://example.org", 0, aggregate)
        url_data.content_type = "text/html"
        # Both plain and comma-separated, mixed-case directives must match.
        denied = (
            '<meta name="robots" content="nofollow">',
            '<meta name="robots" content="nocache, Nofollow, noimageindex">',
        )
        for markup in denied:
            url_data.soup = make_soup(markup)
            self.assertFalse(url_data.content_allows_robots())
        # "noindex" alone must not block following.
        url_data.soup = make_soup('<meta name="robots" content="noindex, follow">')
        self.assertTrue(url_data.content_allows_robots())

View file

@ -15,50 +15,34 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML parser handler test class.
HTML parser test function.
"""
import sys
class HtmlPrettyPrinter:
def pretty_print_html(fd, soup):
"""
Print out all parsed HTML data in encoded form.
Also stores error and warnings messages.
Print out all parsed HTML data,
writing to the given file descriptor.
@param fd: file like object
@type fd: file
@param soup: BeautifulSoup object
@type soup: BeautifulSoup
"""
for element in soup.find_all(True):
tag = element.name
element_text = element.text.strip()
def __init__ (self, fd=sys.stdout, encoding="iso8859-1"):
"""
Write to given file descriptor in given encoding.
@param fd: file like object (default=sys.stdout)
@type fd: file
@param encoding: encoding (default=iso8859-1)
@type encoding: string
"""
self.fd = fd
self.encoding = encoding
def start_element (self, tag, attrs, element_text, lineno, column):
"""
Print HTML start element.
@param tag: tag name
@type tag: string
@param attrs: tag attributes
@type attrs: dict
@return: None
"""
self.fd.write("<%s" % tag.replace("/", ""))
for key, val in sorted(attrs.items()):
fd.write("<%s" % tag.replace("/", ""))
for key, val in sorted(element.attrs.items()):
if val is None:
self.fd.write(" %s" % key)
fd.write(" %s" % key)
else:
self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
fd.write(' %s="%s"' % (key, quote_attrval(val)))
if element_text:
self.fd.write(">%s</%s>" % (element_text, tag))
fd.write(">%s</%s>" % (element_text, tag))
else:
self.fd.write("/>")
fd.write("/>")
def quote_attrval (s):

View file

@ -29,8 +29,8 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
linkparse.find_links(htmlsoup.make_soup(content),
self._test_one_url(url), linkparse.LinkTags)
self.assertEqual(self.count_url, 1)
def _test_one_url (self, origurl):
@ -43,8 +43,8 @@ class TestLinkparser (unittest.TestCase):
def _test_no_link (self, content):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
linkparse.find_links(htmlsoup.make_soup(content), callback,
linkparse.LinkTags)
def test_href_parsing (self):
# Test <a href> parsing.

View file

@ -25,7 +25,7 @@ import unittest
from parameterized import parameterized
from .htmllib import HtmlPrettyPrinter
from .htmllib import pretty_print_html
# list of tuples
# (<test pattern>, <expected parse output>)
@ -142,8 +142,7 @@ class TestParser (unittest.TestCase):
def test_parse (self, _in, _out):
# Parse all test patterns in one go.
out = StringIO()
handler = HtmlPrettyPrinter(out)
htmlsoup.process_soup(handler, htmlsoup.make_soup(_in))
pretty_print_html(out, htmlsoup.make_soup(_in))
self.check_results(_in, _out, out)
def check_results (self, _in, _out, out):
@ -180,8 +179,5 @@ class TestParser (unittest.TestCase):
self.encoding_test(html, "ascii")
def encoding_test (self, html, expected):
out = StringIO()
handler = HtmlPrettyPrinter(out)
soup = htmlsoup.make_soup(html)
htmlsoup.process_soup(handler, soup)
self.assertEqual(soup.original_encoding, expected)