mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-10 01:21:00 +00:00
Merge pull request #364 from cjmayo/parser5
Stop using HTML handlers and improve login form error handling
This commit is contained in:
commit
ab476fa4bf
10 changed files with 90 additions and 153 deletions
|
|
@ -26,11 +26,12 @@ import warnings
|
|||
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
from io import BytesIO
|
||||
import re
|
||||
|
||||
from .. import (log, LOG_CHECK, strformat, mimeutil,
|
||||
url as urlutil, LinkCheckerError, httputil)
|
||||
from . import (internpaturl, proxysupport)
|
||||
from ..htmlutil import htmlsoup, linkparse
|
||||
from ..htmlutil import htmlsoup
|
||||
# import warnings
|
||||
from .const import WARN_HTTP_EMPTY_CONTENT
|
||||
from requests.sessions import REDIRECT_STATI
|
||||
|
|
@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://')
|
|||
# helper alias
|
||||
unicode_safe = strformat.unicode_safe
|
||||
|
||||
# match for robots meta element content attribute
|
||||
nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE)
|
||||
|
||||
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
||||
"""
|
||||
Url link with http scheme.
|
||||
|
|
@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""
|
||||
if not self.is_html():
|
||||
return True
|
||||
# construct handler object
|
||||
handler = linkparse.MetaRobotsFinder()
|
||||
# parse
|
||||
try:
|
||||
htmlsoup.process_soup(handler, self.get_soup())
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
return handler.follow
|
||||
|
||||
soup = self.get_soup()
|
||||
return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re})
|
||||
|
||||
def add_size_info (self):
|
||||
"""Get size of URL content from HTTP header."""
|
||||
|
|
|
|||
|
|
@ -15,35 +15,6 @@
|
|||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
HTML parser implemented using Beautiful Soup and html.parser.
|
||||
|
||||
USAGE
|
||||
|
||||
Two functions are provided, one to make a BeautifulSoup object from markup and
|
||||
another to call a handler's callbacks for each element in a BeautifulSoup
|
||||
object it can process.
|
||||
|
||||
The used callback of a handler is:
|
||||
|
||||
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
|
||||
def start_element (tag, attrs, text, line, column)
|
||||
@param tag: tag name
|
||||
@type tag: string
|
||||
@param attrs: tag attributes
|
||||
@type attrs: dict
|
||||
@param text: element text
|
||||
@type text: string
|
||||
@param line: tag line number
|
||||
@type line: integer
|
||||
@param column: tag column number
|
||||
@type column: integer
|
||||
|
||||
EXAMPLE
|
||||
|
||||
# Create a new BeautifulSoup object.
|
||||
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
|
||||
# Process the soup with the chosen handler as a parameter.
|
||||
htmlutil.htmlsoup.process_soup(handler, soup)
|
||||
|
||||
"""
|
||||
|
||||
from warnings import filterwarnings
|
||||
|
|
@ -58,10 +29,3 @@ from bs4 import BeautifulSoup
|
|||
def make_soup(markup, from_encoding=None):
    """Build a BeautifulSoup object from markup with the "html.parser" builder.

    @param markup: document content handed to BeautifulSoup
    @param from_encoding: passed through unchanged as BeautifulSoup's
        from_encoding argument (default None)
    @return: the constructed BeautifulSoup object
    """
    # multi_valued_attributes=None is forwarded as-is; per the Beautiful
    # Soup documentation this keeps attributes such as "class" as plain
    # strings instead of lists.
    options = {
        "from_encoding": from_encoding,
        "multi_valued_attributes": None,
    }
    return BeautifulSoup(markup, "html.parser", **options)
|
||||
|
||||
def process_soup(handler, soup):
    """Call the handler's start_element callback for each element in soup.

    Every element returned by ``soup.find_all(True)`` is reported with its
    name, attributes, stripped text, source line and column. The column is
    ``sourcepos + 1``, or None when the element has no source position.

    @param handler: object providing start_element(tag, attrs, text,
        line, column)
    @param soup: object providing find_all(), e.g. a BeautifulSoup object
    """
    for node in soup.find_all(True):
        position = node.sourcepos
        column = position + 1 if position is not None else None
        handler.start_element(
            node.name, node.attrs, node.text.strip(), node.sourceline, column)
|
||||
|
|
|
|||
|
|
@ -30,17 +30,17 @@ LinkTags = {
|
|||
'a': [u'href'],
|
||||
'applet': [u'archive', u'src'],
|
||||
'area': [u'href'],
|
||||
'audio': [u'src'], # HTML5
|
||||
'audio': [u'src'], # HTML5
|
||||
'bgsound': [u'src'],
|
||||
'blockquote': [u'cite'],
|
||||
'body': [u'background'],
|
||||
'button': [u'formaction'], # HTML5
|
||||
'button': [u'formaction'], # HTML5
|
||||
'del': [u'cite'],
|
||||
'embed': [u'pluginspage', u'src'],
|
||||
'form': [u'action'],
|
||||
'frame': [u'src', u'longdesc'],
|
||||
'head': [u'profile'],
|
||||
'html': [u'manifest'], # HTML5
|
||||
'html': [u'manifest'], # HTML5
|
||||
'iframe': [u'src', u'longdesc'],
|
||||
'ilayer': [u'background'],
|
||||
'img': [u'src', u'lowsrc', u'longdesc', u'usemap', u'srcset'],
|
||||
|
|
@ -53,13 +53,13 @@ LinkTags = {
|
|||
'object': [u'classid', u'data', u'archive', u'usemap', u'codebase'],
|
||||
'q': [u'cite'],
|
||||
'script': [u'src'],
|
||||
'source': [u'src'], # HTML5
|
||||
'source': [u'src'], # HTML5
|
||||
'table': [u'background'],
|
||||
'td': [u'background'],
|
||||
'th': [u'background'],
|
||||
'tr': [u'background'],
|
||||
'track': [u'src'], # HTML5
|
||||
'video': [u'src'], # HTML5
|
||||
'track': [u'src'], # HTML5
|
||||
'video': [u'src'], # HTML5
|
||||
'xmp': [u'href'],
|
||||
None: [u'style', u'itemtype'],
|
||||
}
|
||||
|
|
@ -98,44 +98,6 @@ def strip_c_comments (text):
|
|||
return c_comment_re.sub('', text)
|
||||
|
||||
|
||||
class StopParse(Exception):
    """Raised when parsing should stop."""


class TagFinder(object):
    """Base class handling HTML start elements.

    TagFinder instances are used as HTML parser handlers."""

    def __init__(self):
        """Initialize local variables."""
        super(TagFinder, self).__init__()

    def start_element(self, tag, attrs, element_text, lineno, column):
        """Does nothing, override in a subclass."""
        pass


class MetaRobotsFinder(TagFinder):
    """Class for finding robots meta element values in HTML.

    After parsing, the ``follow`` and ``index`` attributes tell whether
    the document's <meta name="robots"> element permits following links
    and indexing. Both stay True when no such element is seen.
    """

    def __init__(self):
        """Initialize follow and index flags."""
        super(MetaRobotsFinder, self).__init__()
        log.debug(LOG_CHECK, "meta robots finder")
        self.follow = self.index = True

    def start_element(self, tag, attrs, element_text, lineno, column):
        """Search for meta robots "nofollow" and "noindex" flags.

        @param tag: tag name
        @param attrs: tag attributes (dict)
        @raise StopParse: once a <meta name="robots"> element has been
            evaluated, or when the <body> element is reached
        """
        if tag == 'meta' and attrs.get('name') == 'robots':
            content = attrs.get('content') or u''
            # Directives are comma separated and usually written with
            # surrounding whitespace ("noindex, nofollow"), so each one
            # is stripped before comparing.
            directives = {
                item.strip() for item in content.lower().split(u',')
            }
            self.follow = u'nofollow' not in directives
            self.index = u'noindex' not in directives
            raise StopParse("found <meta name=robots> tag")
        elif tag == 'body':
            raise StopParse("found <body> tag")
|
||||
|
||||
|
||||
def is_meta_url (attr, attrs):
|
||||
"""Check if the meta attributes contain a URL."""
|
||||
res = False
|
||||
|
|
@ -158,24 +120,23 @@ def is_form_get(attr, attrs):
|
|||
return res
|
||||
|
||||
|
||||
class LinkFinder (TagFinder):
|
||||
class LinkFinder:
|
||||
"""Find HTML links, and apply them to the callback function with the
|
||||
format (url, lineno, column, name, codebase)."""
|
||||
|
||||
def __init__ (self, callback, tags):
|
||||
"""Store content in buffer and initialize URL list."""
|
||||
super(LinkFinder, self).__init__()
|
||||
self.callback = callback
|
||||
# set universal tag attributes using tagname None
|
||||
self.universal_attrs = set(tags.get(None, []))
|
||||
self.tags = dict()
|
||||
for tag, attrs in tags.items():
|
||||
for tag, attrs in tags.items():
|
||||
self.tags[tag] = set(attrs)
|
||||
# add universal tag attributes
|
||||
self.tags[tag].update(self.universal_attrs)
|
||||
self.base_ref = u''
|
||||
|
||||
def start_element (self, tag, attrs, element_text, lineno, column):
|
||||
def html_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""Search for links and store found URLs in a list."""
|
||||
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
||||
log.debug(LOG_CHECK, "line %d col %d", lineno, column)
|
||||
|
|
@ -192,7 +153,7 @@ class LinkFinder (TagFinder):
|
|||
name = self.get_link_name(tag, attrs, attr, element_text)
|
||||
# possible codebase
|
||||
base = u''
|
||||
if tag == 'applet':
|
||||
if tag == 'applet':
|
||||
base = attrs.get('codebase', u'')
|
||||
if not base:
|
||||
base = self.base_ref
|
||||
|
|
@ -251,3 +212,15 @@ class LinkFinder (TagFinder):
|
|||
"""Add newly found URL to queue."""
|
||||
assert isinstance(url, str_text) or url is None, repr(url)
|
||||
self.callback(url, line=lineno, column=column, name=name, base=base)
|
||||
|
||||
|
||||
def find_links(soup, callback, tags):
    """Walk all elements of soup and search them for URLs to check.

    A LinkFinder is created with the given callback and tag table and
    fed every element returned by ``soup.find_all(True)``.
    When a URL is found it is passed to the supplied callback.

    @param soup: parsed document providing find_all()
    @param callback: function receiving each found URL
    @param tags: mapping of tag names to the attributes holding URLs
    """
    finder = LinkFinder(callback, tags)
    for node in soup.find_all(True):
        position = node.sourcepos
        # the column is sourcepos + 1; None when no position is available
        column = position + 1 if position is not None else None
        finder.html_element(
            node.name, node.attrs, node.text.strip(), node.sourceline, column)
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
Main functions for link parsing
|
||||
"""
|
||||
from .. import log, LOG_CHECK, strformat, url as urlutil
|
||||
from ..htmlutil import htmlsoup, linkparse
|
||||
from ..htmlutil import linkparse
|
||||
from ..bookmarks import firefox
|
||||
|
||||
|
||||
|
|
@ -46,7 +46,7 @@ def parse_html (url_data):
|
|||
"""Parse into HTML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
find_links(url_data, url_data.add_url, linkparse.LinkTags)
|
||||
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.LinkTags)
|
||||
|
||||
|
||||
def parse_opera (url_data):
|
||||
|
|
@ -112,15 +112,7 @@ def parse_wml (url_data):
|
|||
"""Parse into WML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
find_links(url_data, url_data.add_url, linkparse.WmlTags)
|
||||
|
||||
|
||||
def find_links (url_data, callback, tags):
    """Search the parsed content of url_data for URLs to check.

    A linkparse.LinkFinder built from callback and tags is run over
    url_data.get_soup(); found URLs are handed to the callback.
    """
    htmlsoup.process_soup(
        linkparse.LinkFinder(callback, tags), url_data.get_soup())
|
||||
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.WmlTags)
|
||||
|
||||
|
||||
def parse_firefox (url_data):
|
||||
|
|
|
|||
|
|
@ -22,7 +22,6 @@ from urllib import parse
|
|||
from . import _ContentPlugin
|
||||
from .. import log, LOG_PLUGIN
|
||||
from ..htmlutil import linkparse
|
||||
from ..parser import find_links
|
||||
|
||||
|
||||
class AnchorCheck(_ContentPlugin):
|
||||
|
|
@ -37,7 +36,8 @@ class AnchorCheck(_ContentPlugin):
|
|||
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
|
||||
# list of parsed anchors
|
||||
self.anchors = []
|
||||
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
|
||||
linkparse.find_links(url_data.get_soup(), self.add_anchor,
|
||||
linkparse.AnchorTags)
|
||||
self.check_anchor(url_data)
|
||||
|
||||
def add_anchor (self, url, line, column, name, base):
|
||||
|
|
|
|||
|
|
@ -1,2 +1,2 @@
|
|||
<meta name="robots" content="nofollow">
|
||||
<meta name="robots" content="noindex, Nofollow">
|
||||
<a href="do_not_check.html">bla</a>
|
||||
|
|
|
|||
|
|
@ -16,10 +16,19 @@
|
|||
"""
|
||||
Test that <meta name="robots" content="nofollow"> is respected when using http
|
||||
and ignored when checking a local file.
|
||||
Also test different values of the content attribute are correctly matched.
|
||||
"""
|
||||
import unittest
|
||||
|
||||
import linkcheck.configuration
|
||||
import linkcheck.director
|
||||
from linkcheck.htmlutil.htmlsoup import make_soup
|
||||
from . import get_url_from
|
||||
|
||||
from . import LinkCheckTest
|
||||
from .httpserver import HttpServerTest
|
||||
|
||||
|
||||
class TestHttpMetaRobots(HttpServerTest):
|
||||
"""Test <meta name="robots" content="nofollow"> using http."""
|
||||
|
||||
|
|
@ -33,6 +42,7 @@ class TestHttpMetaRobots(HttpServerTest):
|
|||
]
|
||||
self.direct(url, resultlines, recursionlevel=1)
|
||||
|
||||
|
||||
class TestFileMetaRobots(LinkCheckTest):
|
||||
"""Test <meta name="robots" content="nofollow"> from a file."""
|
||||
|
||||
|
|
@ -52,3 +62,23 @@ class TestFileMetaRobots(LinkCheckTest):
|
|||
"error"
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=1)
|
||||
|
||||
|
||||
class TestMetaRobotsVariants(unittest.TestCase):
    """Test different values of the robots meta directive content attribute"""

    def test_nofollow_variants(self):
        """Check content_allows_robots() for several content values."""
        aggregate = linkcheck.director.get_aggregate(
            linkcheck.configuration.Configuration())
        url_data = get_url_from("http://example.org", 0, aggregate)
        url_data.content_type = "text/html"

        # (markup, whether robots are expected to be allowed)
        cases = (
            ('<meta name="robots" content="nofollow">', False),
            ('<meta name="robots" content="nocache, Nofollow, noimageindex">',
             False),
            ('<meta name="robots" content="noindex, follow">', True),
        )
        for markup, allowed in cases:
            url_data.soup = make_soup(markup)
            check = self.assertTrue if allowed else self.assertFalse
            check(url_data.content_allows_robots())
|
||||
|
|
|
|||
|
|
@ -15,50 +15,34 @@
|
|||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
HTML parser handler test class.
|
||||
HTML parser test function.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
class HtmlPrettyPrinter:
|
||||
def pretty_print_html(fd, soup):
|
||||
"""
|
||||
Print out all parsed HTML data in encoded form.
|
||||
Also stores error and warnings messages.
|
||||
Print out all parsed HTML data,
|
||||
writing to the given file descriptor.
|
||||
|
||||
@param fd: file like object
|
||||
@type fd: file
|
||||
@param soup: BeautifulSoup object
|
||||
@type soup: BeautifulSoup
|
||||
"""
|
||||
for element in soup.find_all(True):
|
||||
tag = element.name
|
||||
element_text = element.text.strip()
|
||||
|
||||
def __init__ (self, fd=sys.stdout, encoding="iso8859-1"):
|
||||
"""
|
||||
Write to given file descriptor in given encoding.
|
||||
|
||||
@param fd: file like object (default=sys.stdout)
|
||||
@type fd: file
|
||||
@param encoding: encoding (default=iso8859-1)
|
||||
@type encoding: string
|
||||
"""
|
||||
self.fd = fd
|
||||
self.encoding = encoding
|
||||
|
||||
def start_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""
|
||||
Print HTML start element.
|
||||
|
||||
@param tag: tag name
|
||||
@type tag: string
|
||||
@param attrs: tag attributes
|
||||
@type attrs: dict
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write("<%s" % tag.replace("/", ""))
|
||||
for key, val in sorted(attrs.items()):
|
||||
fd.write("<%s" % tag.replace("/", ""))
|
||||
for key, val in sorted(element.attrs.items()):
|
||||
if val is None:
|
||||
self.fd.write(" %s" % key)
|
||||
fd.write(" %s" % key)
|
||||
else:
|
||||
self.fd.write(' %s="%s"' % (key, quote_attrval(val)))
|
||||
fd.write(' %s="%s"' % (key, quote_attrval(val)))
|
||||
if element_text:
|
||||
self.fd.write(">%s</%s>" % (element_text, tag))
|
||||
fd.write(">%s</%s>" % (element_text, tag))
|
||||
else:
|
||||
self.fd.write("/>")
|
||||
fd.write("/>")
|
||||
|
||||
|
||||
def quote_attrval (s):
|
||||
|
|
|
|||
|
|
@ -29,8 +29,8 @@ class TestLinkparser (unittest.TestCase):
|
|||
|
||||
def _test_one_link (self, content, url):
|
||||
self.count_url = 0
|
||||
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
|
||||
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
|
||||
linkparse.find_links(htmlsoup.make_soup(content),
|
||||
self._test_one_url(url), linkparse.LinkTags)
|
||||
self.assertEqual(self.count_url, 1)
|
||||
|
||||
def _test_one_url (self, origurl):
|
||||
|
|
@ -43,8 +43,8 @@ class TestLinkparser (unittest.TestCase):
|
|||
def _test_no_link (self, content):
|
||||
def callback (url, line, column, name, base):
|
||||
self.assertTrue(False, 'URL %r found' % url)
|
||||
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
|
||||
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
|
||||
linkparse.find_links(htmlsoup.make_soup(content), callback,
|
||||
linkparse.LinkTags)
|
||||
|
||||
def test_href_parsing (self):
|
||||
# Test <a href> parsing.
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ import unittest
|
|||
|
||||
from parameterized import parameterized
|
||||
|
||||
from .htmllib import HtmlPrettyPrinter
|
||||
from .htmllib import pretty_print_html
|
||||
|
||||
# list of tuples
|
||||
# (<test pattern>, <expected parse output>)
|
||||
|
|
@ -142,8 +142,7 @@ class TestParser (unittest.TestCase):
|
|||
def test_parse (self, _in, _out):
|
||||
# Parse all test patterns in one go.
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
htmlsoup.process_soup(handler, htmlsoup.make_soup(_in))
|
||||
pretty_print_html(out, htmlsoup.make_soup(_in))
|
||||
self.check_results(_in, _out, out)
|
||||
|
||||
def check_results (self, _in, _out, out):
|
||||
|
|
@ -180,8 +179,5 @@ class TestParser (unittest.TestCase):
|
|||
self.encoding_test(html, "ascii")
|
||||
|
||||
def encoding_test (self, html, expected):
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
soup = htmlsoup.make_soup(html)
|
||||
htmlsoup.process_soup(handler, soup)
|
||||
self.assertEqual(soup.original_encoding, expected)
|
||||
|
|
|
|||
Loading…
Reference in a new issue