diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index e891402e..2752e3a8 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -26,11 +26,12 @@ import warnings warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning) from io import BytesIO +import re from .. import (log, LOG_CHECK, strformat, mimeutil, url as urlutil, LinkCheckerError, httputil) from . import (internpaturl, proxysupport) -from ..htmlutil import htmlsoup, linkparse +from ..htmlutil import htmlsoup # import warnings from .const import WARN_HTTP_EMPTY_CONTENT from requests.sessions import REDIRECT_STATI @@ -42,6 +43,9 @@ HTTP_SCHEMAS = ('http://', 'https://') # helper alias unicode_safe = strformat.unicode_safe +# match for robots meta element content attribute +nofollow_re = re.compile(r"\bnofollow\b", re.IGNORECASE) + class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): """ Url link with http scheme. @@ -78,15 +82,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): """ if not self.is_html(): return True - # construct handler object - handler = linkparse.MetaRobotsFinder() - # parse - try: - htmlsoup.process_soup(handler, self.get_soup()) - except linkparse.StopParse as msg: - log.debug(LOG_CHECK, "Stopped parsing: %s", msg) - pass - return handler.follow + + soup = self.get_soup() + return not soup.find("meta", attrs={"name": "robots", "content": nofollow_re}) def add_size_info (self): """Get size of URL content from HTTP header.""" diff --git a/linkcheck/htmlutil/htmlsoup.py b/linkcheck/htmlutil/htmlsoup.py index 921703cf..e5e18799 100644 --- a/linkcheck/htmlutil/htmlsoup.py +++ b/linkcheck/htmlutil/htmlsoup.py @@ -15,35 +15,6 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ HTML parser implemented using Beautiful Soup and html.parser. - -USAGE - -Two functions are provided, one to make a BeautifulSoup object from markup and -another to call a handler's callbacks for each element in a BeautifulSoup -object it can process. - -The used callback of a handler is: - -- Start tag: - def start_element (tag, attrs, text, line, column) - @param tag: tag name - @type tag: string - @param attrs: tag attributes - @type attrs: dict - @param text: element text - @type tag: string - @param line: tag line number - @type tag: integer - @param column: tag column number - @type tag: integer - -EXAMPLE - - # Create a new BeautifulSoup object. - soup = htmlutil.htmlsoup.make_soup("Blubb") - # Process the soup with the chosen handler as a parameter. - htmlutil.htmlsoup.proces_soup(handler, soup) - """ from warnings import filterwarnings @@ -58,10 +29,3 @@ from bs4 import BeautifulSoup def make_soup(markup, from_encoding=None): return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding, multi_valued_attributes=None) - -def process_soup(handler, soup): - for element in soup.find_all(True): - handler.start_element( - element.name, element.attrs, element.text.strip(), - element.sourceline, - None if element.sourcepos is None else element.sourcepos + 1) diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index 0f5bc7ef..17709fc2 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -30,17 +30,17 @@ LinkTags = { 'a': [u'href'], 'applet': [u'archive', u'src'], 'area': [u'href'], - 'audio': [u'src'], # HTML5 + 'audio': [u'src'], # HTML5 'bgsound': [u'src'], 'blockquote': [u'cite'], 'body': [u'background'], - 'button': [u'formaction'], # HTML5 + 'button': [u'formaction'], # HTML5 'del': [u'cite'], 'embed': [u'pluginspage', u'src'], 'form': [u'action'], 'frame': [u'src', u'longdesc'], 'head': [u'profile'], - 'html': [u'manifest'], # HTML5 + 'html': [u'manifest'], # HTML5 'iframe': [u'src', u'longdesc'], 'ilayer': [u'background'], 'img': [u'src', u'lowsrc', u'longdesc', u'usemap', u'srcset'], @@ -53,13 +53,13 @@ LinkTags = { 'object': [u'classid', u'data', u'archive', u'usemap', u'codebase'], 'q': [u'cite'], 'script': [u'src'], - 'source': [u'src'], # HTML5 + 'source': [u'src'], # HTML5 'table': [u'background'], 'td': [u'background'], 'th': [u'background'], 'tr': [u'background'], - 'track': [u'src'], # HTML5 - 'video': [u'src'], # HTML5 + 'track': [u'src'], # HTML5 + 'video': [u'src'], # HTML5 'xmp': [u'href'], None: [u'style', u'itemtype'], } @@ -98,44 +98,6 @@ def strip_c_comments (text): return c_comment_re.sub('', text) -class StopParse(Exception): - """Raised when parsing should stop.""" - pass - - -class TagFinder (object): - """Base class handling HTML start elements. - TagFinder instances are used as HTML parser handlers.""" - - def __init__ (self): - """Initialize local variables.""" - super(TagFinder, self).__init__() - - def start_element (self, tag, attrs, element_text, lineno, column): - """Does nothing, override in a subclass.""" - pass - - -class MetaRobotsFinder (TagFinder): - """Class for finding robots.txt meta values in HTML.""" - - def __init__ (self): - """Initialize follow and index flags.""" - super(MetaRobotsFinder, self).__init__() - log.debug(LOG_CHECK, "meta robots finder") - self.follow = self.index = True - - def start_element (self, tag, attrs, element_text, lineno, column): - """Search for meta robots.txt "nofollow" and "noindex" flags.""" - if tag == 'meta' and attrs.get('name') == 'robots': - val = attrs.get('content', u'').lower().split(u',') - self.follow = u'nofollow' not in val - self.index = u'noindex' not in val - raise StopParse("found tag") - elif tag == 'body': - raise StopParse("found tag") - - def is_meta_url (attr, attrs): """Check if the meta attributes contain a URL.""" res = False @@ -158,24 +120,23 @@ def is_form_get(attr, attrs): return res -class LinkFinder (TagFinder): +class LinkFinder: """Find HTML links, and apply them to the callback function with the format (url, lineno, column, name, codebase).""" def __init__ (self, callback, tags): """Store content in buffer and initialize URL list.""" - super(LinkFinder, self).__init__() self.callback = callback # set universal tag attributes using tagname None self.universal_attrs = set(tags.get(None, [])) self.tags = dict() - for tag, attrs in tags.items(): + for tag, attrs in tags.items(): self.tags[tag] = set(attrs) # add universal tag attributes self.tags[tag].update(self.universal_attrs) self.base_ref = u'' - def start_element (self, tag, attrs, element_text, lineno, column): + def html_element (self, tag, attrs, element_text, lineno, column): """Search for links and store found URLs in a list.""" log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) log.debug(LOG_CHECK, "line %d col %d", lineno, column) @@ -192,7 +153,7 @@ class LinkFinder (TagFinder): name = self.get_link_name(tag, attrs, attr, element_text) # possible codebase base = u'' - if tag == 'applet': + if tag == 'applet': base = attrs.get('codebase', u'') if not base: base = self.base_ref @@ -251,3 +212,15 @@ class LinkFinder (TagFinder): """Add newly found URL to queue.""" assert isinstance(url, str_text) or url is None, repr(url) self.callback(url, line=lineno, column=column, name=name, base=base) + + +def find_links(soup, callback, tags): + """Parse into content and search for URLs to check. + When a URL is found it is passed to the supplied callback. + """ + lf = LinkFinder(callback, tags) + for element in soup.find_all(True): + lf.html_element( + element.name, element.attrs, element.text.strip(), + element.sourceline, + None if element.sourcepos is None else element.sourcepos + 1) diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index a1153afc..e928177b 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -18,7 +18,7 @@ Main functions for link parsing """ from .. import log, LOG_CHECK, strformat, url as urlutil -from ..htmlutil import htmlsoup, linkparse +from ..htmlutil import linkparse from ..bookmarks import firefox @@ -46,7 +46,7 @@ def parse_html (url_data): """Parse into HTML content and search for URLs to check. Found URLs are added to the URL queue. """ - find_links(url_data, url_data.add_url, linkparse.LinkTags) + linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.LinkTags) def parse_opera (url_data): @@ -112,15 +112,7 @@ def parse_wml (url_data): """Parse into WML content and search for URLs to check. Found URLs are added to the URL queue. """ - find_links(url_data, url_data.add_url, linkparse.WmlTags) - - -def find_links (url_data, callback, tags): - """Parse into content and search for URLs to check. - Found URLs are added to the URL queue. - """ - handler = linkparse.LinkFinder(callback, tags) - htmlsoup.process_soup(handler, url_data.get_soup()) + linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.WmlTags) def parse_firefox (url_data): diff --git a/linkcheck/plugins/anchorcheck.py b/linkcheck/plugins/anchorcheck.py index 32cc7ef2..5fff4441 100644 --- a/linkcheck/plugins/anchorcheck.py +++ b/linkcheck/plugins/anchorcheck.py @@ -22,7 +22,6 @@ from urllib import parse from . import _ContentPlugin from .. import log, LOG_PLUGIN from ..htmlutil import linkparse -from ..parser import find_links class AnchorCheck(_ContentPlugin): @@ -37,7 +36,8 @@ class AnchorCheck(_ContentPlugin): log.debug(LOG_PLUGIN, "checking content for invalid anchors") # list of parsed anchors self.anchors = [] - find_links(url_data, self.add_anchor, linkparse.AnchorTags) + linkparse.find_links(url_data.get_soup(), self.add_anchor, + linkparse.AnchorTags) self.check_anchor(url_data) def add_anchor (self, url, line, column, name, base): diff --git a/tests/checker/data/norobots.html b/tests/checker/data/norobots.html index 795e8174..17c6826b 100644 --- a/tests/checker/data/norobots.html +++ b/tests/checker/data/norobots.html @@ -1,2 +1,2 @@ - + bla diff --git a/tests/checker/test_content_allows_robots.py b/tests/checker/test_content_allows_robots.py index 605ea1e2..8802093a 100644 --- a/tests/checker/test_content_allows_robots.py +++ b/tests/checker/test_content_allows_robots.py @@ -16,10 +16,19 @@ """ Test that is respected when using http and ignored when checking a local file. +Also test different values of the content attribute are correctly matched. """ +import unittest + +import linkcheck.configuration +import linkcheck.director +from linkcheck.htmlutil.htmlsoup import make_soup +from . import get_url_from + from . import LinkCheckTest from .httpserver import HttpServerTest + class TestHttpMetaRobots(HttpServerTest): """Test using http.""" @@ -33,6 +42,7 @@ class TestHttpMetaRobots(HttpServerTest): ] self.direct(url, resultlines, recursionlevel=1) + class TestFileMetaRobots(LinkCheckTest): """Test from a file.""" @@ -52,3 +62,23 @@ class TestFileMetaRobots(LinkCheckTest): "error" ] self.direct(url, resultlines, recursionlevel=1) + + +class TestMetaRobotsVariants(unittest.TestCase): + """Test different values of the robots meta directive content attribute""" + + def test_nofollow_variants(self): + config = linkcheck.configuration.Configuration() + aggregate = linkcheck.director.get_aggregate(config) + url = "http://example.org" + url_data = get_url_from(url, 0, aggregate) + url_data.content_type = "text/html" + + url_data.soup = make_soup('') + self.assertFalse(url_data.content_allows_robots()) + + url_data.soup = make_soup('') + self.assertFalse(url_data.content_allows_robots()) + + url_data.soup = make_soup('') + self.assertTrue(url_data.content_allows_robots()) diff --git a/tests/htmllib.py b/tests/htmllib.py index 08318704..f5ec6255 100644 --- a/tests/htmllib.py +++ b/tests/htmllib.py @@ -15,50 +15,34 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ -HTML parser handler test class. +HTML parser test function. """ -import sys - -class HtmlPrettyPrinter: +def pretty_print_html(fd, soup): """ - Print out all parsed HTML data in encoded form. - Also stores error and warnings messages. + Print out all parsed HTML data, + writing to the given file descriptor. + + @param fd: file like object + @type fd: file + @param soup: BeautifulSoup object + @type soup: BeautifulSoup """ + for element in soup.find_all(True): + tag = element.name + element_text = element.text.strip() - def __init__ (self, fd=sys.stdout, encoding="iso8859-1"): - """ - Write to given file descriptor in given encoding. - - @param fd: file like object (default=sys.stdout) - @type fd: file - @param encoding: encoding (default=iso8859-1) - @type encoding: string - """ - self.fd = fd - self.encoding = encoding - - def start_element (self, tag, attrs, element_text, lineno, column): - """ - Print HTML start element. - - @param tag: tag name - @type tag: string - @param attrs: tag attributes - @type attrs: dict - @return: None - """ - self.fd.write("<%s" % tag.replace("/", "")) - for key, val in sorted(attrs.items()): + fd.write("<%s" % tag.replace("/", "")) + for key, val in sorted(element.attrs.items()): if val is None: - self.fd.write(" %s" % key) + fd.write(" %s" % key) else: - self.fd.write(' %s="%s"' % (key, quote_attrval(val))) + fd.write(' %s="%s"' % (key, quote_attrval(val))) if element_text: - self.fd.write(">%s" % (element_text, tag)) + fd.write(">%s" % (element_text, tag)) else: - self.fd.write("/>") + fd.write("/>") def quote_attrval (s): diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py index 7cc3e930..da7ae57a 100644 --- a/tests/test_linkparser.py +++ b/tests/test_linkparser.py @@ -29,8 +29,8 @@ class TestLinkparser (unittest.TestCase): def _test_one_link (self, content, url): self.count_url = 0 - h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags) - htmlsoup.process_soup(h, htmlsoup.make_soup(content)) + linkparse.find_links(htmlsoup.make_soup(content), + self._test_one_url(url), linkparse.LinkTags) self.assertEqual(self.count_url, 1) def _test_one_url (self, origurl): @@ -43,8 +43,8 @@ class TestLinkparser (unittest.TestCase): def _test_no_link (self, content): def callback (url, line, column, name, base): self.assertTrue(False, 'URL %r found' % url) - h = linkparse.LinkFinder(callback, linkparse.LinkTags) - htmlsoup.process_soup(h, htmlsoup.make_soup(content)) + linkparse.find_links(htmlsoup.make_soup(content), callback, + linkparse.LinkTags) def test_href_parsing (self): # Test parsing. diff --git a/tests/test_parser.py b/tests/test_parser.py index 22c08fc1..94e54f2c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -25,7 +25,7 @@ import unittest from parameterized import parameterized -from .htmllib import HtmlPrettyPrinter +from .htmllib import pretty_print_html # list of tuples # (, ) @@ -142,8 +142,7 @@ class TestParser (unittest.TestCase): def test_parse (self, _in, _out): # Parse all test patterns in one go. out = StringIO() - handler = HtmlPrettyPrinter(out) - htmlsoup.process_soup(handler, htmlsoup.make_soup(_in)) + pretty_print_html(out, htmlsoup.make_soup(_in)) self.check_results(_in, _out, out) def check_results (self, _in, _out, out): @@ -180,8 +179,5 @@ class TestParser (unittest.TestCase): self.encoding_test(html, "ascii") def encoding_test (self, html, expected): - out = StringIO() - handler = HtmlPrettyPrinter(out) soup = htmlsoup.make_soup(html) - htmlsoup.process_soup(handler, soup) self.assertEqual(soup.original_encoding, expected)