Don't check for robots.txt directives in local HTML files.

This commit is contained in:
Bastian Kleineidam 2014-03-19 16:33:22 +01:00
parent e528d5f7db
commit ce733ae76b
3 changed files with 30 additions and 58 deletions

View file

@ -24,6 +24,8 @@ from cStringIO import StringIO
from .. import (log, LOG_CHECK, strformat, fileutil,
url as urlutil, LinkCheckerError)
from . import (internpaturl, proxysupport, httpheaders as headers)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
# import warnings
from .const import WARN_HTTP_EMPTY_CONTENT
@ -60,6 +62,31 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
return self.aggregate.robots_txt.allows_url(self)
def content_allows_robots (self):
    """Check meta robots directives in the HTML content of this URL.

    Non-HTML content can never restrict robots, so it is always
    allowed. For HTML, the content is fed through the SAX parser with
    a MetaRobotsFinder handler; the handler's verdict is returned.

    @return: False if a meta robots tag forbids following links
    @rtype: bool
    """
    if not self.is_html():
        # Only HTML documents can carry meta robots directives.
        return True
    # Wire up the SAX parser and its handler; they reference each other.
    finder = linkparse.MetaRobotsFinder()
    robots_parser = htmlsax.parser(finder)
    finder.parser = robots_parser
    if self.charset:
        robots_parser.encoding = self.charset
    try:
        robots_parser.feed(self.get_content())
        robots_parser.flush()
    except linkparse.StopParse as err:
        # The finder aborts parsing as soon as it has its answer.
        log.debug(LOG_CHECK, "Stopped parsing: %s", err)
    # Undo the mutual references so the objects can be collected.
    finder.parser = None
    robots_parser.handler = None
    return finder.follow
def add_size_info (self):
"""Get size of URL content from HTTP header."""
if self.headers and "Content-Length" in self.headers and \

View file

@ -31,8 +31,6 @@ from cStringIO import StringIO
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@ -551,33 +549,9 @@ class UrlBase (object):
log.debug(LOG_CHECK, "... yes, recursion.")
return True
def content_allows_robots (self):
    """Check whether meta robots directives permit recursing into
    the content of this URL.

    Only HTML documents fetched over HTTP or from a local file are
    inspected; everything else is allowed unconditionally.

    @return: False if a meta robots tag forbids following links
    @rtype: bool
    """
    # XXX cleanup
    # Non-HTML content, and schemes other than http/file, carry no
    # meta robots directives.
    if not self.is_html() or not (self.is_http() or self.is_file()):
        return True
    # Build the SAX parser and its handler; they reference each other.
    finder = linkparse.MetaRobotsFinder()
    robots_parser = htmlsax.parser(finder)
    finder.parser = robots_parser
    if self.charset:
        robots_parser.encoding = self.charset
    try:
        robots_parser.feed(self.get_content())
        robots_parser.flush()
    except linkparse.StopParse as err:
        # The finder stops parsing once it has found its answer.
        log.debug(LOG_CHECK, "Stopped parsing: %s", err)
    # Undo the mutual references so the objects can be collected.
    finder.parser = None
    robots_parser.handler = None
    return finder.follow
def content_allows_robots(self):
    """Always permit recursion at this level.

    Robots directives are only honored for HTTP links; the HTTP
    subclass overrides this method with a real check.
    """
    return True
def set_extern (self, url):
"""

View file

@ -1,29 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2009 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test HTML robots.txt parsing.
"""
from . import LinkCheckTest
class TestRobotsTxt (LinkCheckTest):
    """Check handling of meta robots directives in HTML files."""

    def test_norobot (self):
        """Run the standard file check against a page carrying a
        norobots meta directive."""
        self.file_test("norobots.html")