diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index d97d26b8..c0f754eb 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -24,6 +24,8 @@ from cStringIO import StringIO
 from .. import (log, LOG_CHECK, strformat, fileutil, url as urlutil,
     LinkCheckerError)
 from . import (internpaturl, proxysupport, httpheaders as headers)
+from ..HtmlParser import htmlsax
+from ..htmlutil import linkparse
 # import warnings
 from .const import WARN_HTTP_EMPTY_CONTENT
 
@@ -60,6 +62,31 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         return self.aggregate.robots_txt.allows_url(self)
 
+    def content_allows_robots (self):
+        """
+        Return False if the content of this URL forbids robots to
+        search for recursive links.
+        """
+        if not self.is_html():
+            return True
+        # construct parser object
+        handler = linkparse.MetaRobotsFinder()
+        parser = htmlsax.parser(handler)
+        handler.parser = parser
+        if self.charset:
+            parser.encoding = self.charset
+        # parse
+        try:
+            parser.feed(self.get_content())
+            parser.flush()
+        except linkparse.StopParse as msg:
+            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
+            pass
+        # break cyclic dependencies
+        handler.parser = None
+        parser.handler = None
+        return handler.follow
+
     def add_size_info (self):
         """Get size of URL content from HTTP header."""
         if self.headers and "Content-Length" in self.headers and \
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index cd2968f1..23f4b7fa 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -31,8 +31,6 @@ from cStringIO import StringIO
 from . import absolute_url, get_url_from
 from .. import (log, LOG_CHECK, strformat, LinkCheckerError,
     url as urlutil, trace, get_link_pat)
-from ..HtmlParser import htmlsax
-from ..htmlutil import linkparse
 from ..network import iputil
 from .const import (WARN_URL_EFFECTIVE_URL,
     WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
@@ -551,33 +549,9 @@ class UrlBase (object):
             log.debug(LOG_CHECK, "... yes, recursion.")
         return True
 
-    def content_allows_robots (self):
-        """
-        Return False if the content of this URL forbids robots to
-        search for recursive links.
-        """
-        # XXX cleanup
-        if not self.is_html():
-            return True
-        if not (self.is_http() or self.is_file()):
-            return True
-        # construct parser object
-        handler = linkparse.MetaRobotsFinder()
-        parser = htmlsax.parser(handler)
-        handler.parser = parser
-        if self.charset:
-            parser.encoding = self.charset
-        # parse
-        try:
-            parser.feed(self.get_content())
-            parser.flush()
-        except linkparse.StopParse as msg:
-            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
-            pass
-        # break cyclic dependencies
-        handler.parser = None
-        parser.handler = None
-        return handler.follow
+    def content_allows_robots(self):
+        """Returns True: only check robots.txt on HTTP links."""
+        return True
 
     def set_extern (self, url):
         """
diff --git a/tests/checker/test_robotstxt.py b/tests/checker/test_robotstxt.py
deleted file mode 100644
index 7cd9caf7..00000000
--- a/tests/checker/test_robotstxt.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# -*- coding: iso-8859-1 -*-
-# Copyright (C) 2004-2009 Bastian Kleineidam
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-"""
-Test HTML robots.txt parsing.
-"""
-from . import LinkCheckTest
-
-
-class TestRobotsTxt (LinkCheckTest):
-    """
-    Test robots.txt directive parsing in HTML files.
-    """
-
-    def test_norobot (self):
-        self.file_test("norobots.html")