Prevent disallowed content reads when checking for robots.txt allowance in HTML files.

This commit is contained in:
Bastian Kleineidam 2010-10-12 00:40:34 +02:00
parent 25cde6775b
commit 61e611e4bf
3 changed files with 10 additions and 2 deletions

View file

@ -4,6 +4,8 @@ Fixes:
- gui: Enable the cancel button again after it has been clicked and
disabled.
- checking: Fix printing of active URLs on Ctrl-C.
- checking: Check for allowed content read before trying to
parse robots.txt allowance.
Changes:
- gui: Display cancel message in progress window.
@ -14,7 +16,8 @@ Features:
Closes: SF bug #3040378
- gui: Read default options from configuration file.
Closes: SF bug #2931320
- config: Added configuration file option for the --cookies command line
option.
5.3 "Inception" (released 29.9.2010)

View file

@ -646,6 +646,11 @@ Use URL `%(newurl)s' instead for checking.""") % {
if self.method_get_allowed:
super(HttpUrl, self).set_title_from_content()
def content_allows_robots (self):
    """
    Check the robots allowance of the content only if GET is allowed.

    When self.method_get_allowed is false, return False without
    delegating, so the parent implementation is never asked to look
    at the content. Otherwise return whatever the parent class's
    content_allows_robots() returns.
    """
    if self.method_get_allowed:
        return super(HttpUrl, self).content_allows_robots()
    return False
def is_html (self):
"""
See if this URL points to a HTML file by looking at the

View file

@ -565,7 +565,7 @@ class UrlBase (object):
def content_allows_robots (self):
"""
Return True if the content of this URL forbids robots to
Return False if the content of this URL forbids robots to
search for recursive links.
"""
if not self.is_html():