Prevent disallowed content reads when checking HTML files for robots.txt allowance.

2026-05-28 15:48:16 +00:00 · 2010-10-12 00:40:34 +02:00 · 2010-10-12 00:40:34 +02:00 · 61e611e4bf
commit 61e611e4bf
parent 25cde6775b
3 changed files with 10 additions and 2 deletions
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@ -4,6 +4,8 @@ Fixes:
 - gui: Enable the cancel button again after it has been clicked and
  disabled.
 - checking: Fix printing of active URLs on Ctrl-C.
+- checking: Check for allowed content read before trying to
+  parse robots.txt allowance.

 Changes:
 - gui: Display cancel message in progress window.
@ -14,7 +16,8 @@ Features:
  Closes: SF bug #3040378
 - gui: Read default options from configuration file.
  Closes: SF bug #2931320
-
+- config: Added configuration file option for the --cookies command line
+  option.

 5.3 "Inception" (released 29.9.2010)

--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@ -646,6 +646,11 @@ Use URL `%(newurl)s' instead for checking.""") % {
        if self.method_get_allowed:
            super(HttpUrl, self).set_title_from_content()

+    def content_allows_robots (self):
+        if not self.method_get_allowed:
+            return False
+        return super(HttpUrl, self).content_allows_robots()
+
    def is_html (self):
        """
        See if this URL points to a HTML file by looking at the
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@ -565,7 +565,7 @@ class UrlBase (object):

    def content_allows_robots (self):
        """
-        Return True if the content of this URL forbids robots to
+        Return False if the content of this URL forbids robots to
        search for recursive links.
        """
        if not self.is_html():