Only fall back to HTTP GET when robots.txt allows it.

Bastian Kleineidam 2010-09-04 18:09:59 +02:00
parent 59e06a4792
commit 5284017d67
5 changed files with 28 additions and 25 deletions
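
In outline, the new behaviour is: a page may always be probed with HEAD, but the checker only retries with GET when robots.txt permits fetching the content. The following standalone sketch illustrates that idea with plain httplib; it is not LinkChecker code, and probe(), the get_allowed parameter and the status-400 threshold are simplifying assumptions.

import httplib
import urlparse

def probe(url, get_allowed):
    """Probe a URL with HEAD; retry with GET only when robots.txt allows it."""
    parts = urlparse.urlsplit(url)
    conn = httplib.HTTPConnection(parts.netloc)
    conn.request("HEAD", parts.path or "/")
    response = conn.getresponse()
    if response.status >= 400 and get_allowed:
        # some servers mishandle HEAD; retry with GET, but only when permitted
        conn = httplib.HTTPConnection(parts.netloc)
        conn.request("GET", parts.path or "/")
        response = conn.getresponse()
    return response.status, response.reason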

View file

@@ -38,12 +38,14 @@ Changes:
 - cmdline: The --password option now reads a password from stdin
   instead of taking it from the commandline.
 Features:
 - ftp: Detect and support UTF-8 filename encoding capability of FTP
   servers.
-- checking: added new warning to check if content size is zero
-- install: remove Windows registry keys on uninstall
+- checking: Added new warning to check if content size is zero.
+- install: Remove Windows registry keys on uninstall.
+- checking: Do not fall back to GET when no recursion is requested on
+  single pages. This allows checking pages with a HEAD request even if
+  robots.txt disallows getting the page content.
 5.2 "11:14" (released 7.3.2010)

View file

@@ -1,6 +1,3 @@
-- [HTTP] Do not fall back to GET when no recursion is requested on
-  single pages. This would allow checking pages even if robots.txt
-  disallows getting the page content.
 - [HTTP] Allow sending POST data for login at beginning of check.
 - [GUI] Add more context menu actions:
   + Show detailed URL properties

View file

@@ -124,11 +124,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         self._data = None
         # flag indicating connection reuse
         self.reused_connection = False
+        # flag telling if GET method is allowed; determined by robots.txt
+        self.method_get_allowed = True

     def allows_robots (self, url):
         """
         Fetch and parse the robots.txt of given url. Checks if LinkChecker
-        can access the requested resource.
+        can get the requested resource content. HEAD requests however are
+        still allowed.

         @param url: the url to be requested
         @type url: string
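
For reference, a minimal equivalent of the behaviour this docstring describes, written against the standard-library robotparser module, could look like the sketch below. LinkChecker keeps its own robots.txt cache and user-agent handling, which are not shown here; allows_content_fetch() is an illustrative name, not the project's API.

import robotparser

def allows_content_fetch(robots_url, page_url, user_agent="LinkChecker"):
    """Return True if robots.txt permits fetching the page content with GET.

    HEAD requests are treated as always allowed, so a False result only
    disables the GET fallback and the content checks, not the link check.
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch(user_agent, page_url)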
@@ -180,18 +183,17 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         if not self.allows_robots(self.url):
             # remove all previously stored results
             self.add_warning(
-                _("Access denied by robots.txt, checked only syntax."),
+                _("Access denied by robots.txt, skipping content checks."),
                 tag=WARN_HTTP_ROBOTS_DENIED)
-            self.set_result(u"syntax OK")
-            return
+            self.method_get_allowed = False
+        # first try with HEAD
+        self.method = "HEAD"
         # check for amazon server quirk
         if _is_amazon(self.urlparts[1]):
-            self.add_info(_("Amazon servers block HTTP HEAD requests, "
-                "using GET instead."))
-            self.method = "GET"
-        else:
-            # first try with HEAD
-            self.method = "HEAD"
+            self.add_info(_("Amazon servers block HTTP HEAD requests."))
+            if self.method_get_allowed:
+                self.add_info(_("Using GET method for Amazon server."))
+                self.method = "GET"
         # check the http connection
         response = self.check_http_connection()
         if self.headers and "Server" in self.headers:
@@ -231,7 +233,7 @@ Use URL `%(newurl)s' instead for checking.""") % {
                 response = self._try_http_response()
             except httplib.BadStatusLine, msg:
                 # some servers send empty HEAD replies
-                if self.method == "HEAD":
+                if self.method == "HEAD" and self.method_get_allowed:
                     log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
                     self.method = "GET"
                     self.aliases = []
@@ -264,7 +266,7 @@ Use URL `%(newurl)s' instead for checking.""") % {
                 tries, response = self.follow_redirections(response)
             except httplib.BadStatusLine, msg:
                 # some servers send empty HEAD replies
-                if self.method == "HEAD":
+                if self.method == "HEAD" and self.method_get_allowed:
                     log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
                     self.method = "GET"
                     self.aliases = []
@@ -276,7 +278,7 @@ Use URL `%(newurl)s' instead for checking.""") % {
                 response.close()
                 return None
             if tries >= self.max_redirects:
-                if self.method == "HEAD":
+                if self.method == "HEAD" and self.method_get_allowed:
                     # Microsoft servers tend to recurse HEAD requests
                     self.method = "GET"
                     self.aliases = []
@@ -297,11 +299,11 @@ Use URL `%(newurl)s' instead for checking.""") % {
                 continue
             elif response.status >= 400:
                 # retry with GET (but do not set fallback flag)
-                if self.method == "HEAD":
+                if self.method == "HEAD" and self.method_get_allowed:
                     self.method = "GET"
                     self.aliases = []
                     continue
-            elif self.headers and self.method == "HEAD":
+            elif self.headers and self.method == "HEAD" and self.method_get_allowed:
                 # test for HEAD support
                 mime = headers.get_content_type(self.headers)
                 poweredby = self.headers.get('X-Powered-By', '')
@@ -361,7 +363,7 @@ Use URL `%(newurl)s' instead for checking.""") % {
             # see about recursive redirect
             all_seen = [self.cache_url_key] + self.aliases
             if redirected in all_seen:
-                if self.method == "HEAD":
+                if self.method == "HEAD" and self.method_get_allowed:
                     # Microsoft servers tend to recurse HEAD requests
                     # fall back to the original url and use GET
                     return self.max_redirects, response
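
The recursion check above can be pictured with a small loop-detection sketch; next_location() is a placeholder for the real request and redirect handling and is not part of LinkChecker.

def follow_redirects(start_url, next_location, max_redirects):
    """Follow redirects while recording visited URLs to detect loops.

    Returning max_redirects on a loop lets the caller take the same
    fallback path as an exhausted redirect chain: retry with GET,
    provided robots.txt allows it.
    """
    seen = [start_url]
    url = start_url
    for tries in range(max_redirects):
        target = next_location(url)
        if target is None:
            return tries, url                  # final, non-redirecting URL
        if target in seen:
            return max_redirects, start_url    # recursive redirect detected
        seen.append(target)
        url = target
    return max_redirects, url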
@@ -591,6 +593,7 @@ Use URL `%(newurl)s' instead for checking.""") % {
         @return: URL content, decompressed and decoded
         @rtype: string
         """
+        assert self.method_get_allowed, 'unallowed content read'
         self.method = "GET"
         response = self._try_http_response()
         response = self.follow_redirections(response, set_result=False)[1]
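
The new assertion makes the contract explicit: content may only be read when robots.txt permits GET. A caller therefore has to test the flag first, roughly as in this hypothetical sketch; url_data and get_content() are stand-ins, not necessarily the real attribute and method names.

def read_content_if_allowed(url_data):
    """Hypothetical caller-side guard matching the new assertion."""
    if url_data.method_get_allowed:
        return url_data.get_content()
    # robots.txt forbids GET: the link was still checked via HEAD,
    # but content checks (e.g. the zero-size warning) are skipped
    return None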

View file

@@ -123,8 +123,8 @@ class TestHttp (HttpServerTest):
             u"url %s" % url,
             u"cache key %s" % url,
             u"real url %s" % url,
-            u"warning Access denied by robots.txt, checked only syntax.",
-            u"valid",
+            u"warning Access denied by robots.txt, skipping content checks.",
+            u"error",
         ]
         self.direct(url, resultlines, recursionlevel=5)

View file

@@ -33,7 +33,8 @@ class TestHttps (LinkCheckTest):
             u"url %s" % url,
             u"cache key %s" % url,
             u"real url %s" % url,
-            u"info Amazon servers block HTTP HEAD requests, using GET instead.",
+            u"info Amazon servers block HTTP HEAD requests.",
+            u"info Using GET method for Amazon server.",
             u"valid",
         ]
         self.direct(url, resultlines)