Prevent UnicodeDecodeError in robots.txt parsing.

This commit is contained in:
Bastian Kleineidam 2010-03-07 22:49:25 +01:00
parent b8b0398dd2
commit 7c15d28f56
2 changed files with 5 additions and 3 deletions

View file

@@ -11,6 +11,8 @@ Fixes:
status line. Fixes the "BadStatusLine" errors.
- http: Prevent UnicodeDecodeError on redirection by ensuring that
the redirected URL will be Unicode encoded.
- checking: Prevent UnicodeDecodeError in robots.txt parser by
encoding the linkchecker useragent string.
- installer: Add commandline executable to Windows installer.
Closes: SF bug #2903257
- http: Warn about permanent redirections even when redirected URL is

View file

@@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2009 Bastian Kleineidam
# Copyright (C) 2006-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -40,6 +40,7 @@ class RobotsTxt (object):
"""
Ask robots.txt allowance.
"""
useragent = str(configuration.UserAgent)
if roboturl not in self.cache:
rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
password=password)
@@ -48,10 +49,9 @@
if hasattr(callback, '__call__'):
parts = urlutil.url_split(rp.url)
host = "%s:%d" % (parts[1], parts[2])
useragent = configuration.UserAgent
wait = rp.get_crawldelay(useragent)
callback(host, wait)
self.cache[roboturl] = rp
else:
rp = self.cache[roboturl]
return rp.can_fetch(configuration.UserAgent, url)
return rp.can_fetch(useragent, url)