diff --git a/ChangeLog.txt b/ChangeLog.txt
index a5fe3c17..632a0873 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -75,6 +75,18 @@
   Type: feature
   Changed: linkcheck/checker/fileurl.py
 
+* Handle non-Latin1 filenames when checking local directories.
+  Type: bugfix
+  Closes: SF bug #2093225
+  Changed: linkcheck/checker/fileurl.py
+
+* Use configured proxy when requesting robots.txt, especially
+  honor the noproxy values.
+  Type: bugfix
+  Closes: SF bug #2091297
+  Changed: linkcheck/robotparser2.py, linkcheck/cache/robots_txt.py,
+  linkcheck/checker/httpurl.py
+
 4.9 "Michael Clayton" (released 25.4.2008)
 
 * Parse Shockwave Flash (SWF) for URLs to check
diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py
index c19d22dd..708d98fd 100644
--- a/linkcheck/cache/robots_txt.py
+++ b/linkcheck/cache/robots_txt.py
@@ -36,12 +36,13 @@ class RobotsTxt (object):
         self.cache = {}
 
     @synchronized(_lock)
-    def allows_url (self, roboturl, url, user, password, callback=None):
+    def allows_url (self, roboturl, url, proxy, user, password, callback=None):
         """
         Ask robots.txt allowance.
         """
         if roboturl not in self.cache:
-            rp = robotparser2.RobotFileParser(user=user, password=password)
+            rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
+                                              password=password)
             rp.set_url(roboturl)
             rp.read()
             if callback is not None:
diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py
index d65924b1..853ec1f2 100644
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@@ -186,7 +186,9 @@ class FileUrl (urlbase.UrlBase):
         t = time.time()
         files = get_files(self.get_os_filename())
         data = get_index_html(files)
-        self.data = data.encode("iso8859-1", "ignore")
+        if isinstance(data, unicode):
+            data = data.encode("iso8859-1", "ignore")
+        self.data = data
         self.dltime = time.time() - t
         self.dlsize = len(self.data)
         return self.data
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 75dae3de..1bea6854 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -139,7 +139,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         user, password = self.get_user_password()
         rb = self.aggregate.robots_txt
         callback = self.aggregate.connections.host_wait
-        return rb.allows_url(roboturl, url, user, password, callback=callback)
+        return rb.allows_url(roboturl, url, self.proxy, user, password,
+                             callback=callback)
 
     def check_connection (self):
         """
diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py
index 353e9e90..b5c419ba 100755
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -61,10 +61,11 @@ class RobotFileParser (object):
     """This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file."""
 
-    def __init__ (self, url='', user=None, password=None):
+    def __init__ (self, url='', proxy=None, user=None, password=None):
         """Initialize internal entry lists and store given
         url and credentials."""
         self.set_url(url)
+        self.proxy = proxy
         self.user = user
         self.password = password
         self._reset()
@@ -107,16 +108,22 @@ class RobotFileParser (object):
         """
         pwd_manager = PasswordManager(self.user, self.password)
         handlers = [
-            urllib2.ProxyHandler(urllib.getproxies()),
             urllib2.UnknownHandler,
             httputil.HttpWithGzipHandler,
             urllib2.HTTPBasicAuthHandler(pwd_manager),
-            urllib2.ProxyBasicAuthHandler(pwd_manager),
             urllib2.HTTPDigestAuthHandler(pwd_manager),
-            urllib2.ProxyDigestAuthHandler(pwd_manager),
+        ]
+        if self.proxy:
+            handlers.insert(0,
+                urllib2.ProxyHandler({"http": self.proxy, "https": self.proxy}))
+            handlers.extend([
+                urllib2.ProxyBasicAuthHandler(pwd_manager),
+                urllib2.ProxyDigestAuthHandler(pwd_manager),
+            ])
+        handlers.extend([
             urllib2.HTTPDefaultErrorHandler,
             urllib2.HTTPRedirectHandler,
-        ]
+        ])
         if hasattr(httplib, 'HTTPS'):
             handlers.append(httputil.HttpsWithGzipHandler)
         return urllib2.build_opener(*handlers)