git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3867 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2008-11-20 21:30:10 +00:00
parent 2758de9757
commit c3b6fc5aa4
5 changed files with 32 additions and 9 deletions

View file

@ -75,6 +75,18 @@
Type: feature Type: feature
Changed: linkcheck/checker/fileurl.py Changed: linkcheck/checker/fileurl.py
* Handle non-Latin1 filenames when checking local directories.
Type: bugfix
Closes: SF bug #2093225
Changed: linkcheck/checker/fileurl.py
* Use the configured proxy when requesting robots.txt; in particular,
honor the noproxy values.
Type: bugfix
Closes: SF bug #2091297
Changed: linkcheck/robotparser2.py, linkcheck/cache/robots_txt.py,
linkcheck/checker/httpurl.py
4.9 "Michael Clayton" (released 25.4.2008) 4.9 "Michael Clayton" (released 25.4.2008)
* Parse Shockwave Flash (SWF) for URLs to check * Parse Shockwave Flash (SWF) for URLs to check

View file

@ -36,12 +36,13 @@ class RobotsTxt (object):
self.cache = {} self.cache = {}
@synchronized(_lock) @synchronized(_lock)
def allows_url (self, roboturl, url, user, password, callback=None): def allows_url (self, roboturl, url, proxy, user, password, callback=None):
""" """
Ask robots.txt allowance. Ask robots.txt allowance.
""" """
if roboturl not in self.cache: if roboturl not in self.cache:
rp = robotparser2.RobotFileParser(user=user, password=password) rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
password=password)
rp.set_url(roboturl) rp.set_url(roboturl)
rp.read() rp.read()
if callback is not None: if callback is not None:

View file

@ -186,7 +186,9 @@ class FileUrl (urlbase.UrlBase):
t = time.time() t = time.time()
files = get_files(self.get_os_filename()) files = get_files(self.get_os_filename())
data = get_index_html(files) data = get_index_html(files)
self.data = data.encode("iso8859-1", "ignore") if isinstance(data, unicode):
data = data.encode("iso8859-1", "ignore")
self.data = data
self.dltime = time.time() - t self.dltime = time.time() - t
self.dlsize = len(self.data) self.dlsize = len(self.data)
return self.data return self.data

View file

@ -139,7 +139,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
user, password = self.get_user_password() user, password = self.get_user_password()
rb = self.aggregate.robots_txt rb = self.aggregate.robots_txt
callback = self.aggregate.connections.host_wait callback = self.aggregate.connections.host_wait
return rb.allows_url(roboturl, url, user, password, callback=callback) return rb.allows_url(roboturl, url, self.proxy, user, password,
callback=callback,)
def check_connection (self): def check_connection (self):
""" """

View file

@ -61,10 +61,11 @@ class RobotFileParser (object):
"""This class provides a set of methods to read, parse and answer """This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.""" questions about a single robots.txt file."""
def __init__ (self, url='', user=None, password=None): def __init__ (self, url='', proxy=None, user=None, password=None):
"""Initialize internal entry lists and store given url and """Initialize internal entry lists and store given url and
credentials.""" credentials."""
self.set_url(url) self.set_url(url)
self.proxy = proxy
self.user = user self.user = user
self.password = password self.password = password
self._reset() self._reset()
@ -107,16 +108,22 @@ class RobotFileParser (object):
""" """
pwd_manager = PasswordManager(self.user, self.password) pwd_manager = PasswordManager(self.user, self.password)
handlers = [ handlers = [
urllib2.ProxyHandler(urllib.getproxies()),
urllib2.UnknownHandler, urllib2.UnknownHandler,
httputil.HttpWithGzipHandler, httputil.HttpWithGzipHandler,
urllib2.HTTPBasicAuthHandler(pwd_manager), urllib2.HTTPBasicAuthHandler(pwd_manager),
urllib2.ProxyBasicAuthHandler(pwd_manager),
urllib2.HTTPDigestAuthHandler(pwd_manager), urllib2.HTTPDigestAuthHandler(pwd_manager),
urllib2.ProxyDigestAuthHandler(pwd_manager), ]
if self.proxy:
handlers.insert(0,
urllib2.ProxyHandler({"http": self.proxy, "https": self.proxy}))
handlers.extend([
urllib2.ProxyBasicAuthHandler(pwd_manager),
urllib2.ProxyDigestAuthHandler(pwd_manager),
])
handlers.extend([
urllib2.HTTPDefaultErrorHandler, urllib2.HTTPDefaultErrorHandler,
urllib2.HTTPRedirectHandler, urllib2.HTTPRedirectHandler,
] ])
if hasattr(httplib, 'HTTPS'): if hasattr(httplib, 'HTTPS'):
handlers.append(httputil.HttpsWithGzipHandler) handlers.append(httputil.HttpsWithGzipHandler)
return urllib2.build_opener(*handlers) return urllib2.build_opener(*handlers)