mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-24 14:13:43 +00:00
Readd
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3867 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
2758de9757
commit
c3b6fc5aa4
5 changed files with 32 additions and 9 deletions
|
|
@ -75,6 +75,18 @@
|
||||||
Type: feature
|
Type: feature
|
||||||
Changed: linkcheck/checker/fileurl.py
|
Changed: linkcheck/checker/fileurl.py
|
||||||
|
|
||||||
|
* Handle non-Latin1 filenames when checking local directories.
|
||||||
|
Type: bugfix
|
||||||
|
Closes: SF bug #2093225
|
||||||
|
Changed: linkcheck/checker/fileurl.py
|
||||||
|
|
||||||
|
* Use the configured proxy when requesting robots.txt and, in particular,
|
||||||
|
honor the noproxy values.
|
||||||
|
Type: bugfix
|
||||||
|
Closes: SF bug #2091297
|
||||||
|
Changed: linkcheck/robotparser2.py, linkcheck/cache/robots_txt.py,
|
||||||
|
linkcheck/checker/httpurl.py
|
||||||
|
|
||||||
4.9 "Michael Clayton" (released 25.4.2008)
|
4.9 "Michael Clayton" (released 25.4.2008)
|
||||||
|
|
||||||
* Parse Shockwave Flash (SWF) for URLs to check
|
* Parse Shockwave Flash (SWF) for URLs to check
|
||||||
|
|
|
||||||
5
linkcheck/cache/robots_txt.py
vendored
5
linkcheck/cache/robots_txt.py
vendored
|
|
@ -36,12 +36,13 @@ class RobotsTxt (object):
|
||||||
self.cache = {}
|
self.cache = {}
|
||||||
|
|
||||||
@synchronized(_lock)
|
@synchronized(_lock)
|
||||||
def allows_url (self, roboturl, url, user, password, callback=None):
|
def allows_url (self, roboturl, url, proxy, user, password, callback=None):
|
||||||
"""
|
"""
|
||||||
Ask robots.txt allowance.
|
Ask robots.txt allowance.
|
||||||
"""
|
"""
|
||||||
if roboturl not in self.cache:
|
if roboturl not in self.cache:
|
||||||
rp = robotparser2.RobotFileParser(user=user, password=password)
|
rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
|
||||||
|
password=password)
|
||||||
rp.set_url(roboturl)
|
rp.set_url(roboturl)
|
||||||
rp.read()
|
rp.read()
|
||||||
if callback is not None:
|
if callback is not None:
|
||||||
|
|
|
||||||
|
|
@ -186,7 +186,9 @@ class FileUrl (urlbase.UrlBase):
|
||||||
t = time.time()
|
t = time.time()
|
||||||
files = get_files(self.get_os_filename())
|
files = get_files(self.get_os_filename())
|
||||||
data = get_index_html(files)
|
data = get_index_html(files)
|
||||||
self.data = data.encode("iso8859-1", "ignore")
|
if isinstance(data, unicode):
|
||||||
|
data = data.encode("iso8859-1", "ignore")
|
||||||
|
self.data = data
|
||||||
self.dltime = time.time() - t
|
self.dltime = time.time() - t
|
||||||
self.dlsize = len(self.data)
|
self.dlsize = len(self.data)
|
||||||
return self.data
|
return self.data
|
||||||
|
|
|
||||||
|
|
@ -139,7 +139,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
||||||
user, password = self.get_user_password()
|
user, password = self.get_user_password()
|
||||||
rb = self.aggregate.robots_txt
|
rb = self.aggregate.robots_txt
|
||||||
callback = self.aggregate.connections.host_wait
|
callback = self.aggregate.connections.host_wait
|
||||||
return rb.allows_url(roboturl, url, user, password, callback=callback)
|
return rb.allows_url(roboturl, url, self.proxy, user, password,
|
||||||
|
callback=callback,)
|
||||||
|
|
||||||
def check_connection (self):
|
def check_connection (self):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -61,10 +61,11 @@ class RobotFileParser (object):
|
||||||
"""This class provides a set of methods to read, parse and answer
|
"""This class provides a set of methods to read, parse and answer
|
||||||
questions about a single robots.txt file."""
|
questions about a single robots.txt file."""
|
||||||
|
|
||||||
def __init__ (self, url='', user=None, password=None):
|
def __init__ (self, url='', proxy=None, user=None, password=None):
|
||||||
"""Initialize internal entry lists and store given url and
|
"""Initialize internal entry lists and store given url and
|
||||||
credentials."""
|
credentials."""
|
||||||
self.set_url(url)
|
self.set_url(url)
|
||||||
|
self.proxy = proxy
|
||||||
self.user = user
|
self.user = user
|
||||||
self.password = password
|
self.password = password
|
||||||
self._reset()
|
self._reset()
|
||||||
|
|
@ -107,16 +108,22 @@ class RobotFileParser (object):
|
||||||
"""
|
"""
|
||||||
pwd_manager = PasswordManager(self.user, self.password)
|
pwd_manager = PasswordManager(self.user, self.password)
|
||||||
handlers = [
|
handlers = [
|
||||||
urllib2.ProxyHandler(urllib.getproxies()),
|
|
||||||
urllib2.UnknownHandler,
|
urllib2.UnknownHandler,
|
||||||
httputil.HttpWithGzipHandler,
|
httputil.HttpWithGzipHandler,
|
||||||
urllib2.HTTPBasicAuthHandler(pwd_manager),
|
urllib2.HTTPBasicAuthHandler(pwd_manager),
|
||||||
urllib2.ProxyBasicAuthHandler(pwd_manager),
|
|
||||||
urllib2.HTTPDigestAuthHandler(pwd_manager),
|
urllib2.HTTPDigestAuthHandler(pwd_manager),
|
||||||
urllib2.ProxyDigestAuthHandler(pwd_manager),
|
]
|
||||||
|
if self.proxy:
|
||||||
|
handlers.insert(0,
|
||||||
|
urllib2.ProxyHandler({"http": self.proxy, "https": self.proxy}))
|
||||||
|
handlers.extend([
|
||||||
|
urllib2.ProxyBasicAuthHandler(pwd_manager),
|
||||||
|
urllib2.ProxyDigestAuthHandler(pwd_manager),
|
||||||
|
])
|
||||||
|
handlers.extend([
|
||||||
urllib2.HTTPDefaultErrorHandler,
|
urllib2.HTTPDefaultErrorHandler,
|
||||||
urllib2.HTTPRedirectHandler,
|
urllib2.HTTPRedirectHandler,
|
||||||
]
|
])
|
||||||
if hasattr(httplib, 'HTTPS'):
|
if hasattr(httplib, 'HTTPS'):
|
||||||
handlers.append(httputil.HttpsWithGzipHandler)
|
handlers.append(httputil.HttpsWithGzipHandler)
|
||||||
return urllib2.build_opener(*handlers)
|
return urllib2.build_opener(*handlers)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue