Add a basic robots.txt check method (allows_robots) to HttpUrl, and pass the robots URL, target URL, and credentials explicitly to the cache's robots_txt_allows_url.

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2001 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-11-18 00:55:47 +00:00
parent 0af9a489df
commit 89616a4bba
2 changed files with 9 additions and 8 deletions

View file

@ -182,15 +182,11 @@ class Cache (object):
finally:
self.lock.release()
def robots_txt_allows_url (self, url_data):
def robots_txt_allows_url (self, roboturl, url, user, password):
"""ask robots.txt allowance"""
self.lock.acquire()
try:
roboturl = url_data.get_robots_txt_url()
linkcheck.log.debug(linkcheck.LOG_CACHE,
"robots.txt url %r of %r", roboturl, url_data.url)
if roboturl not in self.robots_txt:
user, password = url_data.get_user_password()
rp = linkcheck.robotparser2.RobotFileParser(
user=user, password=password)
rp.set_url(roboturl)
@ -198,8 +194,7 @@ class Cache (object):
self.robots_txt[roboturl] = rp
else:
rp = self.robots_txt[roboturl]
return rp.can_fetch(linkcheck.configuration.UserAgent,
url_data.url)
return rp.can_fetch(linkcheck.configuration.UserAgent, url)
finally:
self.lock.release()

View file

@ -66,6 +66,12 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
self.urlparts[2] = '/'
self.url = urlparse.urlunsplit(self.urlparts)
def allows_robots (self, url):
    """Return whether robots.txt permits fetching *url*.

    Derives this URL object's robots.txt location and HTTP
    credentials, then delegates the allowance decision to the
    shared consumer cache, which parses and caches one
    RobotFileParser per robots.txt URL.
    """
    # Location of the robots.txt that governs this URL.
    roboturl = self.get_robots_txt_url()
    # Credentials forwarded so a password-protected robots.txt can be
    # fetched -- NOTE(review): confirm get_user_password() semantics.
    user, password = self.get_user_password()
    return self.consumer.cache.robots_txt_allows_url(roboturl, url,
                                                     user, password)
def check_connection (self):
"""
Check a URL with HTTP protocol.
@ -115,7 +121,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
self.headers = None
self.auth = None
self.cookies = []
if not self.consumer.cache.robots_txt_allows_url(self):
if not self.allows_robots(self.url):
self.add_warning(
_("Access denied by robots.txt, checked only syntax"))
return