mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-19 22:01:00 +00:00
basic robots_txt check method
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2001 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
0af9a489df
commit
89616a4bba
2 changed files with 9 additions and 8 deletions
|
|
@@ -182,15 +182,11 @@ class Cache (object):
|
|||
finally:
|
||||
self.lock.release()
|
||||
|
||||
def robots_txt_allows_url (self, url_data):
|
||||
def robots_txt_allows_url (self, roboturl, url, user, password):
|
||||
"""ask robots.txt allowance"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
roboturl = url_data.get_robots_txt_url()
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE,
|
||||
"robots.txt url %r of %r", roboturl, url_data.url)
|
||||
if roboturl not in self.robots_txt:
|
||||
user, password = url_data.get_user_password()
|
||||
rp = linkcheck.robotparser2.RobotFileParser(
|
||||
user=user, password=password)
|
||||
rp.set_url(roboturl)
|
||||
|
|
@@ -198,8 +194,7 @@ class Cache (object):
|
|||
self.robots_txt[roboturl] = rp
|
||||
else:
|
||||
rp = self.robots_txt[roboturl]
|
||||
return rp.can_fetch(linkcheck.configuration.UserAgent,
|
||||
url_data.url)
|
||||
return rp.can_fetch(linkcheck.configuration.UserAgent, url)
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
|
|
|
|||
|
|
@@ -66,6 +66,12 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
self.urlparts[2] = '/'
|
||||
self.url = urlparse.urlunsplit(self.urlparts)
|
||||
|
||||
def allows_robots (self, url):
    """
    Ask the consumer's cache whether robots.txt allows fetching url.

    The robots.txt location and the user/password pair are taken from
    this URL object and passed, together with url, to the cache's
    robots_txt_allows_url method. Its answer is returned unchanged.
    """
    robots_txt_url = self.get_robots_txt_url()
    login, secret = self.get_user_password()
    cache = self.consumer.cache
    verdict = cache.robots_txt_allows_url(robots_txt_url, url,
                                          login, secret)
    return verdict
|
||||
|
||||
def check_connection (self):
|
||||
"""
|
||||
Check a URL with HTTP protocol.
|
||||
|
|
@@ -115,7 +121,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
self.headers = None
|
||||
self.auth = None
|
||||
self.cookies = []
|
||||
if not self.consumer.cache.robots_txt_allows_url(self):
|
||||
if not self.allows_robots(self.url):
|
||||
self.add_warning(
|
||||
_("Access denied by robots.txt, checked only syntax"))
|
||||
return
|
||||
|
|
|
|||
Loading…
Reference in a new issue