Support checking Sitemap: URLs in robots.txt files.
This commit is contained in:
parent 0f0d79c7e0
commit 6f205a2574

5 changed files with 31 additions and 12 deletions
@@ -7,6 +7,7 @@ Features:
 - checking: Add options to limit the number of requests per second,
   allowed URL schemes and maximum file or download size.
   Closes: GH bug #397, #465, #420
+- checking: Support checking Sitemap: URLs in robots.txt files.
 - gui: UI language can be changed dynamically.
   Closes: GH bug #391
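For context, Sitemap: directives in robots.txt must carry absolute URLs per http://www.sitemaps.org/protocol.html#submit_robots (the same reference cited in the parser change below). An illustrative robots.txt, with a hypothetical example.com sitemap URL, of the kind the checker now follows:

    User-agent: *
    Disallow: /private/
    Sitemap: http://www.example.com/sitemap.xml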
linkcheck/cache/robots_txt.py (vendored): 32 changed lines
@@ -21,6 +21,7 @@ from .. import robotparser2, configuration
 from ..containers import LFUCache
 from ..decorators import synchronized
 from ..lock import get_lock
+from ..checker import get_url_from


 # lock objects
@@ -42,27 +43,46 @@ class RobotsTxt (object):
         self.hits = self.misses = 0
         self.roboturl_locks = {}

-    def allows_url (self, roboturl, url, proxy, user, password):
+    def allows_url (self, url_data):
         """Ask robots.txt allowance."""
+        roboturl = url_data.get_robots_txt_url()
         with self.get_lock(roboturl):
-            return self._allows_url(roboturl, url, proxy, user, password)
+            return self._allows_url(url_data, roboturl)

-    def _allows_url (self, roboturl, url, proxy, user, password):
+    def _allows_url (self, url_data, roboturl):
         """Ask robots.txt allowance. Assumes only single thread per robots.txt
         URL calls this function."""
+        user, password = url_data.get_user_password()
         with cache_lock:
             if roboturl in self.cache:
                 self.hits += 1
                 rp = self.cache[roboturl]
-                return rp.can_fetch(self.useragent, url)
+                return rp.can_fetch(self.useragent, url_data.url)
         self.misses += 1
-        rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
+        rp = robotparser2.RobotFileParser(proxy=url_data.proxy, user=user,
                                           password=password)
         rp.set_url(roboturl)
         rp.read()
         with cache_lock:
             self.cache[roboturl] = rp
-        return rp.can_fetch(self.useragent, url)
+        self.add_sitemap_urls(rp, url_data, roboturl)
+        return rp.can_fetch(self.useragent, url_data.url)
+
+    def add_sitemap_urls(self, rp, url_data, roboturl):
+        """Add sitemap URLs to queue."""
+        if not rp.sitemap_urls:
+            return
+        rec_level = url_data.aggregate.config["recursionlevel"]
+        if rec_level >= 0 and url_data.recursion_level >= rec_level:
+            return
+        for sitemap_url, line in rp.sitemap_urls:
+            sitemap_url_data = get_url_from(sitemap_url,
+                url_data.recursion_level+1, url_data.aggregate,
+                parent_url=roboturl, line=line,
+                parent_content_type=url_data.content_type)
+            if sitemap_url_data.has_result or not sitemap_url_data.extern[1]:
+                # Only queue URLs which have a result or are not strict extern.
+                url_data.aggregate.urlqueue.put(sitemap_url_data)

     @synchronized(robot_lock)
     def get_lock(self, roboturl):
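The heart of the change is add_sitemap_urls above: each (sitemap URL, line number) pair from a freshly fetched robots.txt is wrapped via get_url_from and queued, unless a configured recursion limit has already been reached. A minimal standalone sketch of that guard-and-queue rule, with hypothetical names (queue_sitemap_urls, put, max_level) standing in for the linkchecker internals:

    # Sketch only; mirrors the logic of add_sitemap_urls, not linkchecker API.
    def queue_sitemap_urls(sitemap_urls, recursion_level, max_level, put):
        if not sitemap_urls:
            return
        # A negative max_level means unlimited, as the "rec_level >= 0" test implies.
        if max_level >= 0 and recursion_level >= max_level:
            return
        for url, line in sitemap_urls:
            # Queued sitemap URLs sit one recursion level below their robots.txt.
            put(url, recursion_level + 1, line)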
@@ -60,10 +60,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         @return: True if access is granted, otherwise False
         @rtype: bool
         """
-        roboturl = self.get_robots_txt_url()
-        user, password = self.get_user_password()
-        rb = self.aggregate.robots_txt
-        return rb.allows_url(roboturl, self.url, self.proxy, user, password)
+        return self.aggregate.robots_txt.allows_url(self)

     def add_size_info (self):
         """Get size of URL content from HTTP header."""
@@ -50,6 +50,7 @@ class RobotFileParser (object):
         self.disallow_all = False
         self.allow_all = False
         self.last_checked = 0
+        # list of tuples (sitemap url, line number)
         self.sitemap_urls = []

     def mtime (self):
@@ -185,7 +186,7 @@ class RobotFileParser (object):
                 # Note that sitemap URLs must be absolute according to
                 # http://www.sitemaps.org/protocol.html#submit_robots
                 # But this should be checked by the calling layer.
-                self.sitemap_urls.append(line[1])
+                self.sitemap_urls.append((line[1], linenumber))
             else:
                 log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
                 pass
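For illustration, the parser change boils down to recording where each Sitemap: line was found. A self-contained sketch, assuming the key before the first colon is matched case-insensitively (extract_sitemap_urls is a hypothetical helper, not part of robotparser2):

    def extract_sitemap_urls(robots_txt_text):
        """Collect (sitemap_url, line_number) tuples, as the patched parse() does."""
        results = []
        for linenumber, raw in enumerate(robots_txt_text.splitlines(), start=1):
            key, _, value = raw.partition(':')
            if key.strip().lower() == 'sitemap' and value.strip():
                # The line number travels with the URL for later reporting.
                results.append((value.strip(), linenumber))
        return results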
@@ -297,4 +297,4 @@ class TestRobotsTxt (unittest.TestCase):
         ]
         self.rp.parse(lines)
         self.assertTrue(len(self.rp.sitemap_urls) > 0)
-        self.assertTrue(self.rp.sitemap_urls[0] == "bla")
+        self.assertTrue(self.rp.sitemap_urls[0] == ("bla", 1))
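Read back from the updated assertion, the observable behavior is roughly the following (a sketch; the full test fixture is not shown in this diff, but the expected tuple ("bla", 1) implies the Sitemap entry sits on line 1 of the parsed input):

    from linkcheck import robotparser2

    rp = robotparser2.RobotFileParser()
    rp.parse(["Sitemap: bla"])  # "Sitemap: bla" on line 1
    assert rp.sitemap_urls[0] == ("bla", 1)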