Skip relative Sitemap URLs in robots.txt and warn

Sitemap values should be fully qualified URLs; LinkChecker may not
resolve relative paths correctly.
This commit is contained in:
Chris Mayo 2022-10-17 19:21:03 +01:00
parent 8bc849dfde
commit 7367e6e865

View file

@@ -16,10 +16,13 @@
"""
Cache robots.txt contents.
"""
import urllib.parse
from .. import robotparser2
from ..containers import LFUCache
from ..decorators import synchronized
from ..lock import get_lock
from .. import log, LOG_CACHE
# lock objects
@@ -70,6 +73,10 @@ class RobotsTxt:
if not rp.sitemap_urls or not url_data.allows_simple_recursion():
return
for sitemap_url, line in rp.sitemap_urls:
if not urllib.parse.urlparse(sitemap_url).scheme:
log.warn(LOG_CACHE, _("Relative Sitemap %s in %s discarded"),
sitemap_url, roboturl)
continue
url_data.add_url(sitemap_url, line=line)
@synchronized(robot_lock)