diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py
index a6c90399..6e1df7d6 100644
--- a/linkcheck/cache/robots_txt.py
+++ b/linkcheck/cache/robots_txt.py
@@ -16,10 +16,13 @@
 """
 Cache robots.txt contents.
 """
+import urllib.parse
+
 from .. import robotparser2
 from ..containers import LFUCache
 from ..decorators import synchronized
 from ..lock import get_lock
+from .. import log, LOG_CACHE


 # lock objects
@@ -70,6 +73,10 @@ class RobotsTxt:
         if not rp.sitemap_urls or not url_data.allows_simple_recursion():
             return
         for sitemap_url, line in rp.sitemap_urls:
+            if not urllib.parse.urlparse(sitemap_url).scheme:
+                log.warn(LOG_CACHE, _("Relative Sitemap %s in %s discarded"),
+                         sitemap_url, roboturl)
+                continue
             url_data.add_url(sitemap_url, line=line)

     @synchronized(robot_lock)