From 7367e6e865405f17fa97f428cd039ab7900998b7 Mon Sep 17 00:00:00 2001
From: Chris Mayo
Date: Mon, 17 Oct 2022 19:21:03 +0100
Subject: [PATCH] Skip incomplete Sitemap in robots.txt and warn

Sitemap values should be fully qualified URLs; LinkChecker may not resolve
relative paths correctly.
---
 linkcheck/cache/robots_txt.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py
index a6c90399..6e1df7d6 100644
--- a/linkcheck/cache/robots_txt.py
+++ b/linkcheck/cache/robots_txt.py
@@ -16,10 +16,13 @@
 """
 Cache robots.txt contents.
 """
+import urllib.parse
+
 from .. import robotparser2
 from ..containers import LFUCache
 from ..decorators import synchronized
 from ..lock import get_lock
+from .. import log, LOG_CACHE
 
 
 # lock objects
@@ -70,6 +73,10 @@ class RobotsTxt:
         if not rp.sitemap_urls or not url_data.allows_simple_recursion():
             return
         for sitemap_url, line in rp.sitemap_urls:
+            if not urllib.parse.urlparse(sitemap_url).scheme:
+                log.warn(LOG_CACHE, _("Relative Sitemap %s in %s discarded"),
+                         sitemap_url, roboturl)
+                continue
             url_data.add_url(sitemap_url, line=line)
 
     @synchronized(robot_lock)