From eab2fa410e9b064568766d7a959535ba0851db5d Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 17 Oct 2022 19:21:03 +0100 Subject: [PATCH] Log robots.txt as the sitemap parent URL This is the location the sitemap URL was found in. The line being reported is the line in robots.txt. --- linkcheck/cache/robots_txt.py | 2 +- linkcheck/checker/urlbase.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py index 6e1df7d6..4de3a6e9 100644 --- a/linkcheck/cache/robots_txt.py +++ b/linkcheck/cache/robots_txt.py @@ -77,7 +77,7 @@ class RobotsTxt: log.warn(LOG_CACHE, _("Relative Sitemap %s in %s discarded"), sitemap_url, roboturl) continue - url_data.add_url(sitemap_url, line=line) + url_data.add_url(sitemap_url, line=line, parent=roboturl) @synchronized(robot_lock) def get_lock(self, roboturl): diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index d556b6db..9bd0484f 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -793,7 +793,7 @@ class UrlBase: return (split.username, split.password) return self.aggregate.config.get_user_password(self.url) - def add_url(self, url, line=0, column=0, page=0, name="", base=None): + def add_url(self, url, line=0, column=0, page=0, name="", base=None, parent=None): """Add new URL to queue.""" if base: base_ref = urlutil.url_norm(base, encoding=self.content_encoding)[0] @@ -803,7 +803,7 @@ class UrlBase: url, self.recursion_level + 1, self.aggregate, - parent_url=self.url, + parent_url=self.url if parent is None else parent, base_ref=base_ref, line=line, column=column,