mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Skip incomplete Sitemap entries in robots.txt and warn
Sitemap values should be fully qualified URLs; LinkChecker may not resolve relative paths correctly, so relative Sitemap entries are discarded with a warning.
This commit is contained in:
parent
8bc849dfde
commit
7367e6e865
1 changed files with 7 additions and 0 deletions
7
linkcheck/cache/robots_txt.py
vendored
7
linkcheck/cache/robots_txt.py
vendored
|
|
@ -16,10 +16,13 @@
|
|||
"""
|
||||
Cache robots.txt contents.
|
||||
"""
|
||||
import urllib.parse
|
||||
|
||||
from .. import robotparser2
|
||||
from ..containers import LFUCache
|
||||
from ..decorators import synchronized
|
||||
from ..lock import get_lock
|
||||
from .. import log, LOG_CACHE
|
||||
|
||||
|
||||
# lock objects
|
||||
|
|
@ -70,6 +73,10 @@ class RobotsTxt:
|
|||
if not rp.sitemap_urls or not url_data.allows_simple_recursion():
|
||||
return
|
||||
for sitemap_url, line in rp.sitemap_urls:
|
||||
if not urllib.parse.urlparse(sitemap_url).scheme:
|
||||
log.warn(LOG_CACHE, _("Relative Sitemap %s in %s discarded"),
|
||||
sitemap_url, roboturl)
|
||||
continue
|
||||
url_data.add_url(sitemap_url, line=line)
|
||||
|
||||
@synchronized(robot_lock)
|
||||
|
|
|
|||
Loading…
Reference in a new issue