mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-25 08:34:43 +00:00
Prevent UnicodeDecodeError in robots.txt parsing.
This commit is contained in:
parent
b8b0398dd2
commit
7c15d28f56
2 changed files with 5 additions and 3 deletions
|
|
@@ -11,6 +11,8 @@ Fixes:
|
|||
status line. Fixes the "BadStatusLine" errors.
|
||||
- http: Prevent UnicodeDecodeError on redirection by ensuring that
|
||||
the redirected URL will be Unicode encoded.
|
||||
- checking: Prevent UnicodeDecodeError in robots.txt parser by
|
||||
encoding the linkchecker useragent string.
|
||||
- installer: Add commandline executable to Windows installer.
|
||||
Closes: SF bug #2903257
|
||||
- http: Warn about permanent redirections even when redirected URL is
|
||||
|
|
|
|||
6
linkcheck/cache/robots_txt.py
vendored
6
linkcheck/cache/robots_txt.py
vendored
|
|
@@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2006-2009 Bastian Kleineidam
|
||||
# Copyright (C) 2006-2010 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@@ -40,6 +40,7 @@ class RobotsTxt (object):
|
|||
"""
|
||||
Ask robots.txt allowance.
|
||||
"""
|
||||
useragent = str(configuration.UserAgent)
|
||||
if roboturl not in self.cache:
|
||||
rp = robotparser2.RobotFileParser(proxy=proxy, user=user,
|
||||
password=password)
|
||||
|
|
@@ -48,10 +49,9 @@ class RobotsTxt (object):
|
|||
if hasattr(callback, '__call__'):
|
||||
parts = urlutil.url_split(rp.url)
|
||||
host = "%s:%d" % (parts[1], parts[2])
|
||||
useragent = configuration.UserAgent
|
||||
wait = rp.get_crawldelay(useragent)
|
||||
callback(host, wait)
|
||||
self.cache[roboturl] = rp
|
||||
else:
|
||||
rp = self.cache[roboturl]
|
||||
return rp.can_fetch(configuration.UserAgent, url)
|
||||
return rp.can_fetch(useragent, url)
|
||||
|
|
|
|||
Loading…
Reference in a new issue