use a decorator to synchronize all the checker threads

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2614 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-05-19 15:49:06 +00:00
parent 3f6d2fa753
commit 55d7d5f7df
13 changed files with 246 additions and 294 deletions

View file

@ -134,13 +134,14 @@ def _check_urls (consumer):
start_time = time.time()
status_time = start_time
while not consumer.finished():
consumer.check_url()
if consumer.config['status']:
if not consumer.check_url():
time.sleep(0.1)
if consumer.config('status'):
curtime = time.time()
if (curtime - status_time) > 5:
consumer.print_status(curtime, start_time)
status_time = curtime
consumer.logger_end_output()
consumer.end_log_output()
# file extensions we can parse recursively
@ -252,7 +253,7 @@ def get_url_from (base_url, recursion_level, consumer,
pat = url_data.get_intern_pattern()
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "Pattern %r", pat)
if pat:
consumer.config['internlinks'].append(linkcheck.get_link_pat(pat))
consumer.config_append('internlinks', linkcheck.get_link_pat(pat))
return url_data

View file

@ -48,13 +48,11 @@ def _check_morsel (m, host, path):
return m.output(header='').strip()
class Cache (linkcheck.lock.AssertLock):
class Cache (object):
"""
Store and provide routines for cached data. Currently there are
caches for cookies, checked URLs, FTP connections and robots.txt
contents.
All public operations (except __init__()) are thread-safe.
"""
def __init__ (self):
@ -63,10 +61,13 @@ class Cache (linkcheck.lock.AssertLock):
"""
super(Cache, self).__init__()
# already checked URLs
# {cache key (string) -> cache data (dict)}
self.checked = {}
# URLs that are being checked
# {cache key (string) -> urldata (UrlData)}
self.in_progress = {}
# to-be-checked URLs
# [urldata (UrlData)]
self.incoming = collections.deque()
# downloaded robots.txt files
self.robots_txt = {}
@ -79,11 +80,7 @@ class Cache (linkcheck.lock.AssertLock):
"""
Check if incoming queue is empty.
"""
self.acquire()
try:
return len(self.incoming) <= 0
finally:
self.release()
return len(self.incoming) <= 0
def incoming_get_url (self):
"""
@ -91,57 +88,44 @@ class Cache (linkcheck.lock.AssertLock):
return it. If no such url is available return None. The
url might be already cached.
"""
self.acquire()
try:
for i, url_data in enumerate(self.incoming):
key = url_data.cache_url_key
if key in self.checked:
del self.incoming[i]
# url is cached and can be logged
url_data.copy_from_cache(self.checked[key])
return url_data
elif key not in self.in_progress:
del self.incoming[i]
self.in_progress[key] = url_data
return url_data
return None
finally:
self.release()
for i, url_data in enumerate(self.incoming):
key = url_data.cache_url_key
if key in self.checked:
del self.incoming[i]
# url is cached and can be logged
url_data.copy_from_cache(self.checked[key])
return url_data
elif key not in self.in_progress:
del self.incoming[i]
self.in_progress[key] = url_data
return url_data
return None
def incoming_len (self):
"""
Return number of entries in incoming queue.
"""
self.acquire()
try:
return len(self.incoming)
finally:
self.release()
return len(self.incoming)
def incoming_add (self, url_data):
"""
Add a new URL to list of URLs to check.
"""
self.acquire()
try:
linkcheck.log.debug(linkcheck.LOG_CACHE,
"Add url %s...", repr(url_data))
# check syntax
if not url_data.check_syntax():
# wrong syntax, do not check any further
return False
# check the cache
key = url_data.cache_url_key
if key in self.checked:
# url is cached and can be logged
url_data.copy_from_cache(self.checked[key])
return False
# url is not cached, so add to incoming queue
self.incoming.append(url_data)
linkcheck.log.debug(linkcheck.LOG_CACHE, "...added.")
return True
finally:
self.release()
linkcheck.log.debug(linkcheck.LOG_CACHE,
"Add url %s...", repr(url_data))
if url_data.has_result:
# do not check any further
return False
# check the cache
key = url_data.cache_url_key
if key in self.checked:
# url is cached and can be logged
url_data.copy_from_cache(self.checked[key])
return False
# url is not cached, so add to incoming queue
self.incoming.append(url_data)
linkcheck.log.debug(linkcheck.LOG_CACHE, "...added.")
return True
def has_incoming (self, key):
"""
@ -150,11 +134,7 @@ class Cache (linkcheck.lock.AssertLock):
@param key: Usually obtained from url_data.cache_url_key
@type key: String
"""
self.acquire()
try:
return key in self.incoming
finally:
self.release()
return key in self.incoming
def has_in_progress (self, key):
"""
@ -163,44 +143,32 @@ class Cache (linkcheck.lock.AssertLock):
@param key: Usually obtained from url_data.cache_url_key
@type key: String
"""
self.acquire()
try:
return key in self.in_progress
finally:
self.release()
return key in self.in_progress
def in_progress_remove (self, url_data, ignore_missing=False):
"""
Remove url from in-progress cache. If url is not cached and
ignore_missing evaluates True, raise AssertionError.
"""
self.acquire()
try:
key = url_data.cache_url_key
if key in self.in_progress:
del self.in_progress[key]
else:
assert ignore_missing, repr(key)
finally:
self.release()
key = url_data.cache_url_key
if key in self.in_progress:
del self.in_progress[key]
else:
assert ignore_missing, repr(key)
def checked_add (self, url_data):
"""
Cache checked url data.
"""
self.acquire()
try:
data = url_data.get_cache_data()
key = url_data.cache_url_key
linkcheck.log.debug(linkcheck.LOG_CACHE, "Cache key %r...", key)
assert key not in self.checked, \
key + u", " + unicode(self.checked[key])
assert key in self.in_progress, key
# move entry from self.in_progress to self.checked
del self.in_progress[key]
self.checked[key] = data
finally:
self.release()
data = url_data.get_cache_data()
key = url_data.cache_url_key
linkcheck.log.debug(linkcheck.LOG_CACHE, "Cache key %r...", key)
assert key not in self.checked, \
key + u", " + unicode(self.checked[key])
assert key in self.in_progress, key
# move entry from self.in_progress to self.checked
del self.in_progress[key]
self.checked[key] = data
def checked_redirect (self, redirect, url_data):
"""
@ -209,96 +177,69 @@ class Cache (linkcheck.lock.AssertLock):
If the redirect URL is found in the cache, the result data is
already copied.
"""
self.acquire()
try:
if redirect in self.checked:
url_data.copy_from_cache(self.checked[redirect])
return True
return False
finally:
self.release()
if redirect in self.checked:
url_data.copy_from_cache(self.checked[redirect])
return True
return False
def robots_txt_allows_url (self, roboturl, url, user, password):
"""
Ask robots.txt allowance.
"""
self.acquire()
try:
if roboturl not in self.robots_txt:
rp = linkcheck.robotparser2.RobotFileParser(
user=user, password=password)
rp.set_url(roboturl)
rp.read()
self.robots_txt[roboturl] = rp
else:
rp = self.robots_txt[roboturl]
return rp.can_fetch(linkcheck.configuration.UserAgent, url)
finally:
self.release()
if roboturl not in self.robots_txt:
rp = linkcheck.robotparser2.RobotFileParser(
user=user, password=password)
rp.set_url(roboturl)
rp.read()
self.robots_txt[roboturl] = rp
else:
rp = self.robots_txt[roboturl]
return rp.can_fetch(linkcheck.configuration.UserAgent, url)
def get_connection (self, key):
"""
Get open connection to given host. Return None if no such
connection is available (or the old one timed out).
"""
self.acquire()
try:
return self.pool.get_connection(key)
finally:
self.release()
return self.pool.get_connection(key)
def add_connection (self, key, connection, timeout):
"""
Store open connection into pool for reuse.
"""
self.acquire()
try:
self.pool.add_connection(key, connection, timeout)
finally:
self.release()
self.pool.add_connection(key, connection, timeout)
def release_connection (self, key):
"""
Remove connection from pool.
"""
self.acquire()
try:
self.pool.release_connection(key)
finally:
self.release()
self.pool.release_connection(key)
def store_cookies (self, headers, host):
"""
Thread-safe cookie cache setter function. Can raise the
exception Cookie.CookieError.
"""
self.acquire()
try:
output = []
for h in headers.getallmatchingheaders("Set-Cookie"):
output.append(h)
linkcheck.log.debug(linkcheck.LOG_CACHE, "Store cookie %s", h)
c = self.cookies.setdefault(host, Cookie.SimpleCookie())
c.load(h)
return output
finally:
self.release()
output = []
for h in headers.getallmatchingheaders("Set-Cookie"):
output.append(h)
linkcheck.log.debug(linkcheck.LOG_CACHE, "Store cookie %s", h)
c = self.cookies.setdefault(host, Cookie.SimpleCookie())
c.load(h)
return output
def get_cookies (self, host, path):
"""
Thread-safe cookie cache getter function.
"""
self.acquire()
try:
linkcheck.log.debug(linkcheck.LOG_CACHE,
"Get cookies for host %r path %r", host, path)
if not self.cookies.has_key(host):
return []
cookievals = []
for m in self.cookies[host].values():
val = _check_morsel(m, host, path)
if val:
cookievals.append(val)
return cookievals
finally:
self.release()
linkcheck.log.debug(linkcheck.LOG_CACHE,
"Get cookies for host %r path %r", host, path)
if not self.cookies.has_key(host):
return []
cookievals = []
for m in self.cookies[host].values():
val = _check_morsel(m, host, path)
if val:
cookievals.append(val)
return cookievals

View file

@ -20,14 +20,22 @@ Url consumer class.
import sys
import time
try:
import thread
except ImportError:
import dummy_thread as thread
import linkcheck.threader
import linkcheck.log
import linkcheck.lock
import linkcheck.strformat
import linkcheck.checker.geoip
from linkcheck.decorators import synchronized
from urlbase import stderr
# global lock for synchronizing all the checker threads
_lock = thread.allocate_lock()
def print_tocheck (tocheck):
msg = _n("%5d URL queued,", "%5d URLs queued,", tocheck) % tocheck
@ -49,7 +57,7 @@ def print_duration (duration):
print >> stderr, msg,
class Consumer (linkcheck.lock.AssertLock):
class Consumer (object):
"""
Consume URLs from the URL queue in a thread-safe manner.
"""
@ -59,33 +67,46 @@ class Consumer (linkcheck.lock.AssertLock):
Initialize consumer data and threads.
"""
super(Consumer, self).__init__()
self.config = config
self.cache = cache
self.threader = linkcheck.threader.Threader(num=config['threads'])
self.logger = config['logger']
self.fileoutput = config['fileoutput']
self.logger_start_output()
self._config = config
self._cache = cache
self._threader = linkcheck.threader.Threader(num=config['threads'])
self.start_log_output()
@synchronized(_lock)
def config (self, key):
# Thread-safe read accessor for a single configuration value; replaces
# the former direct dict access (consumer.config[key]) and is serialized
# on the module-level checker lock via the synchronized decorator.
return self._config[key]
@synchronized(_lock)
def config_append (self, key, val):
# Thread-safe append to a list-valued configuration entry
# (used e.g. for the 'internlinks' pattern list).
self._config[key].append(val)
@synchronized(_lock)
def __getattr__ (self, name):
# Delegate attribute lookups that miss on the consumer to the wrapped
# cache object, so callers can invoke cache methods (get_connection,
# store_cookies, robots_txt_allows_url, ...) directly on the consumer;
# each delegated call is serialized on the global checker lock.
# NOTE(review): __getattr__ only fires for attributes not found through
# normal lookup; if self._cache were ever unset this would recurse --
# confirm _cache is assigned in __init__ before any other access.
if hasattr(self._cache, name):
return getattr(self._cache, name)
raise AttributeError(name)
@synchronized(_lock)
def append_url (self, url_data):
"""
Append url to incoming check list.
"""
if not self.cache.incoming_add(url_data):
if not self._cache.incoming_add(url_data):
# can be logged
self.logger_log_url(url_data)
self._log_url(url_data)
@synchronized(_lock)
def check_url (self):
"""
Start new thread checking the given url.
"""
url_data = self.cache.incoming_get_url()
url_data = self._cache.incoming_get_url()
if url_data is None:
# active connections are downloading/parsing, so
# wait a little
time.sleep(0.1)
# active connections are downloading/parsing
pass
elif url_data.cached:
# was cached -> can be logged
self.logger_log_url(url_data)
self._log_url(url_data)
else:
# go check this url
# this calls either self.checked() or self.interrupted()
@ -95,47 +116,48 @@ class Consumer (linkcheck.lock.AssertLock):
else:
name = u""
name += url_data.base_url
self.threader.start_thread(url_data.check, (), name=name)
self._threader.start_thread(url_data.check, (), name=name)
return url_data and not url_data.cached
@synchronized(_lock)
def checked (self, url_data):
"""
Put checked url in cache and log it.
"""
# log before putting it in the cache (otherwise we would see
# a "(cached)" after every url
self.logger_log_url(url_data)
self._log_url(url_data)
if not url_data.cached:
self.cache.checked_add(url_data)
self._cache.checked_add(url_data)
else:
self.cache.in_progress_remove(url_data)
self._cache.in_progress_remove(url_data)
@synchronized(_lock)
def interrupted (self, url_data):
"""
Remove url from active list.
"""
self.cache.in_progress_remove(url_data, ignore_missing=True)
self._cache.in_progress_remove(url_data, ignore_missing=True)
@synchronized(_lock)
def finished (self):
"""
Return True if checking is finished.
"""
# avoid deadlock by requesting cache data before locking
tocheck = self.cache.incoming_len()
self.acquire()
try:
return self.threader.finished() and tocheck == 0
finally:
self.release()
return self._threader.finished() and \
self._cache.incoming_len() == 0
@synchronized(_lock)
def finish (self):
# Tell the threader to wind down; called from abort() while waiting
# for active checker threads to terminate.
self._threader.finish()
@synchronized(_lock)
def no_more_threads (self):
"""
Return True if no more active threads are running.
"""
self.acquire()
try:
return self.threader.finished()
finally:
self.release()
return self._threader.finished()
def abort (self):
"""
@ -148,7 +170,7 @@ class Consumer (linkcheck.lock.AssertLock):
if num_waited > wait_max:
linkcheck.log.error(linkcheck.LOG_CHECK,
"Thread wait timeout")
self.logger_end_output()
self.end_log_output()
sys.exit(1)
num = self.active_threads()
msg = \
@ -156,94 +178,69 @@ class Consumer (linkcheck.lock.AssertLock):
"keyboard interrupt; waiting for %d active threads to finish",
num)
linkcheck.log.warn(linkcheck.LOG_CHECK, msg, num)
self.acquire()
try:
self.threader.finish()
finally:
self.release()
self.finish()
num_waited += 1
time.sleep(2)
self.logger_end_output()
self.end_log_output()
@synchronized(_lock)
def print_status (self, curtime, start_time):
"""
Print check status looking at url queues.
"""
# avoid deadlock by requesting cache data before locking
tocheck = self.cache.incoming_len()
active = self.active_threads()
self.acquire()
try:
print >> stderr, _("Status:"),
print_active(active)
print_links(self.logger.number)
print_tocheck(tocheck)
print_duration(curtime - start_time)
print >> stderr
finally:
self.release()
print >> stderr, _("Status:"),
print_active(self._threader.active_threads())
print_links(self._config['logger'].number)
print_tocheck(self._cache.incoming_len())
print_duration(curtime - start_time)
print >> stderr
def logger_start_output (self):
@synchronized(_lock)
def start_log_output (self):
"""
Start output of all configured loggers.
"""
self.acquire()
try:
self.logger.start_output()
for logger in self.fileoutput:
logger.start_output()
finally:
self.release()
self._config['logger'].start_output()
for logger in self._config['fileoutput']:
logger.start_output()
def logger_log_url (self, url_data):
def _log_url (self, url_data):
"""
Send new url to all configured loggers.
"""
self.acquire()
try:
do_print = self.config["verbose"] or not url_data.valid or \
(url_data.warning and self.config["warnings"])
self.logger.log_filter_url(url_data, do_print)
for log in self.fileoutput:
log.log_filter_url(url_data, do_print)
finally:
self.release()
do_print = self._config["verbose"] or not url_data.valid or \
(url_data.warning and self._config["warnings"])
self._config['logger'].log_filter_url(url_data, do_print)
for log in self._config['fileoutput']:
log.log_filter_url(url_data, do_print)
# do_filter = (self.linknumber % 1000) == 0
# XXX deadlock!
#if do_filter:
# self.filter_queue(self)
def logger_end_output (self):
@synchronized(_lock)
def end_log_output (self):
"""
End output of all configured loggers.
"""
self.acquire()
try:
self.logger.end_output()
for logger in self.fileoutput:
logger.end_output()
finally:
self.release()
self._config['logger'].end_output()
for logger in self._config['fileoutput']:
logger.end_output()
@synchronized(_lock)
def active_threads (self):
"""
Return number of active threads.
"""
self.acquire()
try:
return self.threader.active_threads()
finally:
self.release()
return self._threader.active_threads()
@synchronized(_lock)
def get_country_name (self, host):
"""
Return country code for host if found, else None.
"""
self.acquire()
try:
gi = self.config["geoip"]
if gi:
return linkcheck.checker.geoip.get_country(gi, host)
return None
finally:
self.release()
gi = self._config["geoip"]
if gi:
return linkcheck.checker.geoip.get_country(gi, host)
return None

View file

@ -58,12 +58,12 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
order: login, changing directory, list the file.
"""
# proxy support (we support only http)
self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
self.set_proxy(self.consumer.config("proxy").get(self.scheme))
if self.proxy:
# using a (HTTP) proxy
http = httpurl.HttpUrl(self.base_url,
self.recursion_level,
self.consumer.config,
self.consumer,
parent_url=self.parent_url,
base_ref=self.base_ref,
line=self.line,
@ -92,7 +92,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# ready to connect
_user, _password = self.get_user_password()
key = ("ftp", self.urlparts[1], _user, _password)
conn = self.consumer.cache.get_connection(key)
conn = self.consumer.get_connection(key)
if conn is not None and conn.sock is not None:
# reuse cached FTP connection
self.url_connection = conn
@ -250,6 +250,6 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# add to cached connections
_user, _password = self.get_user_password()
key = ("ftp", self.urlparts[1], _user, _password)
cache_add = self.consumer.cache.add_connection
cache_add = self.consumer.add_connection
cache_add(key, self.url_connection, DEFAULT_TIMEOUT_SECS)
self.url_connection = None

View file

@ -78,8 +78,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
roboturl = self.get_robots_txt_url()
user, password = self.get_user_password()
return self.consumer.cache.robots_txt_allows_url(roboturl, url,
user, password)
return self.consumer.robots_txt_allows_url(roboturl, url,
user, password)
def check_connection (self):
"""
@ -124,15 +124,17 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
| extension-code
"""
# set the proxy, so a 407 status after this is an error
self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
self.set_proxy(self.consumer.config("proxy").get(self.scheme))
# initialize check data
self.headers = None
self.auth = None
self.cookies = []
# check robots.txt
if not self.allows_robots(self.url):
self.add_info(
# remove all previously stored results
self.add_warning(
_("Access denied by robots.txt, checked only syntax."))
self.set_result(u"syntax OK")
return
# check for amazon server quirk
if _is_amazon(self.urlparts[1]):
@ -144,12 +146,23 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.method = "HEAD"
# check the http connection
response, fallback_GET = self.check_http_connection()
if self.headers and self.headers.has_key("Server"):
server = self.headers['Server']
else:
server = _("unknown")
if fallback_GET:
self.add_info(_("Server %r did not support HEAD request; "\
"a GET request was used instead.") % server)
if self.no_anchor:
self.add_warning(_("Server %r had no anchor support, removed"\
" anchor from request.") % server)
# redirections might have changed the URL
newurl = urlparse.urlunsplit(self.urlparts)
if self.url != newurl:
self.url = newurl
# check response
self.check_response(response, fallback_GET)
if response:
self.check_response(response)
def check_http_connection (self):
"""
@ -205,7 +218,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
raise
if tries == -1:
linkcheck.log.debug(linkcheck.LOG_CHECK, "already handled")
return response, fallback_GET
return None, fallback_GET
if tries >= self.max_redirects:
if self.method == "HEAD":
# Microsoft servers tend to recurse HEAD requests
@ -276,11 +289,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
if self.is_extern():
self.add_info(
_("Outside of domain filter, checked only syntax."))
self.set_result(u"filtered")
return -1, response
# check robots.txt allowance again
if not self.allows_robots(redirected):
self.add_warning(
_("Access denied by robots.txt, checked only syntax."))
self.set_result(u"syntax OK")
return -1, response
# see about recursive redirect
all_seen = [self.cache_url_key] + self.aliases
@ -330,7 +345,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
tries += 1
return tries, response
def check_response (self, response, fallback_GET):
def check_response (self, response):
"""
Check final result and log it.
"""
@ -338,27 +353,17 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.set_result(u"%r %s" % (response.status, response.reason),
valid=False)
else:
if self.headers and self.headers.has_key("Server"):
server = self.headers['Server']
else:
server = _("unknown")
if fallback_GET:
self.add_info(_("Server %r did not support HEAD request; "\
"a GET request was used instead.") % server)
if self.no_anchor:
self.add_warning(_("Server %r had no anchor support, removed"\
" anchor from request.") % server)
if response.status == 204:
# no content
self.add_warning(
linkcheck.strformat.unicode_safe(response.reason))
# store cookies for valid links
if self.consumer.config['cookies']:
if self.consumer.config('cookies'):
for c in self.cookies:
self.add_info(_("Store cookie: %s.") % c)
try:
out = self.consumer.cache.store_cookies(self.headers,
self.urlparts[1])
out = self.consumer.store_cookies(self.headers,
self.urlparts[1])
for h in out:
self.add_info(linkcheck.strformat.unicode_safe(h))
except Cookie.CookieError, msg:
@ -414,9 +419,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
linkcheck.configuration.UserAgent)
self.url_connection.putheader("Accept-Encoding",
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
if self.consumer.config['cookies']:
self.cookies = self.consumer.cache.get_cookies(self.urlparts[1],
self.urlparts[2])
if self.consumer.config('cookies'):
self.cookies = self.consumer.get_cookies(self.urlparts[1],
self.urlparts[2])
for c in self.cookies:
self.url_connection.putheader("Cookie", c)
self.url_connection.endheaders()
@ -439,7 +444,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
_user, _password = self.get_user_password()
key = (scheme, self.urlparts[1], _user, _password)
conn = self.consumer.cache.get_connection(key)
conn = self.consumer.get_connection(key)
if conn is not None:
linkcheck.log.debug(linkcheck.LOG_CHECK,
"reuse cached HTTP(S) connection %s", conn)
@ -566,7 +571,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# add to cached connections
_user, _password = self.get_user_password()
key = ("http", self.urlparts[1], _user, _password)
cache_add = self.consumer.cache.add_connection
cache_add = self.consumer.add_connection
# note: only cache the connection when it is persistent
# and all pending content has been received
if not self.persistent or not self.has_content or \

View file

@ -40,7 +40,7 @@ class NntpUrl (urlbase.UrlBase):
Connect to NNTP server and try to request the URL article
resource (if specified).
"""
nntpserver = self.host or self.consumer.config["nntpserver"]
nntpserver = self.host or self.consumer.config("nntpserver")
if not nntpserver:
self.add_warning(
_("No NNTP server was specified, skipping this URL."))

View file

@ -59,7 +59,7 @@ class ProxySupport (object):
"""
Check if self.host is in the no-proxy-for ignore list.
"""
for ro in self.consumer.config["noproxyfor"]:
for ro in self.consumer.config("noproxyfor"):
if ro.search(self.host):
return True
return False

View file

@ -59,7 +59,7 @@ class TelnetUrl (urlbase.UrlBase):
label is "login: ", expected password label is "Password: ".
"""
self.url_connection = telnetlib.Telnet()
if self.consumer.config.get("debug"):
if self.consumer.config("debug"):
self.url_connection.set_debuglevel(1)
self.url_connection.open(self.host, self.port)
if self.user:

View file

@ -140,8 +140,9 @@ class UrlBase (object):
self.urlparts = None
# the anchor part of url
self.anchor = None
# the result message string
# the result message string and flag
self.result = u""
self.has_result = False
# cached or not
self.cached = False
# valid or not
@ -166,11 +167,17 @@ class UrlBase (object):
# cache keys, are set by build_url() calling set_cache_keys()
self.cache_url_key = None
self.cache_content_key = None
self.check_syntax()
def set_result (self, msg, valid=True):
"""
Set result string and validity.

@param msg: result message to store
@param valid: whether the URL is considered valid
"""
# Warn when a result has already been stored; note the assignment
# below still overwrites the previous result in that case.
if self.has_result:
linkcheck.log.warn(linkcheck.LOG_CHECK,
"Double result %r (previous %r)", msg, self.result)
else:
# remember that a result has been stored at least once
self.has_result = True
self.result = msg
self.valid = valid
@ -245,8 +252,8 @@ class UrlBase (object):
linkcheck.log.debug(linkcheck.LOG_CACHE, "Content cache key %r",
self.cache_content_key)
# construct cache key
if self.consumer.config["anchorcaching"] and \
self.consumer.config["anchors"]:
if self.consumer.config("anchorcaching") and \
self.consumer.config("anchors"):
# do not ignore anchor
parts = self.urlparts[:]
parts[4] = self.anchor
@ -271,7 +278,7 @@ class UrlBase (object):
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
if not self.base_url:
self.set_result(_("URL is empty"), valid=False)
return False
return
try:
self.build_url()
# check url warnings
@ -282,10 +289,9 @@ class UrlBase (object):
except linkcheck.LinkCheckerError, msg:
self.set_result(linkcheck.strformat.unicode_safe(msg),
valid=False)
return False
return
self.set_cache_keys()
self.extern = self._get_extern(self.url)
return True
def build_url (self):
"""
@ -338,7 +344,7 @@ class UrlBase (object):
"""
Main check function for checking this URL.
"""
if self.consumer.config["trace"]:
if self.consumer.config("trace"):
linkcheck.log.trace()
try:
self.local_check()
@ -372,11 +378,11 @@ class UrlBase (object):
Local check function can be overridden in subclasses.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
if self.recursion_level and self.consumer.config['wait']:
if self.recursion_level and self.consumer.config('wait'):
linkcheck.log.debug(linkcheck.LOG_CHECK,
"sleeping for %d seconds",
self.consumer.config['wait'])
time.sleep(self.consumer.config['wait'])
self.consumer.config('wait'))
time.sleep(self.consumer.config('wait'))
t = time.time()
if self.is_extern():
self.add_info(_("Outside of domain filter, checked only syntax."))
@ -387,7 +393,7 @@ class UrlBase (object):
try:
self.check_connection()
self.add_country_info()
if self.consumer.config["anchors"]:
if self.consumer.config("anchors"):
self.check_anchors()
except tuple(linkcheck.checker.ExcList):
etype, evalue, etb = sys.exc_info()
@ -403,7 +409,7 @@ class UrlBase (object):
valid=False)
# check content
warningregex = self.consumer.config["warningregex"]
warningregex = self.consumer.config("warningregex")
if warningregex and self.valid:
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content")
try:
@ -469,8 +475,8 @@ class UrlBase (object):
return self.valid and \
self.is_parseable() and \
self.can_get_content() and \
(self.consumer.config["recursionlevel"] < 0 or
self.recursion_level < self.consumer.config["recursionlevel"]) and \
(self.consumer.config("recursionlevel") < 0 or
self.recursion_level < self.consumer.config("recursionlevel")) and \
not self.extern[0] and self.content_allows_robots()
def content_allows_robots (self):
@ -533,13 +539,13 @@ class UrlBase (object):
@return: a tuple (is_extern, is_strict)
@rtype: tuple (bool, bool)
"""
for entry in self.consumer.config["externlinks"]:
for entry in self.consumer.config("externlinks"):
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern URL %r", url)
return (1, entry['strict'])
for entry in self.consumer.config["internlinks"]:
for entry in self.consumer.config("internlinks"):
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
@ -582,7 +588,7 @@ class UrlBase (object):
If a maximum size was given, call this function to check it
against the content size of this url.
"""
maxbytes = self.consumer.config["warnsizebytes"]
maxbytes = self.consumer.config("warnsizebytes")
if maxbytes is not None and self.dlsize >= maxbytes:
self.add_warning(_("Content size %s is larger than %s.") % \
(linkcheck.strformat.strsize(self.dlsize),
@ -602,7 +608,7 @@ class UrlBase (object):
Get tuple (user, password) from configured authentication.
Both user and password can be None if not specified.
"""
for auth in self.consumer.config["authentication"]:
for auth in self.consumer.config("authentication"):
if auth['pattern'].match(self.url):
return auth['user'], auth['password']
return None, None
@ -728,7 +734,7 @@ class UrlBase (object):
@rtype: string
"""
s = self.serialized()
return self.consumer.config['logger'].encode(s)
return self.consumer.config('logger').encode(s)
def __repr__ (self):
"""

View file

@ -4,6 +4,7 @@ Simple decorators (usable in Python >= 2.4).
import warnings
import signal
import os
import thread
def deprecated (func):
"""

View file

@ -163,9 +163,9 @@ class StandardTest (unittest.TestCase):
url, 0, consumer, cmdline=cmdline)
consumer.append_url(url_data)
linkcheck.checker.check_urls(consumer)
if consumer.config['logger'].diff:
if consumer.config('logger').diff:
sep = unicode(os.linesep)
l = [url] + consumer.config['logger'].diff
l = [url] + consumer.config('logger').diff
l = sep.join(l)
self.fail(l.encode("iso8859-1", "ignore"))
@ -187,8 +187,8 @@ class StandardTest (unittest.TestCase):
url, 0, consumer, cmdline=cmdline)
consumer.append_url(url_data)
linkcheck.checker.check_urls(consumer)
if consumer.config['logger'].diff:
if consumer.config('logger').diff:
sep = unicode(os.linesep)
l = [url] + consumer.config['logger'].diff
l = [url] + consumer.config('logger').diff
l = sep.join(l)
self.fail(l.encode("iso8859-1", "ignore"))

View file

@ -145,9 +145,9 @@ def get_locale ():
loc = None
try:
loc = locale.getdefaultlocale()[0]
except ValueError:
# workaround (XXX delete this when python2.5 is fixed)
pass
except ValueError, msg:
# workaround for XXX
print >>sys.stderr, "WARNING", msg
if loc is None:
return 'C'
loc = locale.normalize(loc)

View file

@ -29,6 +29,7 @@ import cStringIO as StringIO
import linecache
import sys
import re
import time
try:
import thread as _thread
except ImportError:
@ -71,8 +72,8 @@ def _traceit (frame, event, arg):
if filename.endswith(".pyc") or filename.endswith(".pyo"):
filename = filename[:-1]
line = linecache.getline(filename, lineno)
print "THREAD(%d) %s:%d: %s" % \
(_thread.get_ident(), name, lineno, line.rstrip())
print "THREAD(%d) %.2f %s:%d: %s" % \
(_thread.get_ident(), time.time(), name, lineno, line.rstrip())
return _traceit
def trace ():