split off cache and url consumer routines into separate classes

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1432 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-14 09:33:09 +00:00 · 2004-08-19 21:35:47 +00:00 · 2004-08-19 21:35:47 +00:00 · f2e7ca6040
commit f2e7ca6040
parent c49ac001d1
12 changed files with 303 additions and 168 deletions
--- a/linkcheck/checker/init.py
+++ b/linkcheck/checker/init.py
@ -137,55 +137,42 @@ acap        # application configuration access protocol
 ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)


-def print_status (config, curtime, start_time):
-    """print check status looking at url queues"""
-    tocheck = len(config.urls)
-    links = config['linknumber']
-    active = config.threader.active_threads()
-    duration = linkcheck.strformat.strduration(curtime - start_time)
-    print >> sys.stderr, \
-     _("%5d urls queued, %4d links checked, %2d active threads, runtime %s")\
-     % (tocheck, links, active, duration)
-
-
 # main check function
-def check_urls (config):
+def check_urls (consumer):
    """Gets a consumer object as parameter; its config attribute stores
       all runtime-dependent options. If you call this function more
       than once, you can specify different consumers.
-
-       In the config object there are functions to get new URLs to check,
-       and to perform the actual checking.
    """
-    config.logger_start_output()
    try:
-        start_time = time.time()
-        status_time = start_time
-        while True:
-            if config.has_more_urls():
-                config.check_url(config.get_url())
-            elif config.finished():
-                break
-            else:
-                # active connections are downloading/parsing, so
-                # wait a little
-                time.sleep(0.1)
-            if config['status']:
-                curtime = time.time()
-                if (curtime - status_time) > 5:
-                    print_status(config, curtime, start_time)
-                    status_time = curtime
-        config.logger_end_output()
+        _check_urls(consumer)
    except KeyboardInterrupt:
-        config.finish()
-        config.logger_end_output()
-        active = config.threader.active_threads()
+        consumer.finish()
        linkcheck.log.warn(linkcheck.LOG_CHECK,
             _("keyboard interrupt; waiting for %d active threads to finish"),
-             active)
+             consumer.active_threads())
        raise


+def _check_urls (consumer):
+    consumer.logger_start_output()
+    start_time = time.time()
+    status_time = start_time
+    while not consumer.finished():
+        url = consumer.get_url()
+        if url is not None:
+            consumer.check_url(url)
+        else:
+            # active connections are downloading/parsing, so
+            # wait a little
+            time.sleep(0.1)
+        if consumer.config['status']:
+            curtime = time.time()
+            if (curtime - status_time) > 5:
+                consumer.print_status(curtime, start_time)
+                status_time = curtime
+    consumer.logger_end_output()
+
+
 # file extensions we can parse recursively
 extensions = {
    "html": re.compile(r'(?i)\.s?html?$'),
@ -237,9 +224,9 @@ def absolute_url (base_url, base_ref, parent_url):
    return ""


-def get_url_from (base_url, recursion_level, config, parent_url=None,
-                  base_ref=None, line=0, column=0, name=None,
-                  cmdline=None):
+def get_url_from (base_url, recursion_level, consumer,
+                  parent_url=None, base_ref=None, line=0, column=0,
+                  name=None, cmdline=None):
    """get url data from given base data"""
    if cmdline and linkcheck.url.url_needs_quoting(base_url):
        base_url = linkcheck.url.url_quote(base_url)
@ -269,9 +256,10 @@ def get_url_from (base_url, recursion_level, config, parent_url=None,
    # assume local file
    else:
        klass = linkcheck.checker.fileurl.FileUrl
-    if cmdline and url and config['strict'] and \
-       not (config['internlinks'] or config['externlinks']):
+    if cmdline and url and consumer.config['strict'] and \
+       not (consumer.config['internlinks'] or consumer.config['externlinks']):
        # set automatic intern/extern stuff if no filter was given
-        set_intern_url(url, klass, config)
-    return klass(base_url, recursion_level, config, parent_url, base_ref,
+        set_intern_url(url, klass, consumer.config)
+    return klass(base_url, recursion_level, consumer,
+                 parent_url=parent_url, base_ref=base_ref,
                 line=line, column=column, name=name)
--- a/linkcheck/checker/consumer.py
+++ b/linkcheck/checker/consumer.py
@ -0,0 +1,183 @@
+# -*- coding: iso-8859-1 -*-
+"""url consumer class"""
+# Copyright (C) 2000-2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+import sys
+try:
+    import threading
+except ImportError:
+    import dummy_threading as threading
+
+import linkcheck.threader
+
+from linkcheck.i18n import _
+
+class Consumer (object):
+    """consume urls from the url queue in a threaded manner"""
+
+    def __init__ (self, config, cache):
+        """initialize consumer data and threads"""
+        self.config = config
+        self.cache = cache
+        self.urls = []
+        self.threader = linkcheck.threader.Threader()
+        self._set_threads(config['threads'])
+        self.logger = config['logger']
+        self.fileoutput = config['fileoutput']
+        self.linknumber = 0
+        # one lock for the data
+        self.lock = threading.Lock()
+
+    def filter_url_queue (self):
+        """remove already cached urls from queue"""
+        pass # deadlock!
+        #self.lock.acquire()
+        #try:
+        #    urls = []
+        #    for url_data in self.urls:
+        #        if self.cache.check_cache(url_data):
+        #            self.logger_new_url(url_data)
+        #        else:
+        #            urls.append(url_data)
+        #    self.urls = urls
+        #    print >> sys.stderr, \
+        #      _("removed %d cached urls from incoming queue") % len(removed)
+        #finally:
+        #    self.lock.release()
+
+    def _set_threads (self, num):
+        """set number of checker threads to start"""
+        linkcheck.log.debug(linkcheck.LOG_CHECK,
+                            "set threading with %d threads", num)
+        self.threader.threads_max = num
+        if num > 0:
+            sys.setcheckinterval(50)
+        else:
+            sys.setcheckinterval(100)
+
+    def check_url (self, url_data):
+        """start new thread checking the given url"""
+        self.threader.start_thread(url_data.check, ())
+
+    def append_url (self, url_data):
+        """add new url to list of urls to check"""
+        # check syntax
+        if not url_data.check_syntax():
+            # wrong syntax, do not check any further
+            return
+        # check the cache
+        if self.cache.check_cache(url_data):
+            # already cached
+            self.logger_new_url(url_data)
+            return
+        self.lock.acquire()
+        try:
+            self.urls.append(url_data)
+        finally:
+            self.lock.release()
+
+    def finished (self):
+        """return True if checking is finished"""
+        self.lock.acquire()
+        try:
+            return self.threader.finished() and len(self.urls) <= 0
+        finally:
+            self.lock.release()
+
+    def get_url (self):
+        """get first url in queue and return it"""
+        self.lock.acquire()
+        try:
+            if not self.urls:
+                return None
+            u = self.urls[0]
+            del self.urls[0]
+            return u
+        finally:
+            self.lock.release()
+
+    def finish (self):
+        """finish checking and send end-of-output message to logger"""
+        self.lock.acquire()
+        try:
+            self.threader.finish()
+        finally:
+            self.lock.release()
+        self.logger_end_output()
+
+    def print_status (self, curtime, start_time):
+        """print check status looking at url queues"""
+        self.lock.acquire()
+        try:
+            active = self.threader.active_threads()
+            links = self.linknumber
+            tocheck = len(self.urls)
+            duration = linkcheck.strformat.strduration(curtime - start_time)
+            print >> sys.stderr, _("%5d urls queued, %4d links checked, "\
+                                   "%2d active threads, runtime %s")\
+                                 % (tocheck, links, active, duration)
+        finally:
+            self.lock.release()
+
+    def logger_start_output (self):
+        """start output of all configured loggers"""
+        self.lock.acquire()
+        try:
+            if not self.config['quiet']:
+                self.logger.start_output()
+            for logger in self.fileoutput:
+                logger.start_output()
+        finally:
+            self.lock.release()
+
+    def logger_new_url (self, url_data):
+        """send new url to all configured loggers"""
+        self.lock.acquire()
+        try:
+            self.linknumber += 1
+            do_filter = (self.linknumber % 1000) == 0
+            if not self.config['quiet'] and \
+              (self.config["verbose"] or not url_data.valid or
+               (url_data.warning and self.config["warnings"])):
+                self.logger.new_url(url_data)
+            for log in self.fileoutput:
+                log.new_url(url_data)
+        finally:
+            self.lock.release()
+        # XXX deadlock!
+        #if do_filter:
+        #    self.filter_queue(self)
+
+    def logger_end_output (self):
+        """end output of all configured loggers"""
+        self.lock.acquire()
+        try:
+            if not self.config['quiet']:
+                self.logger.end_output(linknumber=self.linknumber)
+            for logger in self.fileoutput:
+                logger.end_output(linknumber=self.linknumber)
+        finally:
+            self.lock.release()
+
+    def active_threads (self):
+        """return number of active threads"""
+        self.lock.acquire()
+        try:
+            return self.threader.active_threads()
+        finally:
+            self.lock.release()
+
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@ -53,15 +53,12 @@ def get_index_html (dirname):
 class FileUrl (urlbase.UrlBase):
    "Url link with file scheme"

-    def __init__ (self,
-                  base_url,
-                  config,
-                  recursion_level,
+    def __init__ (self, base_url, recursion_level, consumer,
                  parent_url = None,
                  base_ref = None, line=0, column=0, name=""):
-        super(FileUrl, self).__init__(base_url, config, recursion_level,
-                                    parent_url=parent_url, base_ref=base_ref,
-                                    line=line, column=column, name=name)
+        super(FileUrl, self).__init__(base_url, recursion_level, consumer,
+             parent_url=parent_url, base_ref=base_ref,
+             line=line, column=column, name=name)
        if not (parent_url or base_ref or self.base_url.startswith("file:")):
            self.base_url = os.path.expanduser(self.base_url)
            if not self.base_url.startswith("/"):
--- a/linkcheck/checker/ftpurl.py
+++ b/linkcheck/checker/ftpurl.py
@ -32,11 +32,11 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):

    def check_connection (self):
        # proxy support (we support only http)
-        self.set_proxy(self.config["proxy"].get(self.scheme))
+        self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
        if self.proxy:
            http = httpurl.HttpUrl(self.base_url,
                  self.recursion_level,
-                  self.config,
+                  self.consumer.config,
                  parent_url=self.parent_url,
                  base_ref=self.base_ref,
                  line=self.line,
@ -80,7 +80,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
        # ready to connect
        try:
            self.url_connection = ftplib.FTP()
-            if self.config.get("debug"):
+            if self.consumer.config.get("debug"):
                self.url_connection.set_debuglevel(1)
            self.url_connection.connect(self.urlparts[1])
            self.url_connection.login(_user, _password)
--- a/linkcheck/checker/httpsurl.py
+++ b/linkcheck/checker/httpsurl.py
@ -28,4 +28,4 @@ class HttpsUrl (httpurl.HttpUrl):
            super(HttpsUrl, self).local_check()
        else:
            self.add_warning(_("%s url ignored")%self.scheme.capitalize())
-            self.log_me()
+            self.consumer.logger_new_url(self)
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@ -46,11 +46,11 @@ _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
 class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
    "Url link with http scheme"

-    def __init__ (self, base_url, recursion_level, config, parent_url=None,
-                  base_ref=None, line=0, column=0, name=""):
-        super(HttpUrl, self).__init__(base_url, recursion_level, config,
-                         parent_url=parent_url, base_ref=base_ref, line=line,
-                         column=column, name=name)
+    def __init__ (self, base_url, recursion_level, consumer,
+                  parent_url=None, base_ref=None, line=0, column=0, name=""):
+        super(HttpUrl, self).__init__(base_url, recursion_level, consumer,
+               parent_url=parent_url, base_ref=base_ref, line=line,
+               column=column, name=name)
        self.aliases = []
        self.max_redirects = 5
        self.has301status = False
@ -109,13 +109,13 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
        | extension-code
        """
        # set the proxy, so a 407 status after this is an error
-        self.set_proxy(self.config["proxy"].get(self.scheme))
+        self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
        if self.proxy:
            self.add_info(_("Using Proxy %r") % self.proxy)
        self.headers = None
        self.auth = None
        self.cookies = []
-        if not self.robots_txt_allows_url():
+        if not self.consumer.cache.robots_txt_allows_url(self):
            self.add_warning(
                       _("Access denied by robots.txt, checked only syntax"))
            return
@ -235,6 +235,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                self.set_result(
                     _("recursive redirection encountered:\n %s") % \
                            "\n  => ".join(redirect_cache), valid=False)
+                self.consumer.logger_new_url(self)
                return -1, response
            redirect_cache.append(redirected)
            # remember this alias
@ -252,11 +253,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                    self.has301status = True
                self.aliases.append(redirected)
            # check cache again on possibly changed URL
-            key = self.get_cache_key()
-            if self.config.url_cache_has_key(key):
-                self.copy_from_cache(self.config.url_cache_get(key))
-                self.cached = True
-                self.log_me()
+            if self.consumer.cache.check_cache(self):
+                self.consumer.logger_new_url(self)
                return -1, response
            # check if we still have a http url, it could be another
            # scheme, eg https or news
@ -266,15 +264,14 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                             "the original url was %r.") % self.url)
                # make new Url object
                newobj = linkcheck.checker.get_url_from(
-                          redirected, self.recursion_level, self.config,
+                          redirected, self.recursion_level, self.consumer,
                          parent_url=self.parent_url, base_ref=self.base_ref,
                          line=self.line, column=self.column, name=self.name)
                newobj.warning = self.warning
                newobj.info = self.info
                # append new object to queue
-                self.config.append_url(newobj)
+                self.consumer.append_url(newobj)
                # pretend to be finished and logged
-                self.cached = True
                return -1, response
            # new response data
            response = self._get_http_response()
@ -302,10 +299,10 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                # no content
                self.add_warning(response.reason)
            # store cookies for valid links
-            if self.config['cookies']:
+            if self.consumer.config['cookies']:
                for c in self.cookies:
                    self.add_info("Cookie: %s" % c)
-                out = self.config.storeCookies(self.headers, self.urlparts[1])
+                out = self.consumer.config.storeCookies(self.headers, self.urlparts[1])
                for h in out:
                    self.add_info(h)
            if response.status >= 200:
@ -335,14 +332,16 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
        if self.url_connection:
            self.close_connection()
        self.url_connection = self.get_http_object(host, scheme)
-        url = urlparse.urlunsplit(self.urlparts)
        if self.no_anchor:
-            qurlparts[4] = ''
+            anchor = ''
+        else:
+            anchor = self.urlparts[4]
        if self.proxy:
-            path = urlparse.urlunsplit(self.urlparts)
+            path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
+                                 self.urlparts[2], self.urlparts[3], anchor))
        else:
            path = urlparse.urlunsplit(('', '', self.urlparts[2],
-                                        self.urlparts[3], self.urlparts[4]))
+                                        self.urlparts[3], anchor))
        self.url_connection.putrequest(self.method, path, skip_host=True)
        self.url_connection.putheader("Host", host)
        # userinfo is from http://user:pass@host/
@ -360,8 +359,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                                      linkcheck.configuration.UserAgent)
        self.url_connection.putheader("Accept-Encoding",
                                  "gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
-        if self.config['cookies']:
-            self.cookies = self.config.getCookies(self.urlparts[1],
+        if self.consumer.config['cookies']:
+            self.cookies = self.consumer.config.getCookies(self.urlparts[1],
                                                  self.urlparts[2])
            for c in self.cookies:
                self.url_connection.putheader("Cookie", c)
@ -375,7 +374,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
            h = linkcheck.httplib2.HTTPSConnection(host)
        else:
            raise linkcheck.LinkCheckerError("invalid url scheme %s" % scheme)
-        if self.config.get("debug"):
+        if self.consumer.config.get("debug"):
            h.set_debuglevel(1)
        h.connect()
        return h
@ -447,15 +446,3 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
    def get_robots_txt_url (self):
        return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])

-    def robots_txt_allows_url (self):
-        roboturl = self.get_robots_txt_url()
-        linkcheck.log.debug(linkcheck.LOG_CHECK, "robots.txt url %r",
-                            roboturl)
-        linkcheck.log.debug(linkcheck.LOG_CHECK, "url %r", self.url)
-        if not self.config.robots_txt_cache_has_key(roboturl):
-            rp = linkcheck.robotparser2.RobotFileParser()
-            rp.set_url(roboturl)
-            rp.read()
-            self.config.robots_txt_cache_set(roboturl, rp)
-        rp = self.config.robots_txt_cache_get(roboturl)
-        return rp.can_fetch(linkcheck.configuration.UserAgent, self.url)
--- a/linkcheck/checker/ignoredurl.py
+++ b/linkcheck/checker/ignoredurl.py
@ -25,7 +25,7 @@ class IgnoredUrl (urlbase.UrlBase):

    def local_check (self):
        self.add_warning(_("%s url ignored")%self.scheme.capitalize())
-        self.log_me()
+        self.consumer.logger_new_url(self)

    def can_get_content (self):
        return False
--- a/linkcheck/checker/mailtourl.py
+++ b/linkcheck/checker/mailtourl.py
@ -16,7 +16,6 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

-import re
 import sys
 import cgi
 import urllib
--- a/linkcheck/checker/nntpurl.py
+++ b/linkcheck/checker/nntpurl.py
@ -46,7 +46,7 @@ class NntpUrl (urlbase.UrlBase):
        linkcheck.log.debug(linkcheck.LOG_CHECK, self.urlparts)

    def check_connection (self):
-        nntpserver = self.urlparts[1] or self.config["nntpserver"]
+        nntpserver = self.urlparts[1] or self.consumer.config["nntpserver"]
        if not nntpserver:
            self.add_warning(_("No NNTP server specified, skipping this URL"))
            return
--- a/linkcheck/checker/telneturl.py
+++ b/linkcheck/checker/telneturl.py
@ -49,14 +49,14 @@ class TelnetUrl (urlconnect.UrlConnect):
    def local_check (self):
        if not self.host:
            self.set_result(_("Host is empty"), valid=False)
-            self.log_me()
+            self.consumer.logger_new_url(self)
            return
        super(TelnetUrl, self).local_check()

    def check_connection (self):
        super(TelnetUrl, self).check_connection()
        self.url_connection = telnetlib.Telnet()
-        if self.config.get("debug"):
+        if self.consumer.config.get("debug"):
            self.url_connection.set_debuglevel(1)
        self.url_connection.open(self.host, self.port)
        if self.user:
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@ -79,7 +79,7 @@ def print_app_info ():
 class UrlBase (object):
    """An URL with additional information like validity etc."""

-    def __init__ (self, base_url, recursion_level, config,
+    def __init__ (self, base_url, recursion_level, consumer,
                  parent_url = None, base_ref = None,
                  line = 0, column = 0, name = ""):
        """Initialize check data, and store given variables.
@ -100,8 +100,9 @@ class UrlBase (object):
        self.parent_url = parent_url
        self.anchor = None
        self.recursion_level = recursion_level
-        self.config = config
+        self.consumer = consumer
        self.result = ""
+        self.cached = False
        self.valid = True
        self.warning = linkcheck.containers.SetList()
        self.info = linkcheck.containers.SetList()
@ -111,7 +112,6 @@ class UrlBase (object):
        self.dltime = -1
        self.dlsize = -1
        self.checktime = 0
-        self.cached = False
        self.url_connection = None
        self.extern = (1, 0)
        self.data = None
@ -169,6 +169,7 @@ class UrlBase (object):
        self.info.extend(cache_data["info"])
        self.valid = cache_data["valid"]
        self.dltime = cache_data["dltime"]
+        self.cached = True

    def get_cache_data (self):
        """return all data values that should be put in the cache"""
@ -186,13 +187,12 @@ class UrlBase (object):
        return [key]

    def is_cached (self):
-        key = self.get_cache_key()
-        return self.cached or self.config.url_seen_has_key(key)
+        return self.consumer.cache.url_is_cached(self.get_cache_key())

    def get_cache_key (self):
        # note: the host is already lowercase
        if self.urlparts:
-            if self.config["anchorcaching"]:
+            if self.consumer.config["anchorcaching"]:
                # do not ignore anchor
                return urlparse.urlunsplit(self.urlparts)
            else:
@ -200,16 +200,6 @@ class UrlBase (object):
                return urlparse.urlunsplit(self.urlparts[:4]+[''])
        return None

-    def put_in_cache (self):
-        """put url data into cache"""
-        if self.is_cached():
-            # another thread was faster and cached this url already
-            return
-        data = self.get_cache_data()
-        for key in self.get_cache_keys():
-            self.config.url_cache_set(key, data)
-            self.config.url_seen_set(key)
-
    def build_url (self):
        # make url absolute
        if self.base_ref:
@ -236,14 +226,6 @@ class UrlBase (object):
        # save anchor for later checking
        self.anchor = self.urlparts[4]

-    def log_me (self):
-        """announce the url data as checked to the configured loggers"""
-        linkcheck.log.debug(linkcheck.LOG_CHECK, "logging url")
-        self.config.increment_linknumber()
-        if self.config["verbose"] or not self.valid or \
-           (self.warning and self.config["warnings"]):
-            self.config.logger_new_url(self)
-
    def check (self):
        try:
            self.local_check()
@ -260,28 +242,30 @@ class UrlBase (object):

    def local_check (self):
        linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
-        if self.recursion_level and self.config['wait']:
+        if self.recursion_level and self.consumer.config['wait']:
            linkcheck.log.debug(linkcheck.LOG_CHECK,
-                            "sleeping for %d seconds", self.config['wait'])
-            time.sleep(self.config['wait'])
+                            "sleeping for %d seconds", self.consumer.config['wait'])
+            time.sleep(self.consumer.config['wait'])
        t = time.time()
-        if not self.check_cache():
+        if self.consumer.cache.check_cache(self):
+            # was cached from previous queue member
+            self.consumer.logger_new_url(self)
            return
        # apply filter
        linkcheck.log.debug(linkcheck.LOG_CHECK, "extern=%s", self.extern)
-        if self.extern[0] and (self.config["strict"] or self.extern[1]):
+        if self.extern[0] and (self.consumer.config["strict"] or self.extern[1]):
            self.add_warning(
                  _("outside of domain filter, checked only syntax"))
-            self.log_me()
+            self.consumer.logger_new_url(self)
            return

        # check connection
        linkcheck.log.debug(linkcheck.LOG_CHECK, "checking connection")
        try:
            self.check_connection()
-            if self.cached:
+            if self.is_cached():
                return
-            if self.config["anchors"]:
+            if self.consumer.config["anchors"]:
                self.check_anchors()
        except tuple(linkcheck.checker.ExcList):
            etype, evalue, etb = sys.exc_info()
@ -296,7 +280,7 @@ class UrlBase (object):
            self.set_result(str(evalue), valid=False)

        # check content
-        warningregex = self.config["warningregex"]
+        warningregex = self.consumer.config["warningregex"]
        if warningregex and self.valid:
            linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content")
            try:
@ -323,40 +307,37 @@ class UrlBase (object):
                            valid=False)
        # close
        self.close_connection()
-        self.log_me()
+        self.consumer.logger_new_url(self)
        linkcheck.log.debug(linkcheck.LOG_CHECK, "caching")
-        self.put_in_cache()
+        self.consumer.cache.url_data_cache_add(self)

    def check_syntax (self):
+        """Called before self.check(), this function inspects the
+           url syntax. Success enables further checking, failure
+           immediately logs this url. This syntax check must not
+           use any network resources.
+        """
        linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
        if not self.base_url:
            self.set_result(_("URL is empty"), valid=False)
-            self.log_me()
+            self.consumer.logger_new_url(self)
            return False
        if ws_at_start_or_end(self.base_url):
+            # leading or trailing whitespace is common, so make a
+            # separate error message for this
            self.set_result(_("URL has whitespace at beginning or end"),
                            valid=False)
-            self.log_me()
+            self.consumer.logger_new_url(self)
            return False
        try:
            self.build_url()
            self.extern = self._get_extern()
        except linkcheck.LinkCheckerError, msg:
            self.set_result(str(msg), valid=False)
-            self.log_me()
+            self.consumer.logger_new_url(self)
            return False
        return True

-    def check_cache (self):
-        linkcheck.log.debug(linkcheck.LOG_CHECK, "checking cache")
-        for key in self.get_cache_keys():
-            if self.config.url_cache_has_key(key):
-                self.copy_from_cache(self.config.url_cache_get(key))
-                self.cached = True
-                self.log_me()
-                return False
-        return True
-
    def close_connection (self):
        """close an opened url connection"""
        # brute force closing
@ -379,8 +360,8 @@ class UrlBase (object):
            self.is_parseable() and \
            self.can_get_content() and \
            not self.is_cached() and \
-            (self.config["recursionlevel"] < 0 or
-             self.recursion_level < self.config["recursionlevel"]) and \
+            (self.consumer.config["recursionlevel"] < 0 or
+             self.recursion_level < self.consumer.config["recursionlevel"]) and \
            not self.extern[0] and self.content_allows_robots()

    def content_allows_robots (self):
@ -418,19 +399,19 @@ class UrlBase (object):
        self.add_warning(_("anchor #%s not found") % self.anchor)

    def _get_extern (self):
-        if not (self.config["externlinks"] or self.config["internlinks"]):
+        if not (self.consumer.config["externlinks"] or self.consumer.config["internlinks"]):
            return (0, 0)
        # deny and allow external checking
        linkcheck.log.debug(linkcheck.LOG_CHECK, "Url %r", self.url)
-        if self.config["denyallow"]:
-            for entry in self.config["externlinks"]:
+        if self.consumer.config["denyallow"]:
+            for entry in self.consumer.config["externlinks"]:
                linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
                                    entry)
                match = entry['pattern'].search(self.url)
                if (entry['negate'] and not match) or \
                   (match and not entry['negate']):
                    return (1, entry['strict'])
-            for entry in self.config["internlinks"]:
+            for entry in self.consumer.config["internlinks"]:
                linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
                                    entry)
                match = entry['pattern'].search(self.url)
@ -439,14 +420,14 @@ class UrlBase (object):
                    return (0, 0)
            return (0, 0)
        else:
-            for entry in self.config["internlinks"]:
+            for entry in self.consumer.config["internlinks"]:
                linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
                                    entry)
                match = entry['pattern'].search(self.url)
                if (entry['negate'] and not match) or \
                   (match and not entry['negate']):
                    return (0, 0)
-            for entry in self.config["externlinks"]:
+            for entry in self.consumer.config["externlinks"]:
                linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
                                    entry)
                match = entry['pattern'].search(self.url)
@ -482,7 +463,7 @@ class UrlBase (object):
    def check_size (self):
        """if a maximum size was given, call this function to check it
           against the content size of this url"""
-        maxbytes = self.config["warnsizebytes"]
+        maxbytes = self.consumer.config["warnsizebytes"]
        if maxbytes is not None and self.dlsize >= maxbytes:
            self.add_warning(_("Content size %s is larger than %s") % \
                         (linkcheck.strformat.strsize(self.dlsize),
@ -497,7 +478,7 @@ class UrlBase (object):
        self.parse_html()

    def get_user_password (self):
-        for auth in self.config["authentication"]:
+        for auth in self.consumer.config["authentication"]:
            if auth['pattern'].match(self.url):
                return auth['user'], auth['password']
        return None, None
@ -535,10 +516,10 @@ class UrlBase (object):
                base = base_ref
            linkcheck.log.debug(linkcheck.LOG_CHECK, "Put url %r in queue",
                                url)
-            self.config.append_url(linkcheck.checker.get_url_from(url,
-                                  self.recursion_level+1, self.config,
-                                  parent_url=self.url, base_ref=base,
-                                  line=line, column=column, name=name))
+            self.consumer.append_url(linkcheck.checker.get_url_from(url,
+                           self.recursion_level+1, self.consumer,
+                           parent_url=self.url, base_ref=base,
+                           line=line, column=column, name=name))

    def parse_opera (self):
        """parse an opera bookmark file"""
@ -553,8 +534,9 @@ class UrlBase (object):
            elif line.startswith("URL="):
                url = line[4:]
                if url:
-                    self.config.append_url(linkcheck.checker.get_url_from(url,
-           self.recursion_level+1, self.config, self.url, None, lineno, name))
+                    self.consumer.append_url(linkcheck.checker.get_url_from(url,
+                       self.recursion_level+1, self.consumer,
+                       self.url, None, lineno, name))
                name = ""

    def parse_text (self):
@ -567,9 +549,9 @@ class UrlBase (object):
            lineno += 1
            line = line.strip()
            if not line or line.startswith('#'): continue
-            self.config.append_url(
+            self.consumer.append_url(
                  linkcheck.checker.get_url_from(line, self.recursion_level+1,
-                               self.config, parent_url=self.url, line=lineno))
+                   self.consumer, parent_url=self.url, line=lineno))

    def parse_css (self):
        """parse a CSS file for url() patterns"""
@ -578,9 +560,9 @@ class UrlBase (object):
            lineno += 1
            for mo in linkcheck.linkparse.css_url_re.finditer(line):
                column = mo.start("url")
-                self.config.append_url(
+                self.consumer.append_url(
                             linkcheck.checker.get_url_from(mo.group("url"),
-                             self.recursion_level+1, self.config,
+                             self.recursion_level+1, self.consumer,
                             parent_url=self.url, line=lineno, column=column))

    def __str__ (self):
@ -590,7 +572,6 @@ class UrlBase (object):
            "base_url=%s" % self.base_url,
            "parent_url=%s" % self.parent_url,
            "base_ref=%s" % self.base_ref,
-            "cached=%s" % self.cached,
            "recursion_level=%s" % self.recursion_level,
            "url_connection=%s" % self.url_connection,
            "line=%s" % self.line,
--- a/linkcheck/checker/urlconnect.py
+++ b/linkcheck/checker/urlconnect.py
@ -27,9 +27,9 @@ from linkcheck.i18n import _
 class UrlConnect (urlbase.UrlBase):
    """Url link for which we have to connect to a specific host"""

-    def __init__ (self, base_url, recursion_level, config, parent_url=None,
-                  base_ref=None, line=0, column=0, name=""):
-        super(UrlConnect, self).__init__(base_url, recursion_level, config,
+    def __init__ (self, base_url, recursion_level, consumer,
+                  parent_url=None, base_ref=None, line=0, column=0, name=""):
+        super(UrlConnect, self).__init__(base_url, recursion_level, consumer,
                    parent_url=parent_url, base_ref=base_ref,
                    line=line, column=column, name=name)
        self.host = None