From f2e7ca6040aeaa6fc182dce83eb1fbbd9a592d8f Mon Sep 17 00:00:00 2001
From: calvin <calvin@e7d03fd6-7b0d-0410-9947-9c21f3af8025>
Date: Thu, 19 Aug 2004 21:35:47 +0000
Subject: [PATCH] split off cache and url consumer routines into separate
 classes

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1432 e7d03fd6-7b0d-0410-9947-9c21f3af8025
---
 linkcheck/checker/__init__.py   |  76 ++++++-------
 linkcheck/checker/consumer.py   | 183 ++++++++++++++++++++++++++++++++
 linkcheck/checker/fileurl.py    |  11 +-
 linkcheck/checker/ftpurl.py     |   6 +-
 linkcheck/checker/httpsurl.py   |   2 +-
 linkcheck/checker/httpurl.py    |  59 ++++------
 linkcheck/checker/ignoredurl.py |   2 +-
 linkcheck/checker/mailtourl.py  |   1 -
 linkcheck/checker/nntpurl.py    |   2 +-
 linkcheck/checker/telneturl.py  |   4 +-
 linkcheck/checker/urlbase.py    | 119 +++++++++------------
 linkcheck/checker/urlconnect.py |   6 +-
 12 files changed, 303 insertions(+), 168 deletions(-)
 create mode 100644 linkcheck/checker/consumer.py

diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py
index 6aa53af8..7f5da75c 100644
--- a/linkcheck/checker/__init__.py
+++ b/linkcheck/checker/__init__.py
@@ -137,55 +137,42 @@ acap        # application configuration access protocol
 ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
 
 
-def print_status (config, curtime, start_time):
-    """print check status looking at url queues"""
-    tocheck = len(config.urls)
-    links = config['linknumber']
-    active = config.threader.active_threads()
-    duration = linkcheck.strformat.strduration(curtime - start_time)
-    print >> sys.stderr, \
-     _("%5d urls queued, %4d links checked, %2d active threads, runtime %s")\
-     % (tocheck, links, active, duration)
-
-
 # main check function
-def check_urls (config):
+def check_urls (consumer):
     """Gets a complete configuration object as parameter where all
        runtime-dependent options are stored. If you call this function
        more than once, you can specify different configurations.
-
-       In the config object there are functions to get new URLs to check,
-       and to perform the actual checking.
     """
-    config.logger_start_output()
     try:
-        start_time = time.time()
-        status_time = start_time
-        while True:
-            if config.has_more_urls():
-                config.check_url(config.get_url())
-            elif config.finished():
-                break
-            else:
-                # active connections are downloading/parsing, so
-                # wait a little
-                time.sleep(0.1)
-            if config['status']:
-                curtime = time.time()
-                if (curtime - status_time) > 5:
-                    print_status(config, curtime, start_time)
-                    status_time = curtime
-        config.logger_end_output()
+        _check_urls(consumer)
     except KeyboardInterrupt:
-        config.finish()
-        config.logger_end_output()
-        active = config.threader.active_threads()
+        consumer.finish()
         linkcheck.log.warn(linkcheck.LOG_CHECK,
              _("keyboard interrupt; waiting for %d active threads to finish"),
-             active)
+             consumer.active_threads())
         raise
 
 
+def _check_urls (consumer):
+    consumer.logger_start_output()
+    start_time = time.time()
+    status_time = start_time
+    while not consumer.finished():
+        url = consumer.get_url()
+        if url is not None:
+            consumer.check_url(url)
+        else:
+            # active connections are downloading/parsing, so
+            # wait a little
+            time.sleep(0.1)
+        if consumer.config['status']:
+            curtime = time.time()
+            if (curtime - status_time) > 5:
+                consumer.print_status(curtime, start_time)
+                status_time = curtime
+    consumer.logger_end_output()
+
+
 # file extensions we can parse recursively
 extensions = {
     "html": re.compile(r'(?i)\.s?html?$'),
@@ -237,9 +224,9 @@ def absolute_url (base_url, base_ref, parent_url):
     return ""
 
 
-def get_url_from (base_url, recursion_level, config, parent_url=None,
-                  base_ref=None, line=0, column=0, name=None,
-                  cmdline=None):
+def get_url_from (base_url, recursion_level, consumer,
+                  parent_url=None, base_ref=None, line=0, column=0,
+                  name=None, cmdline=None):
     """get url data from given base data"""
     if cmdline and linkcheck.url.url_needs_quoting(base_url):
         base_url = linkcheck.url.url_quote(base_url)
@@ -269,9 +256,10 @@ def get_url_from (base_url, recursion_level, config, parent_url=None,
     # assume local file
     else:
         klass = linkcheck.checker.fileurl.FileUrl
-    if cmdline and url and config['strict'] and \
-       not (config['internlinks'] or config['externlinks']):
+    if cmdline and url and consumer.config['strict'] and \
+       not (consumer.config['internlinks'] or consumer.config['externlinks']):
         # set automatic intern/extern stuff if no filter was given
-        set_intern_url(url, klass, config)
-    return klass(base_url, recursion_level, config, parent_url, base_ref,
+        set_intern_url(url, klass, consumer.config)
+    return klass(base_url, recursion_level, consumer,
+                 parent_url=parent_url, base_ref=base_ref,
                  line=line, column=column, name=name)
diff --git a/linkcheck/checker/consumer.py b/linkcheck/checker/consumer.py
new file mode 100644
index 00000000..df2f353e
--- /dev/null
+++ b/linkcheck/checker/consumer.py
@@ -0,0 +1,183 @@
+# -*- coding: iso-8859-1 -*-
+"""url consumer class"""
+# Copyright (C) 2000-2004  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+import sys
+try:
+    import threading
+except ImportError:
+    import dummy_threading as threading
+
+import linkcheck.threader
+
+from linkcheck.i18n import _
+
+class Consumer (object):
+    """consume urls from the url queue in a threaded manner"""
+
+    def __init__ (self, config, cache):
+        """initialize consumer data and threads"""
+        self.config = config
+        self.cache = cache
+        self.urls = []
+        self.threader = linkcheck.threader.Threader()
+        self._set_threads(config['threads'])
+        self.logger = config['logger']
+        self.fileoutput = config['fileoutput']
+        self.linknumber = 0
+        # one lock for the data
+        self.lock = threading.Lock()
+
+    def filter_url_queue (self):
+        """remove already cached urls from queue"""
+        pass # deadlock!
+        #self.lock.acquire()
+        #try:
+        #    urls = []
+        #    for url_data in self.urls:
+        #        if self.cache.check_cache(url_data):
+        #            self.logger_new_url(url_data)
+        #        else:
+        #            urls.append(url_data)
+        #    self.urls = urls
+        #    print >> sys.stderr, \
+        #      _("removed %d cached urls from incoming queue") % len(removed)
+        #finally:
+        #    self.lock.release()
+
+    def _set_threads (self, num):
+        """set number of checker threads to start"""
+        linkcheck.log.debug(linkcheck.LOG_CHECK,
+                            "set threading with %d threads", num)
+        self.threader.threads_max = num
+        if num > 0:
+            sys.setcheckinterval(50)
+        else:
+            sys.setcheckinterval(100)
+
+    def check_url (self, url_data):
+        """start new thread checking the given url"""
+        self.threader.start_thread(url_data.check, ())
+
+    def append_url (self, url_data):
+        """add new url to list of urls to check"""
+        # check syntax
+        if not url_data.check_syntax():
+            # wrong syntax, do not check any further
+            return
+        # check the cache
+        if self.cache.check_cache(url_data):
+            # already cached
+            self.logger_new_url(url_data)
+            return
+        self.lock.acquire()
+        try:
+            self.urls.append(url_data)
+        finally:
+            self.lock.release()
+
+    def finished (self):
+        """return True if checking is finished"""
+        self.lock.acquire()
+        try:
+            return self.threader.finished() and len(self.urls) <= 0
+        finally:
+            self.lock.release()
+
+    def get_url (self):
+        """get first url in queue and return it"""
+        self.lock.acquire()
+        try:
+            if not self.urls:
+                return None
+            u = self.urls[0]
+            del self.urls[0]
+            return u
+        finally:
+            self.lock.release()
+
+    def finish (self):
+        """finish checking and send end-of-output message to logger"""
+        self.lock.acquire()
+        try:
+            self.threader.finish()
+        finally:
+            self.lock.release()
+        self.logger_end_output()
+
+    def print_status (self, curtime, start_time):
+        """print check status looking at url queues"""
+        self.lock.acquire()
+        try:
+            active = self.threader.active_threads()
+            links = self.linknumber
+            tocheck = len(self.urls)
+            duration = linkcheck.strformat.strduration(curtime - start_time)
+            print >> sys.stderr, _("%5d urls queued, %4d links checked, "\
+                                   "%2d active threads, runtime %s")\
+                                 % (tocheck, links, active, duration)
+        finally:
+            self.lock.release()
+
+    def logger_start_output (self):
+        """start output of all configured loggers"""
+        self.lock.acquire()
+        try:
+            if not self.config['quiet']:
+                self.logger.start_output()
+            for logger in self.fileoutput:
+                logger.start_output()
+        finally:
+            self.lock.release()
+
+    def logger_new_url (self, url_data):
+        """send new url to all configured loggers"""
+        self.lock.acquire()
+        try:
+            self.linknumber += 1
+            do_filter = (self.linknumber % 1000) == 0
+            if not self.config['quiet'] and \
+              (self.config["verbose"] or not url_data.valid or
+               (url_data.warning and self.config["warnings"])):
+                self.logger.new_url(url_data)
+            for log in self.fileoutput:
+                log.new_url(url_data)
+        finally:
+            self.lock.release()
+        # XXX deadlock: filtering calls logger_new_url() while holding self.lock
+        #if do_filter:
+        #    self.filter_queue(self)
+
+    def logger_end_output (self):
+        """end output of all configured loggers"""
+        self.lock.acquire()
+        try:
+            if not self.config['quiet']:
+                self.logger.end_output(linknumber=self.linknumber)
+            for logger in self.fileoutput:
+                logger.end_output(linknumber=self.linknumber)
+        finally:
+            self.lock.release()
+
+    def active_threads (self):
+        """return number of active threads"""
+        self.lock.acquire()
+        try:
+            return self.threader.active_threads()
+        finally:
+            self.lock.release()
+
diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py
index bd19ce6d..7ead2496 100644
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@@ -53,15 +53,12 @@ def get_index_html (dirname):
 class FileUrl (urlbase.UrlBase):
     "Url link with file scheme"
 
-    def __init__ (self,
-                  base_url,
-                  config,
-                  recursion_level,
+    def __init__ (self, base_url, recursion_level, consumer,
                   parent_url = None,
                   base_ref = None, line=0, column=0, name=""):
-        super(FileUrl, self).__init__(base_url, config, recursion_level,
-                                    parent_url=parent_url, base_ref=base_ref,
-                                    line=line, column=column, name=name)
+        super(FileUrl, self).__init__(base_url, recursion_level, consumer,
+             parent_url=parent_url, base_ref=base_ref,
+             line=line, column=column, name=name)
         if not (parent_url or base_ref or self.base_url.startswith("file:")):
             self.base_url = os.path.expanduser(self.base_url)
             if not self.base_url.startswith("/"):
diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py
index 1f0fe37d..97f80b2a 100644
--- a/linkcheck/checker/ftpurl.py
+++ b/linkcheck/checker/ftpurl.py
@@ -32,11 +32,11 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
 
     def check_connection (self):
         # proxy support (we support only http)
-        self.set_proxy(self.config["proxy"].get(self.scheme))
+        self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
         if self.proxy:
             http = httpurl.HttpUrl(self.base_url,
                   self.recursion_level,
-                  self.config,
+                  self.consumer.config,
                   parent_url=self.parent_url,
                   base_ref=self.base_ref,
                   line=self.line,
@@ -80,7 +80,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
         # ready to connect
         try:
             self.url_connection = ftplib.FTP()
-            if self.config.get("debug"):
+            if self.consumer.config.get("debug"):
                 self.url_connection.set_debuglevel(1)
             self.url_connection.connect(self.urlparts[1])
             self.url_connection.login(_user, _password)
diff --git a/linkcheck/checker/httpsurl.py b/linkcheck/checker/httpsurl.py
index b02e798e..5510880f 100644
--- a/linkcheck/checker/httpsurl.py
+++ b/linkcheck/checker/httpsurl.py
@@ -28,4 +28,4 @@ class HttpsUrl (httpurl.HttpUrl):
             super(HttpsUrl, self).local_check()
         else:
             self.add_warning(_("%s url ignored")%self.scheme.capitalize())
-            self.log_me()
+            self.consumer.logger_new_url(self)
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 57daee50..54fae783 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -46,11 +46,11 @@ _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
 class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
     "Url link with http scheme"
 
-    def __init__ (self, base_url, recursion_level, config, parent_url=None,
-                  base_ref=None, line=0, column=0, name=""):
-        super(HttpUrl, self).__init__(base_url, recursion_level, config,
-                         parent_url=parent_url, base_ref=base_ref, line=line,
-                         column=column, name=name)
+    def __init__ (self, base_url, recursion_level, consumer,
+                  parent_url=None, base_ref=None, line=0, column=0, name=""):
+        super(HttpUrl, self).__init__(base_url, recursion_level, consumer,
+               parent_url=parent_url, base_ref=base_ref, line=line,
+               column=column, name=name)
         self.aliases = []
         self.max_redirects = 5
         self.has301status = False
@@ -109,13 +109,13 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
         | extension-code
         """
         # set the proxy, so a 407 status after this is an error
-        self.set_proxy(self.config["proxy"].get(self.scheme))
+        self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
         if self.proxy:
             self.add_info(_("Using Proxy %r") % self.proxy)
         self.headers = None
         self.auth = None
         self.cookies = []
-        if not self.robots_txt_allows_url():
+        if not self.consumer.cache.robots_txt_allows_url(self):
             self.add_warning(
                        _("Access denied by robots.txt, checked only syntax"))
             return
@@ -235,6 +235,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                 self.set_result(
                      _("recursive redirection encountered:\n %s") % \
                             "\n  => ".join(redirect_cache), valid=False)
+                self.consumer.logger_new_url(self)
                 return -1, response
             redirect_cache.append(redirected)
             # remember this alias
@@ -252,11 +253,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                     self.has301status = True
                 self.aliases.append(redirected)
             # check cache again on possibly changed URL
-            key = self.get_cache_key()
-            if self.config.url_cache_has_key(key):
-                self.copy_from_cache(self.config.url_cache_get(key))
-                self.cached = True
-                self.log_me()
+            if self.consumer.cache.check_cache(self):
+                self.consumer.logger_new_url(self)
                 return -1, response
             # check if we still have a http url, it could be another
             # scheme, eg https or news
@@ -266,15 +264,14 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                              "the original url was %r.") % self.url)
                 # make new Url object
                 newobj = linkcheck.checker.get_url_from(
-                          redirected, self.recursion_level, self.config,
+                          redirected, self.recursion_level, self.consumer,
                           parent_url=self.parent_url, base_ref=self.base_ref,
                           line=self.line, column=self.column, name=self.name)
                 newobj.warning = self.warning
                 newobj.info = self.info
                 # append new object to queue
-                self.config.append_url(newobj)
+                self.consumer.append_url(newobj)
                 # pretend to be finished and logged
-                self.cached = True
                 return -1, response
             # new response data
             response = self._get_http_response()
@@ -302,10 +299,10 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                 # no content
                 self.add_warning(response.reason)
             # store cookies for valid links
-            if self.config['cookies']:
+            if self.consumer.config['cookies']:
                 for c in self.cookies:
                     self.add_info("Cookie: %s" % c)
-                out = self.config.storeCookies(self.headers, self.urlparts[1])
+                out = self.consumer.config.storeCookies(self.headers, self.urlparts[1])
                 for h in out:
                     self.add_info(h)
             if response.status >= 200:
@@ -335,14 +332,16 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
         if self.url_connection:
             self.close_connection()
         self.url_connection = self.get_http_object(host, scheme)
-        url = urlparse.urlunsplit(self.urlparts)
         if self.no_anchor:
-            qurlparts[4] = ''
+            anchor = ''
+        else:
+            anchor = self.urlparts[4]
         if self.proxy:
-            path = urlparse.urlunsplit(self.urlparts)
+            path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
+                                 self.urlparts[2], self.urlparts[3], anchor))
         else:
             path = urlparse.urlunsplit(('', '', self.urlparts[2],
-                                        self.urlparts[3], self.urlparts[4]))
+                                        self.urlparts[3], anchor))
         self.url_connection.putrequest(self.method, path, skip_host=True)
         self.url_connection.putheader("Host", host)
         # userinfo is from http://user@pass:host/
@@ -360,8 +359,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                                       linkcheck.configuration.UserAgent)
         self.url_connection.putheader("Accept-Encoding",
                                   "gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
-        if self.config['cookies']:
-            self.cookies = self.config.getCookies(self.urlparts[1],
+        if self.consumer.config['cookies']:
+            self.cookies = self.consumer.config.getCookies(self.urlparts[1],
                                                   self.urlparts[2])
             for c in self.cookies:
                 self.url_connection.putheader("Cookie", c)
@@ -375,7 +374,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
             h = linkcheck.httplib2.HTTPSConnection(host)
         else:
             raise linkcheck.LinkCheckerError("invalid url scheme %s" % scheme)
-        if self.config.get("debug"):
+        if self.consumer.config.get("debug"):
             h.set_debuglevel(1)
         h.connect()
         return h
@@ -447,15 +446,3 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
     def get_robots_txt_url (self):
         return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])
 
-    def robots_txt_allows_url (self):
-        roboturl = self.get_robots_txt_url()
-        linkcheck.log.debug(linkcheck.LOG_CHECK, "robots.txt url %r",
-                            roboturl)
-        linkcheck.log.debug(linkcheck.LOG_CHECK, "url %r", self.url)
-        if not self.config.robots_txt_cache_has_key(roboturl):
-            rp = linkcheck.robotparser2.RobotFileParser()
-            rp.set_url(roboturl)
-            rp.read()
-            self.config.robots_txt_cache_set(roboturl, rp)
-        rp = self.config.robots_txt_cache_get(roboturl)
-        return rp.can_fetch(linkcheck.configuration.UserAgent, self.url)
diff --git a/linkcheck/checker/ignoredurl.py b/linkcheck/checker/ignoredurl.py
index 6c7d58f3..2bf8323c 100644
--- a/linkcheck/checker/ignoredurl.py
+++ b/linkcheck/checker/ignoredurl.py
@@ -25,7 +25,7 @@ class IgnoredUrl (urlbase.UrlBase):
 
     def local_check (self):
         self.add_warning(_("%s url ignored")%self.scheme.capitalize())
-        self.log_me()
+        self.consumer.logger_new_url(self)
 
     def can_get_content (self):
         return False
diff --git a/linkcheck/checker/mailtourl.py b/linkcheck/checker/mailtourl.py
index 7e289e9b..770445b5 100644
--- a/linkcheck/checker/mailtourl.py
+++ b/linkcheck/checker/mailtourl.py
@@ -16,7 +16,6 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 
-import re
 import sys
 import cgi
 import urllib
diff --git a/linkcheck/checker/nntpurl.py b/linkcheck/checker/nntpurl.py
index 12c03b5d..9fc77c13 100644
--- a/linkcheck/checker/nntpurl.py
+++ b/linkcheck/checker/nntpurl.py
@@ -46,7 +46,7 @@ class NntpUrl (urlbase.UrlBase):
         linkcheck.log.debug(linkcheck.LOG_CHECK, self.urlparts)
 
     def check_connection (self):
-        nntpserver = self.urlparts[1] or self.config["nntpserver"]
+        nntpserver = self.urlparts[1] or self.consumer.config["nntpserver"]
         if not nntpserver:
             self.add_warning(_("No NNTP server specified, skipping this URL"))
             return
diff --git a/linkcheck/checker/telneturl.py b/linkcheck/checker/telneturl.py
index 2781ee43..fb8ed8dd 100644
--- a/linkcheck/checker/telneturl.py
+++ b/linkcheck/checker/telneturl.py
@@ -49,14 +49,14 @@ class TelnetUrl (urlconnect.UrlConnect):
     def local_check (self):
         if not self.host:
             self.set_result(_("Host is empty"), valid=False)
-            self.log_me()
+            self.consumer.logger_new_url(self)
             return
         super(TelnetUrl, self).local_check()
 
     def check_connection (self):
         super(TelnetUrl, self).check_connection()
         self.url_connection = telnetlib.Telnet()
-        if self.config.get("debug"):
+        if self.consumer.config.get("debug"):
             self.url_connection.set_debuglevel(1)
         self.url_connection.open(self.host, self.port)
         if self.user:
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index ffed9da9..2e29c058 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -79,7 +79,7 @@ def print_app_info ():
 class UrlBase (object):
     """An URL with additional information like validity etc."""
 
-    def __init__ (self, base_url, recursion_level, config,
+    def __init__ (self, base_url, recursion_level, consumer,
                   parent_url = None, base_ref = None,
                   line = 0, column = 0, name = ""):
         """Initialize check data, and store given variables.
@@ -100,8 +100,9 @@ class UrlBase (object):
         self.parent_url = parent_url
         self.anchor = None
         self.recursion_level = recursion_level
-        self.config = config
+        self.consumer = consumer
         self.result = ""
+        self.cached = False
         self.valid = True
         self.warning = linkcheck.containers.SetList()
         self.info = linkcheck.containers.SetList()
@@ -111,7 +112,6 @@ class UrlBase (object):
         self.dltime = -1
         self.dlsize = -1
         self.checktime = 0
-        self.cached = False
         self.url_connection = None
         self.extern = (1, 0)
         self.data = None
@@ -169,6 +169,7 @@ class UrlBase (object):
         self.info.extend(cache_data["info"])
         self.valid = cache_data["valid"]
         self.dltime = cache_data["dltime"]
+        self.cached = True
 
     def get_cache_data (self):
         """return all data values that should be put in the cache"""
@@ -186,13 +187,12 @@ class UrlBase (object):
         return [key]
 
     def is_cached (self):
-        key = self.get_cache_key()
-        return self.cached or self.config.url_seen_has_key(key)
+        return self.consumer.cache.url_is_cached(self.get_cache_key())
 
     def get_cache_key (self):
         # note: the host is already lowercase
         if self.urlparts:
-            if self.config["anchorcaching"]:
+            if self.consumer.config["anchorcaching"]:
                 # do not ignore anchor
                 return urlparse.urlunsplit(self.urlparts)
             else:
@@ -200,16 +200,6 @@ class UrlBase (object):
                 return urlparse.urlunsplit(self.urlparts[:4]+[''])
         return None
 
-    def put_in_cache (self):
-        """put url data into cache"""
-        if self.is_cached():
-            # another thread was faster and cached this url already
-            return
-        data = self.get_cache_data()
-        for key in self.get_cache_keys():
-            self.config.url_cache_set(key, data)
-            self.config.url_seen_set(key)
-
     def build_url (self):
         # make url absolute
         if self.base_ref:
@@ -236,14 +226,6 @@ class UrlBase (object):
         # safe anchor for later checking
         self.anchor = self.urlparts[4]
 
-    def log_me (self):
-        """announce the url data as checked to the configured loggers"""
-        linkcheck.log.debug(linkcheck.LOG_CHECK, "logging url")
-        self.config.increment_linknumber()
-        if self.config["verbose"] or not self.valid or \
-           (self.warning and self.config["warnings"]):
-            self.config.logger_new_url(self)
-
     def check (self):
         try:
             self.local_check()
@@ -260,28 +242,30 @@ class UrlBase (object):
 
     def local_check (self):
         linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
-        if self.recursion_level and self.config['wait']:
+        if self.recursion_level and self.consumer.config['wait']:
             linkcheck.log.debug(linkcheck.LOG_CHECK,
-                            "sleeping for %d seconds", self.config['wait'])
-            time.sleep(self.config['wait'])
+                            "sleeping for %d seconds", self.consumer.config['wait'])
+            time.sleep(self.consumer.config['wait'])
         t = time.time()
-        if not self.check_cache():
+        if self.consumer.cache.check_cache(self):
+            # was cached from previous queue member
+            self.consumer.logger_new_url(self)
             return
         # apply filter
         linkcheck.log.debug(linkcheck.LOG_CHECK, "extern=%s", self.extern)
-        if self.extern[0] and (self.config["strict"] or self.extern[1]):
+        if self.extern[0] and (self.consumer.config["strict"] or self.extern[1]):
             self.add_warning(
                   _("outside of domain filter, checked only syntax"))
-            self.log_me()
+            self.consumer.logger_new_url(self)
             return
 
         # check connection
         linkcheck.log.debug(linkcheck.LOG_CHECK, "checking connection")
         try:
             self.check_connection()
-            if self.cached:
+            if self.is_cached():
                 return
-            if self.config["anchors"]:
+            if self.consumer.config["anchors"]:
                 self.check_anchors()
         except tuple(linkcheck.checker.ExcList):
             etype, evalue, etb = sys.exc_info()
@@ -296,7 +280,7 @@ class UrlBase (object):
             self.set_result(str(evalue), valid=False)
 
         # check content
-        warningregex = self.config["warningregex"]
+        warningregex = self.consumer.config["warningregex"]
         if warningregex and self.valid:
             linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content")
             try:
@@ -323,40 +307,37 @@ class UrlBase (object):
                             valid=False)
         # close
         self.close_connection()
-        self.log_me()
+        self.consumer.logger_new_url(self)
         linkcheck.log.debug(linkcheck.LOG_CHECK, "caching")
-        self.put_in_cache()
+        self.consumer.cache.url_data_cache_add(self)
 
     def check_syntax (self):
+        """Called before self.check(), this function inspects the
+           url syntax. Success enables further checking, failure
+           immediately logs this url. This syntax check must not
+           use any network resources.
+        """
         linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
         if not self.base_url:
             self.set_result(_("URL is empty"), valid=False)
-            self.log_me()
+            self.consumer.logger_new_url(self)
             return False
         if ws_at_start_or_end(self.base_url):
+            # leading or trailing whitespace is common, so make a
+            # separate error message for this
             self.set_result(_("URL has whitespace at beginning or end"),
                             valid=False)
-            self.log_me()
+            self.consumer.logger_new_url(self)
             return False
         try:
             self.build_url()
             self.extern = self._get_extern()
         except linkcheck.LinkCheckerError, msg:
             self.set_result(str(msg), valid=False)
-            self.log_me()
+            self.consumer.logger_new_url(self)
             return False
         return True
 
-    def check_cache (self):
-        linkcheck.log.debug(linkcheck.LOG_CHECK, "checking cache")
-        for key in self.get_cache_keys():
-            if self.config.url_cache_has_key(key):
-                self.copy_from_cache(self.config.url_cache_get(key))
-                self.cached = True
-                self.log_me()
-                return False
-        return True
-
     def close_connection (self):
         """close an opened url connection"""
         # brute force closing
@@ -379,8 +360,8 @@ class UrlBase (object):
             self.is_parseable() and \
             self.can_get_content() and \
             not self.is_cached() and \
-            (self.config["recursionlevel"] < 0 or
-             self.recursion_level < self.config["recursionlevel"]) and \
+            (self.consumer.config["recursionlevel"] < 0 or
+             self.recursion_level < self.consumer.config["recursionlevel"]) and \
             not self.extern[0] and self.content_allows_robots()
 
     def content_allows_robots (self):
@@ -418,19 +399,19 @@ class UrlBase (object):
         self.add_warning(_("anchor #%s not found") % self.anchor)
 
     def _get_extern (self):
-        if not (self.config["externlinks"] or self.config["internlinks"]):
+        if not (self.consumer.config["externlinks"] or self.consumer.config["internlinks"]):
             return (0, 0)
         # deny and allow external checking
         linkcheck.log.debug(linkcheck.LOG_CHECK, "Url %r", self.url)
-        if self.config["denyallow"]:
-            for entry in self.config["externlinks"]:
+        if self.consumer.config["denyallow"]:
+            for entry in self.consumer.config["externlinks"]:
                 linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
                                     entry)
                 match = entry['pattern'].search(self.url)
                 if (entry['negate'] and not match) or \
                    (match and not entry['negate']):
                     return (1, entry['strict'])
-            for entry in self.config["internlinks"]:
+            for entry in self.consumer.config["internlinks"]:
                 linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
                                     entry)
                 match = entry['pattern'].search(self.url)
@@ -439,14 +420,14 @@ class UrlBase (object):
                     return (0, 0)
             return (0, 0)
         else:
-            for entry in self.config["internlinks"]:
+            for entry in self.consumer.config["internlinks"]:
                 linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
                                     entry)
                 match = entry['pattern'].search(self.url)
                 if (entry['negate'] and not match) or \
                    (match and not entry['negate']):
                     return (0, 0)
-            for entry in self.config["externlinks"]:
+            for entry in self.consumer.config["externlinks"]:
                 linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
                                     entry)
                 match = entry['pattern'].search(self.url)
@@ -482,7 +463,7 @@ class UrlBase (object):
     def check_size (self):
         """if a maximum size was given, call this function to check it
            against the content size of this url"""
-        maxbytes = self.config["warnsizebytes"]
+        maxbytes = self.consumer.config["warnsizebytes"]
         if maxbytes is not None and self.dlsize >= maxbytes:
             self.add_warning(_("Content size %s is larger than %s") % \
                          (linkcheck.strformat.strsize(self.dlsize),
@@ -497,7 +478,7 @@ class UrlBase (object):
         self.parse_html()
 
     def get_user_password (self):
-        for auth in self.config["authentication"]:
+        for auth in self.consumer.config["authentication"]:
             if auth['pattern'].match(self.url):
                 return auth['user'], auth['password']
         return None, None
@@ -535,10 +516,10 @@ class UrlBase (object):
                 base = base_ref
             linkcheck.log.debug(linkcheck.LOG_CHECK, "Put url %r in queue",
                                 url)
-            self.config.append_url(linkcheck.checker.get_url_from(url,
-                                  self.recursion_level+1, self.config,
-                                  parent_url=self.url, base_ref=base,
-                                  line=line, column=column, name=name))
+            self.consumer.append_url(linkcheck.checker.get_url_from(url,
+                           self.recursion_level+1, self.consumer,
+                           parent_url=self.url, base_ref=base,
+                           line=line, column=column, name=name))
 
     def parse_opera (self):
         """parse an opera bookmark file"""
@@ -553,8 +534,9 @@ class UrlBase (object):
             elif line.startswith("URL="):
                 url = line[4:]
                 if url:
-                    self.config.append_url(linkcheck.checker.get_url_from(url,
-           self.recursion_level+1, self.config, self.url, None, lineno, name))
+                    self.consumer.append_url(linkcheck.checker.get_url_from(url,
+                       self.recursion_level+1, self.consumer,
+                       self.url, None, lineno, name))
                 name = ""
 
     def parse_text (self):
@@ -567,9 +549,9 @@ class UrlBase (object):
             lineno += 1
             line = line.strip()
             if not line or line.startswith('#'): continue
-            self.config.append_url(
+            self.consumer.append_url(
                   linkcheck.checker.get_url_from(line, self.recursion_level+1,
-                               self.config, parent_url=self.url, line=lineno))
+                   self.consumer, parent_url=self.url, line=lineno))
 
     def parse_css (self):
         """parse a CSS file for url() patterns"""
@@ -578,9 +560,9 @@ class UrlBase (object):
             lineno += 1
             for mo in linkcheck.linkparse.css_url_re.finditer(line):
                 column = mo.start("url")
-                self.config.append_url(
+                self.consumer.append_url(
                              linkcheck.checker.get_url_from(mo.group("url"),
-                             self.recursion_level+1, self.config,
+                             self.recursion_level+1, self.consumer,
                              parent_url=self.url, line=lineno, column=column))
 
     def __str__ (self):
@@ -590,7 +572,6 @@ class UrlBase (object):
             "base_url=%s" % self.base_url,
             "parent_url=%s" % self.parent_url,
             "base_ref=%s" % self.base_ref,
-            "cached=%s" % self.cached,
             "recursion_level=%s" % self.recursion_level,
             "url_connection=%s" % self.url_connection,
             "line=%s" % self.line,
diff --git a/linkcheck/checker/urlconnect.py b/linkcheck/checker/urlconnect.py
index b90d2431..35070340 100644
--- a/linkcheck/checker/urlconnect.py
+++ b/linkcheck/checker/urlconnect.py
@@ -27,9 +27,9 @@ from linkcheck.i18n import _
 class UrlConnect (urlbase.UrlBase):
     """Url link for which we have to connect to a specific host"""
 
-    def __init__ (self, base_url, recursion_level, config, parent_url=None,
-                  base_ref=None, line=0, column=0, name=""):
-        super(UrlConnect, self).__init__(base_url, recursion_level, config,
+    def __init__ (self, base_url, recursion_level, consumer,
+                  parent_url=None, base_ref=None, line=0, column=0, name=""):
+        super(UrlConnect, self).__init__(base_url, recursion_level, consumer,
                     parent_url=parent_url, base_ref=base_ref,
                     line=line, column=column, name=name)
         self.host = None