Replace the old threading algorithm with a new one based on Queue.Queue and consumer threads
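
The new design is the classic producer/consumer pattern. A minimal,
self-contained sketch of the idea (illustrative only: the real code uses
the UrlQueue class added in this commit, and the check function and URL
list below are placeholders):

    import threading
    import Queue

    def check (url):
        print "checking", url            # placeholder for the real URL check

    def worker (urlqueue):
        while True:
            url = urlqueue.get()
            if url is None:              # sentinel: no more work
                break
            check(url)

    urlqueue = Queue.Queue()
    workers = [threading.Thread(target=worker, args=(urlqueue,))
               for _ in xrange(10)]      # constant number of consumer threads
    for w in workers:
        w.start()
    for url in ["http://example.com/", "http://example.org/"]:
        urlqueue.put(url)
    for w in workers:                    # one sentinel per worker, then wait
        urlqueue.put(None)
    for w in workers:
        w.join()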

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3146 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2006-05-13 13:44:52 +00:00
parent d05c68ef74
commit f002c5f965
28 changed files with 684 additions and 326 deletions


@@ -14,6 +14,12 @@
Changed: linkcheck/HtmlParser/htmllex.[lc],
linkcheck/tests/test_parser.py
* Revamp the threading algorithm by using a URL queue, with a
constant number of consumer threads called 'workers'.
This fixes the remaining "deque mutated during iteration" errors.
Type: feature
Changed: *.py
3.4 "The Chumscrubbers" (released 4.2.2006)
* Ignore decoding errors when retrieving the robots.txt URL.


@@ -37,6 +37,7 @@ include doc/de/*.1
include doc/fr/*.1
include doc/Makefile doc/rest2htmlnav
recursive-include linkcheck/checker/tests/data *.txt *.html *.result *.asc *.css *.ico
recursive-include linkcheck/configuration/tests/data *.ini
include linkcheck/tests/*.py
include linkcheck/checker/tests/*.py
include linkcheck/dns/tests/*.py

TODO

@@ -1,7 +1,11 @@
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/483752
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/475160
- Improved print_status
- use format_time from quodlibet for times
- Ctrl-C is not really working.
- To limit the used memory, put a maximum size on the URL queue
(e.g. 20000 URLs). If reached, a worker calling queue.put() will
wait for another worker to call queue.get() before continuing.
Problem: deadlock when all workers are blocked in queue.put().
A possible workaround is sketched after this list.
- [FEATURE] postmortem debugging with pdb.pm()
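
A possible workaround for the deadlock noted above (a sketch, not
something LinkChecker does here): a worker that would block on a full
bounded queue can process the item itself instead of enqueueing it.

    import Queue

    def put_or_check (urlqueue, url_data, check):
        # Never block on a bounded queue: if it is full, process the
        # item in the calling worker instead of waiting for a get().
        try:
            urlqueue.put_nowait(url_data)
        except Queue.Full:
            check(url_data)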


@@ -58,6 +58,17 @@ class LinkCheckerError (Exception):
pass
def add_intern_pattern (url_data, config):
"""
Add intern URL regex to config.
"""
pat = url_data.get_intern_pattern()
if pat:
assert linkcheck.log.debug(LOG_CHECK,
"Add intern pattern %r from command line", pat)
config['internlinks'].append(get_link_pat(pat))
def get_link_pat (arg, strict=False):
"""
Get a link pattern matcher for intern/extern links.

linkcheck/cache/__init__.py

@@ -0,0 +1,19 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and provide cached data during checking in a thread-safe manner.
"""

linkcheck/cache/connection.py

@@ -0,0 +1,111 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and retrieve open connections.
"""
import time
import threading
from linkcheck.decorators import synchronized
# lock for robots.txt caching
_lock = threading.Lock()
class ConnectionPool (object):
"""
Thread-safe cache, storing a set of connections for URL retrieval.
"""
def __init__ (self):
"""
Initialize an empty connection dictionary which will have entries
of the form::
key -> [connection, status, expiration time]
Connection can be any open connection object (HTTP, FTP, ...).
Status is either 'available' or 'busy'.
Expiration time is the point of time in seconds when this
connection will be timed out.
The identifier key is usually a tuple (type, host, user, pass),
but it can be any immutable Python object.
"""
# open connections
# {(type, host, user, pass) -> [connection, status, expiration time]}
self.connections = {}
@synchronized(_lock)
def add (self, key, conn, timeout):
"""
Add connection to the pool with given identifier key and timeout
in seconds.
"""
self.connections[key] = [conn, 'available', time.time() + timeout]
@synchronized(_lock)
def get (self, key):
"""
Get an open connection if one is available, waiting at most
30 seconds for a busy connection to be released.
@return: Open connection object, or None if none is available.
@rtype: None or FTPConnection or HTTP(S)Connection
"""
if key not in self.connections:
# not found
return None
conn_data = self.connections[key]
t = time.time()
if t > conn_data[2]:
# timed out
try:
    # close the stale connection object (index 0), not its status
    conn_data[0].close()
except:
    # ignore close errors
    pass
del self.connections[key]
return None
# wait at most 300*0.1=30 seconds for connection to become available
for dummy in xrange(300):
if conn_data[1] != 'busy':
conn_data[1] = 'busy'
conn_data[2] = t
return conn_data[0]
time.sleep(0.1)
# connection is in use
return None
@synchronized(_lock)
def release (self, key):
"""
Mark an open and reusable connection as available.
"""
if key in self.connections:
self.connections[key][1] = 'available'
@synchronized(_lock)
def expire_connections (self):
"""
Remove expired connections from this pool.
"""
t = time.time()
to_delete = []
for key, conn_data in self.connections.iteritems():
if conn_data[1] == 'available' and t > conn_data[2]:
to_delete.append(key)
for key in to_delete:
del self.connections[key]
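
A usage sketch for the pool as defined above (the host, credentials and
60-second timeout are illustrative):

    import ftplib

    pool = ConnectionPool()
    key = ("ftp", "ftp.example.com", "anonymous", "guest")
    conn = pool.get(key)                 # None on first use or after expiry
    if conn is None:
        conn = ftplib.FTP("ftp.example.com")
        pool.add(key, conn, 60.0)        # reusable for 60 seconds
    try:
        conn.voidcmd("NOOP")             # ... use the connection ...
    finally:
        pool.release(key)                # mark it available again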

linkcheck/cache/cookie.py

@@ -0,0 +1,73 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and retrieve cookies.
"""
import threading
from linkcheck.decorators import synchronized
import linkcheck
import linkcheck.log
import linkcheck.cookies
# lock for caching
_lock = threading.Lock()
class CookieJar (object):
"""
Cookie storage, implementing the default cookie handling policy for
LinkChecker.
"""
def __init__ (self):
self.cache = {}
@synchronized(_lock)
def add (self, headers, scheme, host, path):
"""
Parse cookie values, add to cache.
"""
jar = set()
for h in headers.getallmatchingheaders("Set-Cookie"):
# RFC 2109 (Netscape) cookie type
try:
c = linkcheck.cookies.NetscapeCookie(h, scheme, host, path)
jar.add(c)
except linkcheck.cookies.CookieError:
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Invalid cookie header for %s:%s%s: %r", scheme, host, path, h)
for h in headers.getallmatchingheaders("Set-Cookie2"):
# RFC 2965 cookie type
try:
c = linkcheck.cookies.Rfc2965Cookie(h, scheme, host, path)
jar.add(c)
except linkcheck.cookies.CookieError:
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h)
self.cache[host] = jar
return jar
@synchronized(_lock)
def get (self, scheme, host, port, path):
"""
Cookie cache getter function.
"""
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Get cookies for host %r path %r", host, path)
jar = self.cache.setdefault(host, set())
return [x for x in jar if x.check_expired() and \
x.is_valid_for(scheme, host, port, path)]
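
A usage sketch (host illustrative): the msg attribute of a httplib
response is a mimetools.Message and therefore provides the
getallmatchingheaders() method that add() relies on.

    import httplib

    jar = CookieJar()
    conn = httplib.HTTPConnection("www.example.com")
    conn.request("GET", "/")
    response = conn.getresponse()
    jar.add(response.msg, "http", "www.example.com", "/")
    # later, when sending another request to the same host:
    cookies = jar.get("http", "www.example.com", 80, "/")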


@@ -15,23 +15,44 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
GeoIP wrapper.
Store and retrieve country names for IPs.
"""
import os
import threading
from linkcheck.decorators import synchronized
def get_country (gi, host):
# I don't know if the geoip library is already thread-safe, but
# we take no risks here.
_lock = threading.Lock()
# initialize GeoIP database
geoip = None
try:
import GeoIP
geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
if os.name == 'posix' and os.path.exists(geoip_dat):
geoip = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
del geoip_dat
except ImportError:
pass
@synchronized(_lock)
def get_country (host):
"""
Get translated country name.
@return: country string or None
"""
c = gi.country_code_by_name(host)
if geoip is None:
return None
c = geoip.country_code_by_name(host)
if c and c in countries:
return "%s, %s" % (c, countries[c])
return None
# GeoIP country map with {short name -> translated full name} entries
countries = {
"AP": "Asia/Pacific Region",
"EU": "Europe",

linkcheck/cache/robots_txt.py

@@ -0,0 +1,52 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Cache robots.txt contents.
"""
import threading
from linkcheck.decorators import synchronized
import linkcheck.robotparser2
import linkcheck.configuration
# lock for caching
_lock = threading.Lock()
class RobotsTxt (object):
"""
Thread-safe cache of downloaded robots.txt files.
format: {cache key (string) -> robots.txt content (RobotFileParser)}
"""
def __init__ (self):
self.cache = {}
@synchronized(_lock)
def allows_url (self, roboturl, url, user, password):
"""
Ask robots.txt allowance.
"""
if roboturl not in self.cache:
rp = linkcheck.robotparser2.RobotFileParser(
user=user, password=password)
rp.set_url(roboturl)
rp.read()
self.cache[roboturl] = rp
else:
rp = self.cache[roboturl]
return rp.can_fetch(linkcheck.configuration.UserAgent, url)
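
Usage sketch (URLs illustrative): the first call downloads and caches
the robots.txt file; later calls for the same robots URL are answered
from the cache.

    robots = RobotsTxt()
    allowed = robots.allows_url("http://www.example.com/robots.txt",
                                "http://www.example.com/private/page.html",
                                None, None)    # no HTTP authentication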

linkcheck/cache/urlqueue.py

@@ -0,0 +1,210 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Handle a queue of URLs to check.
"""
import threading
import Queue
import time
import linkcheck
import linkcheck.log
class UrlQueue (Queue.Queue):
"""
A queue supporting several consumer tasks. The task_done() idea is
from the Python 2.5 Subversion repository.
"""
def __init__ (self, maxsize=0):
"""
Initialize the queue state and task counters.
"""
Queue.Queue.__init__(self, maxsize=maxsize)
self.all_tasks_done = threading.Condition(self.mutex)
self.unfinished_tasks = 0
self.finished_tasks = 0
self.in_progress = {}
self.checked = {}
self.shutdown = False
def get (self):
"""
Get the first URL from the queue that is not already being
checked, and return it. If the dequeued URL is currently in
progress, re-queue it and return None. The returned URL
might already be cached.
"""
self.not_empty.acquire()
try:
while self._empty():
self.not_empty.wait()
url_data = self._get()
key = url_data.cache_url_key
if url_data.has_result:
# Already checked and copied from cache.
pass
elif key in self.checked:
# Already checked; copy the result, deliberately ignoring
# the case where the URL also happens to be in progress.
url_data.copy_from_cache(self.checked[key])
elif key in self.in_progress:
# It's being checked currently; put it back in the queue.
Queue.Queue._put(self, url_data)
url_data = None
else:
self.in_progress[key] = url_data
self.not_full.notify()
return url_data
finally:
self.not_empty.release()
def _put (self, url_data):
"""
Put URL in queue, increase number of unfinished tasks.
"""
if self.shutdown:
# don't accept more URLs
return
key = url_data.cache_url_key
if key in self.checked:
# Put at beginning of queue to get consumed quickly.
url_data.copy_from_cache(self.checked[key])
self.queue.appendleft(url_data)
else:
self.queue.append(url_data)
self.unfinished_tasks += 1
def task_done (self, url_data):
"""
Indicate that a formerly enqueued task is complete.
Used by Queue consumer threads. For each get() used to fetch a task,
a subsequent call to task_done() tells the queue that the processing
on the task is complete.
If a join() is currently blocking, it will resume when all items
have been processed (meaning that a task_done() call was received
for every item that had been put() into the queue).
Raises a ValueError if called more times than there were items
placed in the queue.
"""
self.all_tasks_done.acquire()
try:
if url_data is not None:
key = url_data.cache_url_key
if key is not None and key not in self.checked:
self._cache_url(key, url_data)
self.finished_tasks += 1
unfinished = self.unfinished_tasks - 1
if unfinished <= 0:
if unfinished < 0:
raise ValueError('task_done() called too many times')
self.all_tasks_done.notifyAll()
self.unfinished_tasks = unfinished
finally:
self.all_tasks_done.release()
def _cache_url (self, key, url_data):
"""
Put URL result data into cache.
"""
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Caching %r", key)
assert key in self.in_progress, \
"%r not in %s" % (key, self.in_progress)
del self.in_progress[key]
data = url_data.get_cache_data()
self.checked[key] = data
# check for aliases (eg. through HTTP redirections)
if hasattr(url_data, "aliases"):
data = url_data.get_alias_cache_data()
for key in url_data.aliases:
if key in self.checked or key in self.in_progress:
continue
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Caching alias %r", key)
self.checked[key] = data
def join (self, timeout=None):
"""Blocks until all items in the Queue have been gotten and processed.
The count of unfinished tasks goes up whenever an item is added to the
queue. The count goes down whenever a consumer thread calls task_done()
to indicate the item was retrieved and all work on it is complete.
When the count of unfinished tasks drops to zero, join() unblocks.
"""
self.all_tasks_done.acquire()
try:
if timeout is None:
while self.unfinished_tasks:
self.all_tasks_done.wait()
else:
if timeout < 0:
raise ValueError("'timeout' must be a positive number")
endtime = time.time() + timeout
while self.unfinished_tasks:
remaining = endtime - time.time()
if remaining <= 0.0:
return
self.all_tasks_done.wait(remaining)
finally:
self.all_tasks_done.release()
def do_shutdown (self):
"""
Shutdown the queue by not accepting any more URLs.
"""
self.mutex.acquire()
try:
unfinished = self.unfinished_tasks - len(self.queue)
self.queue.clear()
if unfinished <= 0:
if unfinished < 0:
raise ValueError('shutdown is in error')
self.all_tasks_done.notifyAll()
self.unfinished_tasks = unfinished
self.shutdown = True
finally:
self.mutex.release()
def status (self):
"""
Get tuple (finished tasks, unfinished tasks, queue size).
"""
self.mutex.acquire()
try:
return (self.finished_tasks, self.unfinished_tasks, len(self.queue))
finally:
self.mutex.release()
def checked_redirect (self, redirect, url_data):
"""
Check if redirect is already in cache. Used for URL redirections
to avoid double checking of already cached URLs.
If the redirect URL is found in the cache, the result data is
already copied.
"""
self.mutex.acquire()
try:
if redirect in self.checked:
url_data.copy_from_cache(self.checked[redirect])
return True
return False
finally:
self.mutex.release()
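
A sketch of the consumer protocol described by the docstrings above.
The actual worker threads live in the new linkcheck.director package,
which this commit does not show in full, so the loop below is an
assumption based purely on the UrlQueue API (loop termination elided):

    def worker (urlqueue):
        while True:
            url_data = urlqueue.get()    # blocks while the queue is empty
            if url_data is None:
                # the dequeued URL is being checked by another worker
                # and was re-queued; nothing to do this round
                continue
            try:
                if not url_data.has_result:
                    url_data.check()     # UrlBase.check(), see urlbase.py
            finally:
                # always account for the task so that join() and the
                # shutdown bookkeeping stay consistent
                urlqueue.task_done(url_data)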


@@ -18,19 +18,14 @@
Main functions for link checking.
"""
import time
import sys
import os
import cgi
import socket
import codecs
import traceback
import select
import re
import urllib
import nntplib
import ftplib
import linkcheck.httplib2
import linkcheck.strformat
import linkcheck.dns.exception
@@ -153,110 +148,6 @@ acap # application configuration access protocol
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
_encoding = linkcheck.i18n.default_encoding
stderr = codecs.getwriter(_encoding)(sys.stderr, errors="ignore")
def internal_error ():
"""
Print internal error message to stderr.
"""
print >> stderr, os.linesep
print >> stderr, _("""********** Oops, I did it again. *************
You have found an internal error in LinkChecker. Please write a bug report
at http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913
or send mail to %s and include the following information:
- the URL or file you are testing
- your commandline arguments and/or configuration.
- the output of a debug run with option "-Dall" of the executed command
- the system information below.
Not disclosing some of the information above due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""") % linkcheck.configuration.Email
etype, value = sys.exc_info()[:2]
print >> stderr, etype, value
traceback.print_exc()
print_app_info()
print >> stderr, os.linesep, \
_("******** LinkChecker internal error, over and out ********")
sys.exit(1)
def print_app_info ():
"""
Print system and application info to stderr.
"""
print >> stderr, _("System info:")
print >> stderr, linkcheck.configuration.App
print >> stderr, _("Python %s on %s") % (sys.version, sys.platform)
for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"):
value = os.getenv(key)
if value is not None:
print >> stderr, key, "=", repr(value)
def check_urls (consumer):
"""
Main check function; checks all configured URLs until interrupted
with Ctrl-C. If you call this function more than once, you can specify
different configurations with the consumer parameter.
@param consumer: an object where all runtime-dependent options are
stored
@type consumer: linkcheck.consumer.Consumer
@return: None
"""
try:
_check_urls(consumer)
except (KeyboardInterrupt, SystemExit):
consumer.abort()
except:
consumer.abort()
internal_error()
def _check_urls (consumer):
"""
Checks all configured URLs. Prints status information, calls logger
methods.
@param consumer: an object where all runtime-dependent options are
stored
@type consumer: linkcheck.consumer.Consumer
@return: None
"""
start_time = time.time()
status_time = start_time
while not consumer.finished():
url_data = consumer.incoming_get_url()
if url_data is None:
# wait for incoming queue to fill
time.sleep(0.1)
elif url_data.cached:
# was cached -> can be logged
consumer.log_url(url_data)
else:
# go check this url
if url_data.parent_url and not \
linkcheck.url.url_is_absolute(url_data.base_url):
name = url_data.parent_url
else:
name = u""
if url_data.base_url:
name += url_data.base_url
if not name:
name = None
consumer.check_url(url_data, name)
if consumer.config('status'):
curtime = time.time()
if (curtime - status_time) > 5:
consumer.print_status(curtime, start_time)
status_time = curtime
consumer.end_log_output()
# file extensions we can parse recursively
extensions = {
"html": re.compile(r'(?i)\.s?html?$'),
@@ -298,9 +189,9 @@ def absolute_url (base_url, base_ref, parent_url):
return u""
def get_url_from (base_url, recursion_level, consumer,
def get_url_from (base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=0, column=0,
name=u"", cmdline=False):
name=u"", assume_local=False):
"""
Get url data from given base data.
@@ -308,8 +199,8 @@ def get_url_from (base_url, recursion_level, consumer,
@type base_url: string or None
@param recursion_level: current recursion level
@type recursion_level: number
@param consumer: consumer object
@type consumer: linkcheck.checker.consumer.Consumer
@param aggregate: aggregate object
@type aggregate: linkcheck.checker.aggregate.Consumer
@param parent_url: parent url
@type parent_url: string or None
@param base_ref: base url from <base> tag
@@ -329,7 +220,14 @@ def get_url_from (base_url, recursion_level, consumer,
base_ref = linkcheck.strformat.unicode_safe(base_ref)
name = linkcheck.strformat.unicode_safe(name)
url = absolute_url(base_url, base_ref, parent_url).lower()
# test scheme
klass = get_urlclass_from(url, assume_local)
return klass(base_url, recursion_level, aggregate,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
def get_urlclass_from (url, assume_local):
"""Return checker class for given URL."""
if url.startswith("http:"):
klass = linkcheck.checker.httpurl.HttpUrl
elif url.startswith("ftp:"):
@@ -351,24 +249,13 @@ def get_url_from (base_url, recursion_level, consumer,
elif ignored_schemes_re.search(url):
# ignored url
klass = linkcheck.checker.ignoredurl.IgnoredUrl
elif cmdline:
# assume local file on command line
elif assume_local:
# assume local file
klass = linkcheck.checker.fileurl.FileUrl
else:
# error url, no further checking, just log this
klass = linkcheck.checker.errorurl.ErrorUrl
url_data = klass(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
if cmdline:
# add intern URL regex to config for every URL that was given
# on the command line
pat = url_data.get_intern_pattern()
assert linkcheck.log.debug(linkcheck.LOG_CMDLINE,
"Add intern pattern %r from command line", pat)
if pat:
consumer.config_append('internlinks', linkcheck.get_link_pat(pat))
return url_data
return klass
def get_index_html (urls):


@@ -23,11 +23,13 @@ import os
import time
import urlparse
import urllib
import urllib2
import urlbase
import linkcheck
import linkcheck.log
import linkcheck.checker
import linkcheck.fileutil
# if file extension lookup was unsuccessful, look at the content
contents = {
@@ -83,7 +85,7 @@ class FileUrl (urlbase.UrlBase):
"""
def init (self, base_ref, base_url, parent_url, recursion_level,
consumer, line, column, name):
aggregate, line, column, name):
"""
Besides the usual initialization the URL is normed according
to the platform:
@@ -91,7 +93,7 @@ class FileUrl (urlbase.UrlBase):
- under Windows platform the drive specifier is normed
"""
super(FileUrl, self).init(base_ref, base_url, parent_url,
recursion_level, consumer, line, column, name)
recursion_level, aggregate, line, column, name)
if self.base_url is None:
return
base_url = self.base_url
@@ -129,7 +131,8 @@ class FileUrl (urlbase.UrlBase):
if self.is_directory():
self.set_result(_("directory"))
else:
super(FileUrl, self).check_connection()
url = linkcheck.fileutil.pathencode(self.url)
self.url_connection = urllib2.urlopen(url)
self.check_case_sensitivity()
def check_case_sensitivity (self):
@@ -147,7 +150,6 @@ class FileUrl (urlbase.UrlBase):
"system path %r. You should always use "
"the system path in URLs.") % (path, realpath),
tag="file-system-path")
pass
def get_content (self):
"""
@@ -208,7 +210,7 @@ class FileUrl (urlbase.UrlBase):
path = self.urlparts[2]
if os.name == 'nt':
path = prepare_urlpath_for_nt(path)
return urllib.url2pathname(path)
return linkcheck.fileutil.pathencode(urllib.url2pathname(path))
def is_directory (self):
"""


@@ -53,12 +53,12 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
order: login, changing directory, list the file.
"""
# proxy support (we support only http)
self.set_proxy(self.consumer.config("proxy").get(self.scheme))
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
if self.proxy:
# using a (HTTP) proxy
http = httpurl.HttpUrl(self.base_url,
self.recursion_level,
self.consumer,
self.aggregate,
parent_url=self.parent_url,
base_ref=self.base_ref,
line=self.line,
@@ -87,7 +87,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# ready to connect
_user, _password = self.get_user_password()
key = ("ftp", self.urlparts[1], _user, _password)
conn = self.consumer.get_connection(key)
conn = self.aggregate.connections.get(key)
if conn is not None and conn.sock is not None:
# reuse cached FTP connection
self.url_connection = conn
@@ -248,6 +248,6 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# add to cached connections
_user, _password = self.get_user_password()
key = ("ftp", self.urlparts[1], _user, _password)
cache_add = self.consumer.add_connection
cache_add = self.aggregate.connections.add
cache_add(key, self.url_connection, DEFAULT_TIMEOUT_SECS)
self.url_connection = None


@@ -129,8 +129,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
roboturl = self.get_robots_txt_url()
user, password = self.get_user_password()
return self.consumer.robots_txt_allows_url(roboturl, url,
user, password)
return self.aggregate.robots_txt.allows_url(roboturl, url,
user, password)
def check_connection (self):
"""
@@ -150,7 +150,7 @@
valid request
"""
# set the proxy, so a 407 status after this is an error
self.set_proxy(self.consumer.config("proxy").get(self.scheme))
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
# initialize check data
self.headers = None
self.auth = None
@@ -360,19 +360,19 @@
tag="http-moved-permanent")
self.has301status = True
# check cache again on the changed URL
if self.consumer.checked_redirect(redirected, self):
if self.aggregate.urlqueue.checked_redirect(redirected, self):
return -1, response
# in case of changed scheme make new URL object
if self.urlparts[0] != self.scheme:
newobj = linkcheck.checker.get_url_from(
redirected, self.recursion_level, self.consumer,
redirected, self.recursion_level, self.aggregate,
parent_url=self.parent_url, base_ref=self.base_ref,
line=self.line, column=self.column, name=self.name,
cmdline=False)
assume_local=False)
newobj.warnings = self.warnings
newobj.info = self.info
# append new object to queue
self.consumer.append_url(newobj)
self.aggregate.append_url(newobj)
# pretend to be finished and logged
return -1, response
# new response data
@@ -406,14 +406,14 @@
linkcheck.strformat.unicode_safe(response.reason),
tag="http-empty-content")
# store cookies for valid links
if self.consumer.config('cookies'):
if self.aggregate.config['cookies']:
for c in self.cookies:
self.add_info(_("Store cookie: %s.") % c)
try:
out = self.consumer.store_cookies(self.headers,
self.urlparts[0],
self.urlparts[1],
self.urlparts[2])
out = self.aggregate.cookies.add(self.headers,
self.urlparts[0],
self.urlparts[1],
self.urlparts[2])
for h in out:
self.add_info(linkcheck.strformat.unicode_safe(h))
except Cookie.CookieError, msg:
@@ -471,13 +471,13 @@
linkcheck.configuration.UserAgent)
self.url_connection.putheader("Accept-Encoding",
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
if self.consumer.config('cookies'):
if self.aggregate.config['cookies']:
scheme = self.urlparts[0]
host = self.urlparts[1]
port = linkcheck.url.default_ports.get(scheme, 80)
host, port = urllib.splitnport(host, port)
path = self.urlparts[2]
self.cookies = self.consumer.get_cookies(scheme, host, port, path)
self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
for c in self.cookies:
name = c.client_header_name()
value = c.client_header_value()
@@ -505,7 +505,7 @@
"""
_user, _password = self.get_user_password()
key = (scheme, self.urlparts[1], _user, _password)
conn = self.consumer.get_connection(key)
conn = self.aggregate.connections.get(key)
if conn is not None:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"reuse cached HTTP(S) connection %s", conn)
@@ -634,7 +634,7 @@
# add to cached connections
_user, _password = self.get_user_password()
key = ("http", self.urlparts[1], _user, _password)
cache_add = self.consumer.add_connection
cache_add = self.aggregate.connections.add
# note: only cache the connection when it is persistent
# and all pending content has been received
if not self.persistent or not self.has_content or \


@@ -40,7 +40,7 @@ class NntpUrl (urlbase.UrlBase):
Connect to NNTP server and try to request the URL article
resource (if specified).
"""
nntpserver = self.host or self.consumer.config("nntpserver")
nntpserver = self.host or self.aggregate.config["nntpserver"]
if not nntpserver:
self.add_warning(
_("No NNTP server was specified, skipping this URL."),


@@ -63,7 +63,7 @@ class ProxySupport (object):
"""
Check if self.host is in the no-proxy-for ignore list.
"""
for ro in self.consumer.config("noproxyfor"):
for ro in self.aggregate.config["noproxyfor"]:
if ro.search(self.host):
return True
return False


@@ -59,7 +59,7 @@ class TelnetUrl (urlbase.UrlBase):
label is "login: ", expected password label is "Password: ".
"""
self.url_connection = telnetlib.Telnet()
if self.consumer.config("debug"):
if self.aggregate.config["debug"]:
self.url_connection.set_debuglevel(1)
self.url_connection.open(self.host, self.port)
if self.user:


@@ -25,8 +25,6 @@ import unittest
import linkcheck
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.configuration
import linkcheck.logger
@@ -93,7 +91,17 @@ class TestLogger (linkcheck.logger.Logger):
self.diff.append(line)
def get_test_consumer (confargs, logargs):
def get_file (filename=None):
"""
Get file name located within 'data' directory.
"""
directory = os.path.join("linkcheck", "checker", "tests", "data")
if filename:
return unicode(os.path.join(directory, filename))
return unicode(directory)
def get_test_aggregate (confargs, logargs):
"""
Initialize a test configuration object.
"""
@@ -101,14 +109,15 @@ def get_test_consumer (confargs, logargs):
config.logger_add('test', TestLogger)
config['recursionlevel'] = 1
config['logger'] = config.logger_new('test', **logargs)
# uncomment for debugging
#config.init_logging(debug=["all"])
config["anchors"] = True
config["verbose"] = True
config['threads'] = 0
config['status'] = False
config['cookies'] = True
config['geoip'] = None
config.update(confargs)
cache = linkcheck.checker.cache.Cache()
return linkcheck.checker.consumer.Consumer(config, cache)
return linkcheck.director.get_aggregate(config)
class LinkCheckTest (unittest.TestCase):
@@ -122,21 +131,14 @@ class LinkCheckTest (unittest.TestCase):
"""
return linkcheck.url.url_norm(url)[0]
def get_file (self, filename):
"""
Get file name located within 'data' directory.
"""
return unicode(os.path.join("linkcheck", "checker", "tests",
"data", filename))
def get_resultlines (self, filename):
"""
Return contents of file, as list of lines without line endings,
ignoring empty lines and lines starting with a hash sign (#).
"""
resultfile = self.get_file(filename+".result")
resultfile = get_file(filename+".result")
d = {'curdir': os.getcwd(),
'datadir': 'linkcheck/checker/tests/data',
'datadir': get_file(),
}
f = codecs.open(resultfile, "r", "iso-8859-15")
resultlines = [line.rstrip('\r\n') % d for line in f \
@@ -144,27 +146,30 @@
f.close()
return resultlines
def file_test (self, filename, confargs=None, cmdline=True):
def file_test (self, filename, confargs=None, assume_local=True):
"""
Check <filename> with expected result in <filename>.result.
"""
url = self.get_file(filename)
url = get_file(filename)
if confargs is None:
confargs = {}
logargs = {'expected': self.get_resultlines(filename)}
consumer = get_test_consumer(confargs, logargs)
aggregate = get_test_aggregate(confargs, logargs)
url_data = linkcheck.checker.get_url_from(
url, 0, consumer, cmdline=cmdline)
consumer.append_url(url_data)
linkcheck.checker.check_urls(consumer)
if consumer.config('logger').diff:
url, 0, aggregate, assume_local=assume_local)
if assume_local:
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
diff = aggregate.config['logger'].diff
if diff:
sep = unicode(os.linesep)
l = [url] + consumer.config('logger').diff
l = [url] + diff
l = sep.join(l)
self.fail(l.encode("iso8859-1", "ignore"))
def direct (self, url, resultlines, fields=None, recursionlevel=0,
confargs=None, cmdline=False):
confargs=None, assume_local=False):
"""
Check url with expected result.
"""
@@ -176,14 +181,17 @@
logargs = {'expected': resultlines}
if fields is not None:
logargs['fields'] = fields
consumer = get_test_consumer(confargs, logargs)
aggregate = get_test_aggregate(confargs, logargs)
url_data = linkcheck.checker.get_url_from(
url, 0, consumer, cmdline=cmdline)
consumer.append_url(url_data)
linkcheck.checker.check_urls(consumer)
if consumer.config('logger').diff:
url, 0, aggregate, assume_local=assume_local)
if assume_local:
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
diff = aggregate.config['logger'].diff
if diff:
sep = unicode(os.linesep)
l = [u"Differences found testing %s" % url]
l.extend(x.rstrip() for x in consumer.config('logger').diff[2:])
l.extend(x.rstrip() for x in diff[2:])
self.fail(sep.join(l).encode("iso8859-1", "ignore"))


@@ -1,7 +1,3 @@
url
cache key None
real url None
error
url file://%(curdir)s/%(datadir)s/misc.html
cache key file://%(curdir)s/%(datadir)s/misc.html
real url file://%(curdir)s/%(datadir)s/misc.html
@@ -21,3 +17,8 @@ url favicon.ico (cached)
cache key file://%(curdir)s/%(datadir)s/favicon.ico
real url file://%(curdir)s/%(datadir)s/favicon.ico
valid
url
cache key None
real url None
error


@@ -36,8 +36,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
url = u"http://localhost:%d/linkcheck/checker/tests/data/" \
u"http.html" % self.port
resultlines = self.get_resultlines("http.html")
self.direct(url, resultlines, recursionlevel=1, cmdline=True)
self.redirect_http_test()
self.direct(url, resultlines, recursionlevel=1, assume_local=True)
self.redirect1_http_test()
self.redirect2_http_test()
self.noproxyfor_test()
finally:
self.stop_server()
@@ -64,9 +65,9 @@
u"original URL was u'http://localhost:%d/redirect1'." % self.port,
u"valid",
]
self.direct(url, resultlines, recursionlevel=0, cmdline=True)
self.direct(url, resultlines, recursionlevel=0, assume_local=True)
def redirect_http_test (self):
def redirect1_http_test (self):
url = u"http://localhost:%d/redirect1" % self.port
nurl = url
rurl = url.replace("redirect", "newurl")
@@ -77,7 +78,9 @@
u"info Redirected to %s." % rurl,
u"error",
]
self.direct(url, resultlines, recursionlevel=0, cmdline=True)
self.direct(url, resultlines, recursionlevel=0, assume_local=True)
def redirect2_http_test (self):
url = u"http://localhost:%d/linkcheck/checker/tests/data/redirect.html" % \
self.port
nurl = url
@@ -94,7 +97,7 @@
u"name Recursive Redirect",
u"valid",
]
self.direct(url, resultlines, recursionlevel=99, cmdline=True)
self.direct(url, resultlines, recursionlevel=99, assume_local=True)
def noproxyfor_test (self):
"""
@@ -113,7 +116,7 @@
u"valid",
]
self.direct(url, resultlines, recursionlevel=0,
confargs=confargs, cmdline=True)
confargs=confargs, assume_local=True)
del os.environ["http_proxy"]


@@ -32,6 +32,7 @@ import traceback
import linkcheck
import linkcheck.linkparse
import linkcheck.checker
import linkcheck.director
import linkcheck.strformat
import linkcheck.containers
import linkcheck.log
@@ -55,7 +56,7 @@ class UrlBase (object):
An URL with additional information like validity etc.
"""
def __init__ (self, base_url, recursion_level, consumer,
def __init__ (self, base_url, recursion_level, aggregate,
parent_url = None, base_ref = None,
line = -1, column = -1, name = u""):
"""
@@ -63,7 +64,7 @@
@param base_url: unquoted and possibly unnormed url
@param recursion_level: on what check level lies the base url
@param consumer: consumer instance
@param aggregate: aggregate instance
@param parent_url: quoted and normed url of parent or None
@param base_ref: quoted and normed url of <base href=""> or None
@param line: line number of url in parent content
@@ -71,13 +72,13 @@
@param name: name of url or empty
"""
self.init(base_ref, base_url, parent_url, recursion_level,
consumer, line, column, name)
aggregate, line, column, name)
self.reset()
self.check_syntax()
def init (self, base_ref, base_url, parent_url, recursion_level,
consumer, line, column, name):
aggregate, line, column, name):
"""
Initialize internal data.
"""
@@ -86,7 +87,7 @@
self.base_url = base_url
self.parent_url = parent_url
self.recursion_level = recursion_level
self.consumer = consumer
self.aggregate = aggregate
self.line = line
self.column = column
self.name = name
@@ -203,6 +204,7 @@
Fill attributes from cache data.
"""
self.result = cache_data["result"]
self.has_result = True
self.warnings.extend(cache_data["warnings"])
self.info.extend(cache_data["info"])
self.valid = cache_data["valid"]
@@ -240,8 +242,8 @@
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Content cache key %r", self.cache_content_key)
# construct cache key
if self.consumer.config("anchorcaching") and \
self.consumer.config("anchors"):
if self.aggregate.config["anchorcaching"] and \
self.aggregate.config["anchors"]:
# do not ignore anchor
parts = self.urlparts[:]
parts[4] = self.anchor
@@ -343,32 +345,28 @@
"""
Main check function for checking this URL.
"""
if self.consumer.config("trace"):
if self.aggregate.config["trace"]:
linkcheck.trace.trace_on()
try:
self.local_check()
self.consumer.checked(self)
except (socket.error, select.error):
self.consumer.interrupted(self)
# on Unix, ctrl-c can raise
# error: (4, 'Interrupted system call')
etype, value = sys.exc_info()[:2]
if etype == 4:
if etype == errno.EINTR:
raise KeyboardInterrupt(value)
else:
raise
except KeyboardInterrupt:
self.consumer.interrupted(self)
raise
except:
self.consumer.interrupted(self)
linkcheck.checker.internal_error()
linkcheck.director.internal_error()
def add_country_info (self):
"""
Try to ask GeoIP database for country info.
"""
country = self.consumer.get_country_name(self.host)
country = linkcheck.cache.geoip.get_country(self.host)
if country is not None:
self.add_info(_("URL is located in %s.") % _(country))
@@ -377,10 +375,11 @@
Local check function can be overridden in subclasses.
"""
assert linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
if self.recursion_level and self.consumer.config('wait'):
wait = self.aggregate.config['wait']
if self.recursion_level and wait:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"sleeping for %d seconds", self.consumer.config('wait'))
time.sleep(self.consumer.config('wait'))
"sleeping for %d seconds", wait)
time.sleep(wait)
t = time.time()
self.set_extern(self.url)
if self.extern[0] and self.extern[1]:
@@ -392,7 +391,7 @@
try:
self.check_connection()
self.add_country_info()
if self.consumer.config("anchors"):
if self.aggregate.config["anchors"]:
self.check_anchors()
except tuple(linkcheck.checker.ExcList):
value = self.handle_exception()
@@ -406,7 +405,7 @@
valid=False)
# check content
warningregex = self.consumer.config("warningregex")
warningregex = self.aggregate.config["warningregex"]
if warningregex and self.valid:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"checking content")
@@ -486,8 +485,8 @@
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"... no, cannot get content.")
return False
if self.consumer.config("recursionlevel") >= 0 and \
self.recursion_level >= self.consumer.config("recursionlevel"):
rec_level = self.aggregate.config["recursionlevel"]
if rec_level >= 0 and self.recursion_level >= rec_level:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"... no, maximum recursion level reached.")
return False
@@ -551,7 +550,7 @@
@return: None
"""
for entry in self.consumer.config("externlinks"):
for entry in self.aggregate.config["externlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
@@ -559,7 +558,7 @@
"Extern URL %r", url)
self.extern = (1, entry['strict'])
return
for entry in self.consumer.config("internlinks"):
for entry in self.aggregate.config["internlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
@@ -607,7 +606,7 @@
If a maximum size was given, call this function to check it
against the content size of this url.
"""
maxbytes = self.consumer.config("warnsizebytes")
maxbytes = self.aggregate.config["warnsizebytes"]
if maxbytes is not None and self.dlsize >= maxbytes:
self.add_warning(_("Content size %s is larger than %s.") %
(linkcheck.strformat.strsize(self.dlsize),
@@ -626,7 +625,7 @@
Get tuple (user, password) from configured authentication.
Both user and password can be None if not specified.
"""
for auth in self.consumer.config("authentication"):
for auth in self.aggregate.config["authentication"]:
if auth['pattern'].match(self.url):
return auth['user'], auth['password']
return None, None
@@ -651,10 +650,10 @@
else:
base_ref = h.base_ref
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.consumer, parent_url=self.url,
self.recursion_level+1, self.aggregate, parent_url=self.url,
base_ref=base_ref, line=line, column=column, name=name,
cmdline=False)
self.consumer.append_url(url_data)
assume_local=False)
self.aggregate.urlqueue.put(url_data)
def parse_opera (self):
"""
@@ -674,10 +673,10 @@
url = line[4:]
if url:
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.consumer,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, name=name,
cmdline=False)
self.consumer.append_url(url_data)
assume_local=False)
self.aggregate.urlqueue.put(url_data)
name = ""
def parse_text (self):
@@ -694,10 +693,10 @@
if not line or line.startswith('#'):
continue
url_data = linkcheck.checker.get_url_from(line,
self.recursion_level+1, self.consumer,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno,
cmdline=False)
self.consumer.append_url(url_data)
assume_local=False)
self.aggregate.urlqueue.put(url_data)
def parse_css (self):
"""
@@ -712,10 +711,10 @@
column = mo.start("url")
url = linkcheck.strformat.unquote(mo.group("url").strip())
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.consumer,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, column=column,
cmdline=False)
self.consumer.append_url(url_data)
assume_local=False)
self.aggregate.urlqueue.put(url_data)
def serialized (self):
"""
@@ -758,7 +757,7 @@
@rtype: string
"""
s = self.serialized()
return self.consumer.config('logger').encode(s)
return self.aggregate.config['logger'].encode(s)
def __repr__ (self):
"""


@@ -28,11 +28,6 @@ import linkcheck
import linkcheck.log
import linkcheck.containers
import confparse
try:
import GeoIP
_has_geoip = True
except ImportError:
_has_geoip = False
Version = _linkchecker_configdata.version
AppName = u"LinkChecker"
@@ -83,6 +78,7 @@ class Configuration (dict):
self["internlinks"] = []
self["noproxyfor"] = []
self["interactive"] = False
self["maxqueuesize"] = 0
# on ftp, password is set by Pythons ftplib
self["authentication"] = []
self["proxy"] = urllib.getproxies()
@@ -149,18 +145,6 @@
self["warnsizebytes"] = None
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
self["threads"] = 10
self.init_geoip()
def init_geoip (self):
"""
If GeoIP.dat file is found, initialize a standard geoip DB and
store it in self["geoip"]; else this value will be None.
"""
geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
if _has_geoip and os.path.exists(geoip_dat):
self["geoip"] = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
else:
self["geoip"] = None
def init_logging (self, debug=None):
"""


@@ -23,12 +23,14 @@ import os
import linkcheck.configuration
def get_file (filename):
def get_file (filename=None):
"""
Get file name located within 'data' directory.
"""
return unicode(os.path.join("linkcheck", "configuration", "tests",
"data", filename))
directory = os.path.join("linkcheck", "configuration", "tests", "data")
if filename:
return unicode(os.path.join(directory, filename))
return unicode(directory)
class TestConfig (unittest.TestCase):


@@ -297,40 +297,3 @@ class Rfc2965Cookie (HttpCookie):
# XXX more methods (equality test)
class CookieJar (set):
"""
Cookie storage, implementing the default cookie handling policy for
LinkChecker.
"""
def add_cookies (self, headers, scheme, host, path):
"""
Parse cookie values, add to jar.
"""
to_add = set()
for h in headers.getallmatchingheaders("Set-Cookie"):
# RFC 2109 (Netscape) cookie type
try:
to_add.add(NetscapeCookie(h, scheme, host, path))
except CookieError:
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Invalid cookie header for %s:%s%s: %r", scheme, host, path, h)
for h in headers.getallmatchingheaders("Set-Cookie2"):
# RFC 2965 cookie type
try:
to_add.add(Rfc2965Cookie(h, scheme, host, path))
except CookieError:
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h)
for x in to_add:
self.add(x)
return to_add
def remove_expired (self):
"""
Remove expired cookies from jar.
"""
to_remove = [x for x in self if not x.check_expired()]
return self.difference_update(to_remove)


@@ -31,8 +31,7 @@ import linkcheck.url
import linkcheck.i18n
import linkcheck.strformat
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.director
_logfile = None
_supported_langs = ('de', 'fr', 'nl', 'C')
@@ -99,13 +98,16 @@ def checklink (out=sys.stdout, form=None, env=os.environ):
config["externlinks"].append(
linkcheck.get_link_pat("^%s$" % linkcheck.url.safe_url_pattern))
config["externlinks"].append(linkcheck.get_link_pat(".*", strict=True))
# start checking
aggregate = linkcheck.director.get_aggregate(config)
cache = linkcheck.checker.cache.Cache()
consumer = linkcheck.checker.consumer.Consumer(config, cache)
# start checking
url = form["url"].value
url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=False)
consumer.append_url(url_data)
linkcheck.checker.check_urls(consumer)
url_data = linkcheck.checker.get_url_from(url, 0, aggregate,
assume_local=False)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
def get_host_name (form):


@@ -20,18 +20,16 @@ Test url build method from url data objects.
import unittest
import linkcheck.configuration
import linkcheck.director
import linkcheck.checker.httpurl
import linkcheck.checker.cache
import linkcheck.checker.consumer
def get_test_consumer ():
def get_test_aggregate ():
"""
Initialize a test configuration object.
"""
config = linkcheck.configuration.Configuration()
config['logger'] = config.logger_new('none')
cache = linkcheck.checker.cache.Cache()
return linkcheck.checker.consumer.Consumer(config, cache)
return linkcheck.director.get_aggregate(config)
class TestUrlBuild (unittest.TestCase):
@@ -43,9 +41,9 @@ class TestUrlBuild (unittest.TestCase):
parent_url = "http://localhost:8001/linkcheck/checker/tests/data/http.html"
base_url = "http://"
recursion_level = 0
consumer = get_test_consumer()
aggregate = get_test_aggregate()
o = linkcheck.checker.httpurl.HttpUrl(base_url, recursion_level,
consumer, parent_url=parent_url)
aggregate, parent_url=parent_url)
o.build_url()
self.assertEquals(o.url, 'http://')


@@ -38,8 +38,7 @@ optparse._ = _
import linkcheck.log
import linkcheck.i18n
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.director
import linkcheck.configuration
import linkcheck.fileutil
import linkcheck.strformat
@@ -654,14 +653,15 @@ if len(args) <= 0:
else:
linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or URLs given"))
# initialize the cache and the consumer model
cache = linkcheck.checker.cache.Cache()
consumer = linkcheck.checker.consumer.Consumer(config, cache)
# prepare checking queue
aggregate = linkcheck.director.get_aggregate(config)
if options.trace:
config["trace"] = True
import linkcheck.trace
linkcheck.trace.trace_filter([r"^linkcheck"])
linkcheck.trace.trace_on()
# add urls to queue
get_url_from = linkcheck.checker.get_url_from
for url in args:
if url.lower().startswith("www."):
# syntactic sugar
@@ -669,14 +669,14 @@ for url in args:
elif url.lower().startswith("ftp."):
# syntactic sugar
url = "ftp://%s" % url
url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True)
consumer.append_url(url_data)
############################# check the URLs ################################
url_data = get_url_from(url, 0, aggregate, assume_local=True)
linkcheck.add_intern_pattern(url_data, config)
aggregate.urlqueue.put(url_data)
# set up profiling/psyco
if do_profile and not has_profile:
linkcheck.log.warn(linkcheck.LOG_CMDLINE,
_("The `profile' Python module is not installed,"
" therefore the --profile option is disabled."))
if do_profile and has_profile:
run = True
if os.path.exists(_profile):
@@ -690,7 +690,7 @@ if do_profile and has_profile:
run = False
if run:
import profile
profile.run("linkcheck.checker.check_urls(consumer)", _profile)
profile.run("manager.check_urls()", _profile)
elif options.psyco:
try:
import psyco
@@ -705,8 +705,8 @@ elif options.psyco:
except ImportError:
# no psyco available, just ignore
pass
linkcheck.checker.check_urls(consumer)
#############################################################################
# start checking
linkcheck.director.check_urls(aggregate)
# interactive input end
if config['interactive']:


@@ -537,7 +537,8 @@ o a (Fast)CGI web interface (requires HTTP server)
'clean': MyClean,
},
packages = ['linkcheck', 'linkcheck.logger', 'linkcheck.checker',
'linkcheck.configuration',
'linkcheck.director', 'linkcheck.configuration',
'linkcheck.cache',
'linkcheck.dns', 'linkcheck.dns.rdtypes',
'linkcheck.dns.rdtypes.ANY', 'linkcheck.dns.rdtypes.IN',
'linkcheck.HtmlParser', 'linkcheck.ftpparse', ],