diff --git a/ChangeLog b/ChangeLog
index 474e387e..0ef48cc9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -14,6 +14,12 @@
Changed: linkcheck/HtmlParser/htmllex.[lc],
linkcheck/tests/test_parser.py
+ * Revamp the threading algorithm by using a URL queue, with a
+ constant number of consumer threads called 'workers'.
+ This fixes the remaining "deque mutated during iteration" errors.
+ Type: feature
+ Changed: *.py
+
3.4 "The Chumscrubbers" (released 4.2.2006)
* Ignore decoding errors when retrieving the robots.txt URL.
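
The worker/queue model described in the ChangeLog entry above boils down to a
fixed pool of threads draining a shared Queue. A minimal sketch, not
LinkChecker's actual code (the check function and the None shutdown sentinel
are assumptions for illustration):

    import threading
    import Queue

    def check (url):
        # stand-in for the real URL check
        print "checking", url

    def worker (queue):
        # Workers never iterate over a shared container; they only
        # call queue.get(), which Queue.Queue serializes internally.
        while True:
            url = queue.get()
            if url is None:
                break   # shutdown sentinel
            check(url)

    queue = Queue.Queue()
    threads = [threading.Thread(target=worker, args=(queue,))
               for _ in xrange(10)]
    for t in threads:
        t.start()
    queue.put("http://example.com/")
    for t in threads:
        queue.put(None)   # one sentinel per worker
    for t in threads:
        t.join()
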
diff --git a/MANIFEST.in b/MANIFEST.in
index de62f0a0..077cbae1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -37,6 +37,7 @@ include doc/de/*.1
include doc/fr/*.1
include doc/Makefile doc/rest2htmlnav
recursive-include linkcheck/checker/tests/data *.txt *.html *.result *.asc *.css *.ico
+recursive-include linkcheck/configuration/tests/data *.ini
include linkcheck/tests/*.py
include linkcheck/checker/tests/*.py
include linkcheck/dns/tests/*.py
diff --git a/TODO b/TODO
index 1b78c670..0461182d 100644
--- a/TODO
+++ b/TODO
@@ -1,7 +1,11 @@
-http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/483752
-http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/475160
+- Improve print_status
-- use format_time from quodlibet for times
+- Ctrl-C is not really working.
+
+- To limit memory usage, put a maximum size on the URL queue
+ (e.g. 20000 URLs). When the limit is reached, a worker calling
+ queue.put() will wait for another worker to call queue.get()
+ before continuing.
+ Problem: deadlock once all workers are blocked in queue.put().
- [FEATURE] postmortem debugging with pdb.pm()
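
The deadlock noted in the TODO item above arises because every worker is both
a consumer and a producer. A minimal sketch of the failure mode, not project
code (URLs are made up): once the bounded queue is full and all workers are
blocked in put(), nobody is left to call get():

    import threading
    import Queue
    import time

    queue = Queue.Queue(maxsize=2)   # deliberately tiny bound

    def worker ():
        while True:
            url = queue.get()
            # Every checked URL yields new URLs. With the queue full
            # and all workers blocked here in put(), no thread ever
            # reaches get() again.
            for child in (url + "/a", url + "/b", url + "/c"):
                queue.put(child)

    queue.put("http://example.com")
    for _ in xrange(4):
        t = threading.Thread(target=worker)
        t.setDaemon(True)
        t.start()
    time.sleep(1)
    print "queue full:", queue.full(), "- all workers are stuck"
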
diff --git a/linkcheck/__init__.py b/linkcheck/__init__.py
index 06330748..c6d9f56a 100644
--- a/linkcheck/__init__.py
+++ b/linkcheck/__init__.py
@@ -58,6 +58,17 @@ class LinkCheckerError (Exception):
pass
+def add_intern_pattern (url_data, config):
+ """
+ Add intern URL regex to config.
+ """
+ pat = url_data.get_intern_pattern()
+ if pat:
+ assert linkcheck.log.debug(LOG_CHECK,
+ "Add intern pattern %r from command line", pat)
+ config['internlinks'].append(get_link_pat(pat))
+
+
def get_link_pat (arg, strict=False):
"""
Get a link pattern matcher for intern/extern links.
diff --git a/linkcheck/cache/__init__.py b/linkcheck/cache/__init__.py
new file mode 100644
index 00000000..318628da
--- /dev/null
+++ b/linkcheck/cache/__init__.py
@@ -0,0 +1,19 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2006 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Store and provide cached data during checking in a thread-safe manner.
+"""
diff --git a/linkcheck/cache/connection.py b/linkcheck/cache/connection.py
new file mode 100644
index 00000000..8231a69d
--- /dev/null
+++ b/linkcheck/cache/connection.py
@@ -0,0 +1,111 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2005-2006 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Store and retrieve open connections.
+"""
+
+import time
+import threading
+from linkcheck.decorators import synchronized
+
+# lock for robots.txt caching
+_lock = threading.Lock()
+
+
+class ConnectionPool (object):
+ """
+ Thread-safe cache, storing a set of connections for URL retrieval.
+ """
+
+ def __init__ (self):
+ """
+ Initialize an empty connection dictionary which will have entries
+ of the form::
+ key -> [connection, status, expiration time]
+
+ Connection can be any open connection object (HTTP, FTP, ...).
+ Status is either 'available' or 'busy'.
+ The expiration time is the point in time, in seconds since the
+ epoch, after which the connection is considered timed out.
+
+ The identifier key is usually a tuple (type, host, user, pass),
+ but it can be any immutable Python object.
+ """
+ # open connections
+ # {(type, host, user, pass) -> [connection, status, expiration time]}
+ self.connections = {}
+
+ @synchronized(_lock)
+ def add (self, key, conn, timeout):
+ """
+ Add connection to the pool with given identifier key and timeout
+ in seconds.
+ """
+ self.connections[key] = [conn, 'available', time.time() + timeout]
+
+ @synchronized(_lock)
+ def get (self, key):
+ """
+ Get an open connection if available, waiting at most 30 seconds
+ for a busy connection to become available.
+
+ @return: Open connection object or None if no connection is available.
+ @rtype: None or FTPConnection or HTTP(S)Connection
+ """
+ if key not in self.connections:
+ # not found
+ return None
+ conn_data = self.connections[key]
+ t = time.time()
+ if t > conn_data[2]:
+ # timed out
+ try:
+ conn_data[0].close()
+ except:
+ # ignore close errors
+ pass
+ del self.connections[key]
+ return None
+ # wait at most 300*0.1=30 seconds for connection to become available
+ for dummy in xrange(300):
+ if conn_data[1] != 'busy':
+ conn_data[1] = 'busy'
+ conn_data[2] = t
+ return conn_data[0]
+ time.sleep(0.1)
+ # connection is in use
+ return None
+
+ @synchronized(_lock)
+ def release (self, key):
+ """
+ Mark an open and reusable connection as available.
+ """
+ if key in self.connections:
+ self.connections[key][1] = 'available'
+
+ @synchronized(_lock)
+ def expire_connections (self):
+ """
+ Remove expired connections from this pool.
+ """
+ t = time.time()
+ to_delete = []
+ for key, conn_data in self.connections.iteritems():
+ if conn_data[1] == 'available' and t > conn_data[2]:
+ to_delete.append(key)
+ for key in to_delete:
+ del self.connections[key]
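
A usage sketch for the pool above, mirroring how ftpurl.py below uses it:
get() marks a cached entry 'busy', and the still-open connection is handed
back with add() when the caller is done. The factory here is hypothetical;
any object with a close() method fits the pool's interface:

    import ftplib

    def make_connection ():
        # hypothetical factory; real code would log in and cd first
        return ftplib.FTP()

    pool = ConnectionPool()
    key = ("ftp", "ftp.example.com", "anonymous", "guest")
    conn = pool.get(key)       # None, or an open connection marked 'busy'
    if conn is None:
        conn = make_connection()
    # ... use the connection ...
    pool.add(key, conn, 300)   # available for reuse, expires in 300s
    pool.expire_connections()  # prune timed-out 'available' entries
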
diff --git a/linkcheck/cache/cookie.py b/linkcheck/cache/cookie.py
new file mode 100644
index 00000000..f5a110bc
--- /dev/null
+++ b/linkcheck/cache/cookie.py
@@ -0,0 +1,73 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2006 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Store and retrieve cookies.
+"""
+import threading
+from linkcheck.decorators import synchronized
+import linkcheck
+import linkcheck.log
+import linkcheck.cookies
+
+# lock for caching
+_lock = threading.Lock()
+
+
+class CookieJar (object):
+ """
+ Cookie storage, implementing the default cookie handling policy for
+ LinkChecker.
+ """
+
+ def __init__ (self):
+ self.cache = {}
+
+ @synchronized(_lock)
+ def add (self, headers, scheme, host, path):
+ """
+ Parse cookie values, add to cache.
+ """
+ jar = set()
+ for h in headers.getallmatchingheaders("Set-Cookie"):
+ # RFC 2109 (Netscape) cookie type
+ try:
+ c = linkcheck.cookies.NetscapeCookie(h, scheme, host, path)
+ jar.add(c)
+ except linkcheck.cookies.CookieError:
+ assert linkcheck.log.debug(linkcheck.LOG_CACHE,
+ "Invalid cookie header for %s:%s%s: %r", scheme, host, path, h)
+ for h in headers.getallmatchingheaders("Set-Cookie2"):
+ # RFC 2965 cookie type
+ try:
+ c = linkcheck.cookies.Rfc2965Cookie(h, scheme, host, path)
+ jar.add(c)
+ except linkcheck.cookies.CookieError:
+ assert linkcheck.log.debug(linkcheck.LOG_CACHE,
+ "Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h)
+ self.cache[host] = jar
+ return jar
+
+ @synchronized(_lock)
+ def get (self, scheme, host, port, path):
+ """
+ Cookie cache getter function.
+ """
+ assert linkcheck.log.debug(linkcheck.LOG_CACHE,
+ "Get cookies for host %r path %r", host, path)
+ jar = self.cache.setdefault(host, set())
+ return [x for x in jar if x.check_expired() and \
+ x.is_valid_for(scheme, host, port, path)]
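
A sketch of feeding response headers into the jar above. The headers argument
needs a getallmatchingheaders() method; rfc822.Message (which httplib
responses use in this code base) provides one. Whether this particular cookie
string parses is up to linkcheck.cookies:

    import StringIO
    import rfc822

    raw = "Set-Cookie: ID=42; path=/\r\n\r\n"
    headers = rfc822.Message(StringIO.StringIO(raw))

    jar = CookieJar()
    jar.add(headers, "http", "example.com", "/")
    # before sending a later request to the same host:
    cookies = jar.get("http", "example.com", 80, "/index.html")
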
diff --git a/linkcheck/checker/geoip.py b/linkcheck/cache/geoip.py
similarity index 91%
rename from linkcheck/checker/geoip.py
rename to linkcheck/cache/geoip.py
index 881cc4cc..73cb7546 100644
--- a/linkcheck/checker/geoip.py
+++ b/linkcheck/cache/geoip.py
@@ -15,23 +15,44 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
-GeoIP wrapper.
+Store and retrieve country names for IPs.
"""
+import os
+import threading
+from linkcheck.decorators import synchronized
-def get_country (gi, host):
+# I don't know if the geoip library is already thread-safe, but
+# we take no risks here.
+_lock = threading.Lock()
+
+# initialize GeoIP database
+geoip = None
+try:
+ import GeoIP
+ geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
+ if os.name == 'posix' and os.path.exists(geoip_dat):
+ geoip = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
+ del geoip_dat
+except ImportError:
+ pass
+
+
+@synchronized(_lock)
+def get_country (host):
"""
Get translated country name.
@return: country string or None
"""
- c = gi.country_code_by_name(host)
+ if geoip is None:
+ return None
+ c = geoip.country_code_by_name(host)
if c and c in countries:
return "%s, %s" % (c, countries[c])
return None
# GeoIP country map with {short name -> translated full name} entries
-
countries = {
"AP": "Asia/Pacific Region",
"EU": "Europe",
diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py
new file mode 100644
index 00000000..ca1f1383
--- /dev/null
+++ b/linkcheck/cache/robots_txt.py
@@ -0,0 +1,52 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2006 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Cache robots.txt contents.
+"""
+import threading
+from linkcheck.decorators import synchronized
+import linkcheck.robotparser2
+import linkcheck.configuration
+
+
+# lock for caching
+_lock = threading.Lock()
+
+
+class RobotsTxt (object):
+ """
+ Thread-safe cache of downloaded robots.txt files.
+ format: {cache key (string) -> robots.txt content (RobotFileParser)}
+ """
+
+ def __init__ (self):
+ self.cache = {}
+
+ @synchronized(_lock)
+ def allows_url (self, roboturl, url, user, password):
+ """
+ Check whether the robots.txt file at roboturl allows fetching of url.
+ """
+ if roboturl not in self.cache:
+ rp = linkcheck.robotparser2.RobotFileParser(
+ user=user, password=password)
+ rp.set_url(roboturl)
+ rp.read()
+ self.cache[roboturl] = rp
+ else:
+ rp = self.cache[roboturl]
+ return rp.can_fetch(linkcheck.configuration.UserAgent, url)
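
A usage sketch for the cache above; the first call for a robots.txt URL
downloads and parses it, later calls reuse the cached RobotFileParser
(example host made up, and note the first call does network I/O):

    rt = RobotsTxt()
    ok = rt.allows_url("http://example.com/robots.txt",
                       "http://example.com/private/page.html",
                       None, None)    # no user/password
    if not ok:
        print "disallowed by robots.txt"
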
diff --git a/linkcheck/cache/urlqueue.py b/linkcheck/cache/urlqueue.py
new file mode 100644
index 00000000..ff7ac6a5
--- /dev/null
+++ b/linkcheck/cache/urlqueue.py
@@ -0,0 +1,210 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2000-2006 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Handle a queue of URLs to check.
+"""
+import threading
+import Queue
+import time
+import linkcheck
+import linkcheck.log
+
+
+class UrlQueue (Queue.Queue):
+ """
+ A queue supporting several consumer tasks. The task_done() idea is
+ from the Python 2.5 Subversion repository.
+ """
+
+ def __init__ (self, maxsize=0):
+ """
+ Initialize the queue state and task counters.
+ """
+ Queue.Queue.__init__(self, maxsize=maxsize)
+ self.all_tasks_done = threading.Condition(self.mutex)
+ self.unfinished_tasks = 0
+ self.finished_tasks = 0
+ self.in_progress = {}
+ self.checked = {}
+ self.shutdown = False
+
+ def get (self):
+ """
+ Get the first URL from the queue whose cache key is not already
+ being checked, and return it. If no such URL is available, return
+ None. The returned URL may already have a cached result.
+ """
+ self.not_empty.acquire()
+ try:
+ while self._empty():
+ self.not_empty.wait()
+ url_data = self._get()
+ key = url_data.cache_url_key
+ if url_data.has_result:
+ # Already checked and copied from cache.
+ pass
+ elif key in self.checked:
+ # Already checked; copy the cached result, even if
+ # the URL also happens to be in progress.
+ url_data.copy_from_cache(self.checked[key])
+ elif key in self.in_progress:
+ # It's being checked currently; put it back in the queue.
+ Queue.Queue._put(self, url_data)
+ url_data = None
+ else:
+ self.in_progress[key] = url_data
+ self.not_full.notify()
+ return url_data
+ finally:
+ self.not_empty.release()
+
+ def _put (self, url_data):
+ """
+ Put URL in queue and increase the number of unfinished tasks.
+ """
+ if self.shutdown:
+ # don't accept more URLs
+ return
+ key = url_data.cache_url_key
+ if key in self.checked:
+ # Put at beginning of queue to get consumed quickly.
+ url_data.copy_from_cache(self.checked[key])
+ self.queue.appendleft(url_data)
+ else:
+ self.queue.append(url_data)
+ self.unfinished_tasks += 1
+
+ def task_done (self, url_data):
+ """
+ Indicate that a formerly enqueued task is complete.
+
+ Used by Queue consumer threads. For each get() used to fetch a task,
+ a subsequent call to task_done() tells the queue that the processing
+ on the task is complete.
+
+ If a join() is currently blocking, it will resume when all items
+ have been processed (meaning that a task_done() call was received
+ for every item that had been put() into the queue).
+
+ Raises a ValueError if called more times than there were items
+ placed in the queue.
+ """
+ self.all_tasks_done.acquire()
+ try:
+ if url_data is not None:
+ key = url_data.cache_url_key
+ if key is not None and key not in self.checked:
+ self._cache_url(key, url_data)
+ self.finished_tasks += 1
+ unfinished = self.unfinished_tasks - 1
+ if unfinished <= 0:
+ if unfinished < 0:
+ raise ValueError('task_done() called too many times')
+ self.all_tasks_done.notifyAll()
+ self.unfinished_tasks = unfinished
+ finally:
+ self.all_tasks_done.release()
+
+ def _cache_url (self, key, url_data):
+ """
+ Put URL result data into cache.
+ """
+ assert linkcheck.log.debug(linkcheck.LOG_CACHE,
+ "Caching %r", key)
+ assert key in self.in_progress, \
+ "%r not in %s" % (key, self.in_progress)
+ del self.in_progress[key]
+ data = url_data.get_cache_data()
+ self.checked[key] = data
+ # check for aliases (e.g. through HTTP redirections)
+ if hasattr(url_data, "aliases"):
+ data = url_data.get_alias_cache_data()
+ for key in url_data.aliases:
+ if key in self.checked or key in self.in_progress:
+ continue
+ assert linkcheck.log.debug(linkcheck.LOG_CACHE,
+ "Caching alias %r", key)
+ self.checked[key] = data
+
+ def join (self, timeout=None):
+ """Blocks until all items in the Queue have been gotten and processed.
+
+ The count of unfinished tasks goes up whenever an item is added to the
+ queue. The count goes down whenever a consumer thread calls task_done()
+ to indicate the item was retrieved and all work on it is complete.
+
+ When the count of unfinished tasks drops to zero, join() unblocks.
+ """
+ self.all_tasks_done.acquire()
+ try:
+ if timeout is None:
+ while self.unfinished_tasks:
+ self.all_tasks_done.wait()
+ else:
+ if timeout < 0:
+ raise ValueError("'timeout' must be a positive number")
+ endtime = time.time() + timeout
+ while self.unfinished_tasks:
+ remaining = endtime - time.time()
+ if remaining <= 0.0:
+ return
+ self.all_tasks_done.wait(remaining)
+ finally:
+ self.all_tasks_done.release()
+
+ def do_shutdown (self):
+ """
+ Shut down the queue: discard all queued URLs and accept no new ones.
+ """
+ self.mutex.acquire()
+ try:
+ unfinished = self.unfinished_tasks - len(self.queue)
+ self.queue.clear()
+ if unfinished <= 0:
+ if unfinished < 0:
+ raise ValueError('inconsistent task count on shutdown')
+ self.all_tasks_done.notifyAll()
+ self.unfinished_tasks = unfinished
+ self.shutdown = True
+ finally:
+ self.mutex.release()
+
+ def status (self):
+ """
+ Get tuple (finished tasks, unfinished tasks, queue size).
+ """
+ self.mutex.acquire()
+ try:
+ return (self.finished_tasks, self.unfinished_tasks, len(self.queue))
+ finally:
+ self.mutex.release()
+
+ def checked_redirect (self, redirect, url_data):
+ """
+ Check if the redirect target is already in the cache, to avoid
+ double-checking of already cached URLs after a redirection.
+ If the redirect URL is found in the cache, its result data is
+ copied into url_data before returning True.
+ """
+ self.mutex.acquire()
+ try:
+ if redirect in self.checked:
+ url_data.copy_from_cache(self.checked[redirect])
+ return True
+ return False
+ finally:
+ self.mutex.release()
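
A worker loop sketch for the queue above (assuming url_data objects offering
has_result and a check() method, which this diff only implies): every
successful get() is paired with exactly one task_done(), so the main thread
can join():

    def worker (urlqueue):
        while True:
            url_data = urlqueue.get()
            if url_data is None:
                # key is being checked by another worker; try again
                continue
            try:
                if not url_data.has_result:
                    url_data.check()
            finally:
                urlqueue.task_done(url_data)

    # main thread, sketched:
    # urlqueue.put(start_url_data)
    # ...start a constant number of daemon worker threads...
    # urlqueue.join()         # returns when all tasks are done
    # urlqueue.do_shutdown()  # discard leftovers, accept no more
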
diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py
index ec84bce2..d32d2a30 100644
--- a/linkcheck/checker/__init__.py
+++ b/linkcheck/checker/__init__.py
@@ -18,19 +18,14 @@
Main functions for link checking.
"""
-import time
-import sys
import os
import cgi
import socket
-import codecs
-import traceback
import select
import re
import urllib
import nntplib
import ftplib
-
import linkcheck.httplib2
import linkcheck.strformat
import linkcheck.dns.exception
@@ -153,110 +148,6 @@ acap # application configuration access protocol
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
-_encoding = linkcheck.i18n.default_encoding
-stderr = codecs.getwriter(_encoding)(sys.stderr, errors="ignore")
-
-def internal_error ():
- """
- Print internal error message to stderr.
- """
- print >> stderr, os.linesep
- print >> stderr, _("""********** Oops, I did it again. *************
-
-You have found an internal error in LinkChecker. Please write a bug report
-at http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913
-or send mail to %s and include the following information:
-- the URL or file you are testing
-- your commandline arguments and/or configuration.
-- the output of a debug run with option "-Dall" of the executed command
-- the system information below.
-
-Disclosing some of the information above due to privacy reasons is ok.
-I will try to help you nonetheless, but you have to give me something
-I can work with ;) .
-""") % linkcheck.configuration.Email
- etype, value = sys.exc_info()[:2]
- print >> stderr, etype, value
- traceback.print_exc()
- print_app_info()
- print >> stderr, os.linesep, \
- _("******** LinkChecker internal error, over and out ********")
- sys.exit(1)
-
-
-def print_app_info ():
- """
- Print system and application info to stderr.
- """
- print >> stderr, _("System info:")
- print >> stderr, linkcheck.configuration.App
- print >> stderr, _("Python %s on %s") % (sys.version, sys.platform)
- for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"):
- value = os.getenv(key)
- if value is not None:
- print >> stderr, key, "=", repr(value)
-
-
-def check_urls (consumer):
- """
- Main check function; checks all configured URLs until interrupted
- with Ctrl-C. If you call this function more than once, you can specify
- different configurations with the consumer parameter.
-
- @param consumer: an object where all runtime-dependent options are
- stored
- @type consumer: linkcheck.consumer.Consumer
- @return: None
- """
- try:
- _check_urls(consumer)
- except (KeyboardInterrupt, SystemExit):
- consumer.abort()
- except:
- consumer.abort()
- internal_error()
-
-
-def _check_urls (consumer):
- """
- Checks all configured URLs. Prints status information, calls logger
- methods.
-
- @param consumer: an object where all runtime-dependent options are
- stored
- @type consumer: linkcheck.consumer.Consumer
- @return: None
- """
- start_time = time.time()
- status_time = start_time
- while not consumer.finished():
- url_data = consumer.incoming_get_url()
- if url_data is None:
- # wait for incoming queue to fill
- time.sleep(0.1)
- elif url_data.cached:
- # was cached -> can be logged
- consumer.log_url(url_data)
- else:
- # go check this url
- if url_data.parent_url and not \
- linkcheck.url.url_is_absolute(url_data.base_url):
- name = url_data.parent_url
- else:
- name = u""
- if url_data.base_url:
- name += url_data.base_url
- if not name:
- name = None
- consumer.check_url(url_data, name)
- if consumer.config('status'):
- curtime = time.time()
- if (curtime - status_time) > 5:
- consumer.print_status(curtime, start_time)
- status_time = curtime
- consumer.end_log_output()
-
-
# file extensions we can parse recursively
extensions = {
"html": re.compile(r'(?i)\.s?html?$'),
@@ -298,9 +189,9 @@ def absolute_url (base_url, base_ref, parent_url):
return u""
-def get_url_from (base_url, recursion_level, consumer,
+def get_url_from (base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=0, column=0,
- name=u"", cmdline=False):
+ name=u"", assume_local=False):
"""
Get url data from given base data.
@@ -308,8 +199,8 @@ def get_url_from (base_url, recursion_level, consumer,
@type base_url: string or None
@param recursion_level: current recursion level
@type recursion_level: number
- @param consumer: consumer object
- @type consumer: linkcheck.checker.consumer.Consumer
+ @param aggregate: aggregate object
+ @type aggregate: aggregate object as returned by linkcheck.director.get_aggregate()
@param parent_url: parent url
@type parent_url: string or None
    @param base_ref: base url from <base> tag
@@ -329,7 +220,14 @@ def get_url_from (base_url, recursion_level, consumer,
base_ref = linkcheck.strformat.unicode_safe(base_ref)
name = linkcheck.strformat.unicode_safe(name)
url = absolute_url(base_url, base_ref, parent_url).lower()
- # test scheme
+ klass = get_urlclass_from(url, assume_local)
+ return klass(base_url, recursion_level, aggregate,
+ parent_url=parent_url, base_ref=base_ref,
+ line=line, column=column, name=name)
+
+
+def get_urlclass_from (url, assume_local):
+ """Return checker class for given URL."""
if url.startswith("http:"):
klass = linkcheck.checker.httpurl.HttpUrl
elif url.startswith("ftp:"):
@@ -351,24 +249,13 @@ def get_url_from (base_url, recursion_level, consumer,
elif ignored_schemes_re.search(url):
# ignored url
klass = linkcheck.checker.ignoredurl.IgnoredUrl
- elif cmdline:
- # assume local file on command line
+ elif assume_local:
+ # assume local file
klass = linkcheck.checker.fileurl.FileUrl
else:
# error url, no further checking, just log this
klass = linkcheck.checker.errorurl.ErrorUrl
- url_data = klass(base_url, recursion_level, consumer,
- parent_url=parent_url, base_ref=base_ref,
- line=line, column=column, name=name)
- if cmdline:
- # add intern URL regex to config for every URL that was given
- # on the command line
- pat = url_data.get_intern_pattern()
- assert linkcheck.log.debug(linkcheck.LOG_CMDLINE,
- "Add intern pattern %r from command line", pat)
- if pat:
- consumer.config_append('internlinks', linkcheck.get_link_pat(pat))
- return url_data
+ return klass
def get_index_html (urls):
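
The scheme dispatch above replaces the old inline if/elif chain; a couple of
illustrative calls (expected return values follow from the code above, with
the second call showing the command-line case):

    klass = get_urlclass_from(u"ftp://ftp.example.com/", False)
    # -> linkcheck.checker.ftpurl.FtpUrl
    klass = get_urlclass_from(u"readme.txt", True)
    # -> linkcheck.checker.fileurl.FileUrl: with assume_local=True,
    #    a bare name without a known scheme is treated as a local file
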
diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py
index 22006618..53a0e218 100644
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@@ -23,11 +23,13 @@ import os
import time
import urlparse
import urllib
+import urllib2
import urlbase
import linkcheck
import linkcheck.log
import linkcheck.checker
+import linkcheck.fileutil
# if file extension lookup was unsuccessful, look at the content
contents = {
@@ -83,7 +85,7 @@ class FileUrl (urlbase.UrlBase):
"""
def init (self, base_ref, base_url, parent_url, recursion_level,
- consumer, line, column, name):
+ aggregate, line, column, name):
"""
Besides the usual initialization the URL is normed according
to the platform:
@@ -91,7 +93,7 @@ class FileUrl (urlbase.UrlBase):
- under Windows platform the drive specifier is normed
"""
super(FileUrl, self).init(base_ref, base_url, parent_url,
- recursion_level, consumer, line, column, name)
+ recursion_level, aggregate, line, column, name)
if self.base_url is None:
return
base_url = self.base_url
@@ -129,7 +131,8 @@ class FileUrl (urlbase.UrlBase):
if self.is_directory():
self.set_result(_("directory"))
else:
- super(FileUrl, self).check_connection()
+ url = linkcheck.fileutil.pathencode(self.url)
+ self.url_connection = urllib2.urlopen(url)
self.check_case_sensitivity()
def check_case_sensitivity (self):
@@ -147,7 +150,6 @@ class FileUrl (urlbase.UrlBase):
"system path %r. You should always use "
"the system path in URLs.") % (path, realpath),
tag="file-system-path")
- pass
def get_content (self):
"""
@@ -208,7 +210,7 @@ class FileUrl (urlbase.UrlBase):
path = self.urlparts[2]
if os.name == 'nt':
path = prepare_urlpath_for_nt(path)
- return urllib.url2pathname(path)
+ return linkcheck.fileutil.pathencode(urllib.url2pathname(path))
def is_directory (self):
"""
diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py
index a571a746..63f75964 100644
--- a/linkcheck/checker/ftpurl.py
+++ b/linkcheck/checker/ftpurl.py
@@ -53,12 +53,12 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
order: login, changing directory, list the file.
"""
# proxy support (we support only http)
- self.set_proxy(self.consumer.config("proxy").get(self.scheme))
+ self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
if self.proxy:
# using a (HTTP) proxy
http = httpurl.HttpUrl(self.base_url,
self.recursion_level,
- self.consumer,
+ self.aggregate,
parent_url=self.parent_url,
base_ref=self.base_ref,
line=self.line,
@@ -87,7 +87,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# ready to connect
_user, _password = self.get_user_password()
key = ("ftp", self.urlparts[1], _user, _password)
- conn = self.consumer.get_connection(key)
+ conn = self.aggregate.connections.get(key)
if conn is not None and conn.sock is not None:
# reuse cached FTP connection
self.url_connection = conn
@@ -248,6 +248,6 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# add to cached connections
_user, _password = self.get_user_password()
key = ("ftp", self.urlparts[1], _user, _password)
- cache_add = self.consumer.add_connection
+ cache_add = self.aggregate.connections.add
cache_add(key, self.url_connection, DEFAULT_TIMEOUT_SECS)
self.url_connection = None
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 159ef2c4..58a5619d 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -129,8 +129,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
roboturl = self.get_robots_txt_url()
user, password = self.get_user_password()
- return self.consumer.robots_txt_allows_url(roboturl, url,
- user, password)
+ return self.aggregate.robots_txt.allows_url(roboturl, url,
+ user, password)
def check_connection (self):
"""
@@ -150,7 +150,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
valid request
"""
# set the proxy, so a 407 status after this is an error
- self.set_proxy(self.consumer.config("proxy").get(self.scheme))
+ self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
# initialize check data
self.headers = None
self.auth = None
@@ -360,19 +360,19 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
tag="http-moved-permanent")
self.has301status = True
# check cache again on the changed URL
- if self.consumer.checked_redirect(redirected, self):
+ if self.aggregate.urlqueue.checked_redirect(redirected, self):
return -1, response
# in case of changed scheme make new URL object
if self.urlparts[0] != self.scheme:
newobj = linkcheck.checker.get_url_from(
- redirected, self.recursion_level, self.consumer,
+ redirected, self.recursion_level, self.aggregate,
parent_url=self.parent_url, base_ref=self.base_ref,
line=self.line, column=self.column, name=self.name,
- cmdline=False)
+ assume_local=False)
newobj.warnings = self.warnings
newobj.info = self.info
# append new object to queue
- self.consumer.append_url(newobj)
+ self.aggregate.append_url(newobj)
# pretend to be finished and logged
return -1, response
# new response data
@@ -406,14 +406,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
linkcheck.strformat.unicode_safe(response.reason),
tag="http-empty-content")
# store cookies for valid links
- if self.consumer.config('cookies'):
+ if self.aggregate.config['cookies']:
for c in self.cookies:
self.add_info(_("Store cookie: %s.") % c)
try:
- out = self.consumer.store_cookies(self.headers,
- self.urlparts[0],
- self.urlparts[1],
- self.urlparts[2])
+ out = self.aggregate.cookies.add(self.headers,
+ self.urlparts[0],
+ self.urlparts[1],
+ self.urlparts[2])
for h in out:
self.add_info(linkcheck.strformat.unicode_safe(h))
except Cookie.CookieError, msg:
@@ -471,13 +471,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
linkcheck.configuration.UserAgent)
self.url_connection.putheader("Accept-Encoding",
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
- if self.consumer.config('cookies'):
+ if self.aggregate.config['cookies']:
scheme = self.urlparts[0]
host = self.urlparts[1]
port = linkcheck.url.default_ports.get(scheme, 80)
host, port = urllib.splitnport(host, port)
path = self.urlparts[2]
- self.cookies = self.consumer.get_cookies(scheme, host, port, path)
+ self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
for c in self.cookies:
name = c.client_header_name()
value = c.client_header_value()
@@ -505,7 +505,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
_user, _password = self.get_user_password()
key = (scheme, self.urlparts[1], _user, _password)
- conn = self.consumer.get_connection(key)
+ conn = self.aggregate.connections.get(key)
if conn is not None:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"reuse cached HTTP(S) connection %s", conn)
@@ -634,7 +634,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# add to cached connections
_user, _password = self.get_user_password()
key = ("http", self.urlparts[1], _user, _password)
- cache_add = self.consumer.add_connection
+ cache_add = self.aggregate.connections.add
# note: only cache the connection when it is persistent
# and all pending content has been received
if not self.persistent or not self.has_content or \
diff --git a/linkcheck/checker/nntpurl.py b/linkcheck/checker/nntpurl.py
index 81673f69..0417ac1f 100644
--- a/linkcheck/checker/nntpurl.py
+++ b/linkcheck/checker/nntpurl.py
@@ -40,7 +40,7 @@ class NntpUrl (urlbase.UrlBase):
Connect to NNTP server and try to request the URL article
resource (if specified).
"""
- nntpserver = self.host or self.consumer.config("nntpserver")
+ nntpserver = self.host or self.aggregate.config["nntpserver"]
if not nntpserver:
self.add_warning(
_("No NNTP server was specified, skipping this URL."),
diff --git a/linkcheck/checker/proxysupport.py b/linkcheck/checker/proxysupport.py
index a9c1d907..edcd2ae4 100644
--- a/linkcheck/checker/proxysupport.py
+++ b/linkcheck/checker/proxysupport.py
@@ -63,7 +63,7 @@ class ProxySupport (object):
"""
Check if self.host is in the no-proxy-for ignore list.
"""
- for ro in self.consumer.config("noproxyfor"):
+ for ro in self.aggregate.config["noproxyfor"]:
if ro.search(self.host):
return True
return False
diff --git a/linkcheck/checker/telneturl.py b/linkcheck/checker/telneturl.py
index b14e1ac5..b9d96cbc 100644
--- a/linkcheck/checker/telneturl.py
+++ b/linkcheck/checker/telneturl.py
@@ -59,7 +59,7 @@ class TelnetUrl (urlbase.UrlBase):
label is "login: ", expected password label is "Password: ".
"""
self.url_connection = telnetlib.Telnet()
- if self.consumer.config("debug"):
+ if self.aggregate.config["debug"]:
self.url_connection.set_debuglevel(1)
self.url_connection.open(self.host, self.port)
if self.user:
diff --git a/linkcheck/checker/tests/__init__.py b/linkcheck/checker/tests/__init__.py
index edc140e9..cf6efd8e 100644
--- a/linkcheck/checker/tests/__init__.py
+++ b/linkcheck/checker/tests/__init__.py
@@ -25,8 +25,6 @@ import unittest
import linkcheck
import linkcheck.checker
-import linkcheck.checker.cache
-import linkcheck.checker.consumer
import linkcheck.configuration
import linkcheck.logger
@@ -93,7 +91,17 @@ class TestLogger (linkcheck.logger.Logger):
self.diff.append(line)
-def get_test_consumer (confargs, logargs):
+def get_file (filename=None):
+ """
+ Get file name located within 'data' directory.
+ """
+ directory = os.path.join("linkcheck", "checker", "tests", "data")
+ if filename:
+ return unicode(os.path.join(directory, filename))
+ return unicode(directory)
+
+
+def get_test_aggregate (confargs, logargs):
"""
Initialize a test configuration object.
"""
@@ -101,14 +109,15 @@ def get_test_consumer (confargs, logargs):
config.logger_add('test', TestLogger)
config['recursionlevel'] = 1
config['logger'] = config.logger_new('test', **logargs)
+ # uncomment for debugging
+ #config.init_logging(debug=["all"])
config["anchors"] = True
config["verbose"] = True
config['threads'] = 0
+ config['status'] = False
config['cookies'] = True
- config['geoip'] = None
config.update(confargs)
- cache = linkcheck.checker.cache.Cache()
- return linkcheck.checker.consumer.Consumer(config, cache)
+ return linkcheck.director.get_aggregate(config)
class LinkCheckTest (unittest.TestCase):
@@ -122,21 +131,14 @@ class LinkCheckTest (unittest.TestCase):
"""
return linkcheck.url.url_norm(url)[0]
- def get_file (self, filename):
- """
- Get file name located within 'data' directory.
- """
- return unicode(os.path.join("linkcheck", "checker", "tests",
- "data", filename))
-
def get_resultlines (self, filename):
"""
Return contents of file, as list of lines without line endings,
ignoring empty lines and lines starting with a hash sign (#).
"""
- resultfile = self.get_file(filename+".result")
+ resultfile = get_file(filename+".result")
d = {'curdir': os.getcwd(),
- 'datadir': 'linkcheck/checker/tests/data',
+ 'datadir': get_file(),
}
f = codecs.open(resultfile, "r", "iso-8859-15")
resultlines = [line.rstrip('\r\n') % d for line in f \
@@ -144,27 +146,30 @@ class LinkCheckTest (unittest.TestCase):
f.close()
return resultlines
- def file_test (self, filename, confargs=None, cmdline=True):
+ def file_test (self, filename, confargs=None, assume_local=True):
"""
Check with expected result in .result.
"""
- url = self.get_file(filename)
+ url = get_file(filename)
if confargs is None:
confargs = {}
logargs = {'expected': self.get_resultlines(filename)}
- consumer = get_test_consumer(confargs, logargs)
+ aggregate = get_test_aggregate(confargs, logargs)
url_data = linkcheck.checker.get_url_from(
- url, 0, consumer, cmdline=cmdline)
- consumer.append_url(url_data)
- linkcheck.checker.check_urls(consumer)
- if consumer.config('logger').diff:
+ url, 0, aggregate, assume_local=assume_local)
+ if assume_local:
+ linkcheck.add_intern_pattern(url_data, aggregate.config)
+ aggregate.urlqueue.put(url_data)
+ linkcheck.director.check_urls(aggregate)
+ diff = aggregate.config['logger'].diff
+ if diff:
sep = unicode(os.linesep)
- l = [url] + consumer.config('logger').diff
+ l = [url] + diff
l = sep.join(l)
self.fail(l.encode("iso8859-1", "ignore"))
def direct (self, url, resultlines, fields=None, recursionlevel=0,
- confargs=None, cmdline=False):
+ confargs=None, assume_local=False):
"""
Check url with expected result.
"""
@@ -176,14 +181,17 @@ class LinkCheckTest (unittest.TestCase):
logargs = {'expected': resultlines}
if fields is not None:
logargs['fields'] = fields
- consumer = get_test_consumer(confargs, logargs)
+ aggregate = get_test_aggregate(confargs, logargs)
url_data = linkcheck.checker.get_url_from(
- url, 0, consumer, cmdline=cmdline)
- consumer.append_url(url_data)
- linkcheck.checker.check_urls(consumer)
- if consumer.config('logger').diff:
+ url, 0, aggregate, assume_local=assume_local)
+ if assume_local:
+ linkcheck.add_intern_pattern(url_data, aggregate.config)
+ aggregate.urlqueue.put(url_data)
+ linkcheck.director.check_urls(aggregate)
+ diff = aggregate.config['logger'].diff
+ if diff:
sep = unicode(os.linesep)
l = [u"Differences found testing %s" % url]
- l.extend(x.rstrip() for x in consumer.config('logger').diff[2:])
+ l.extend(x.rstrip() for x in diff[2:])
self.fail(sep.join(l).encode("iso8859-1", "ignore"))
diff --git a/linkcheck/checker/tests/data/misc.html.result b/linkcheck/checker/tests/data/misc.html.result
index 12ca5b72..3c14e439 100644
--- a/linkcheck/checker/tests/data/misc.html.result
+++ b/linkcheck/checker/tests/data/misc.html.result
@@ -1,7 +1,3 @@
-url
-cache key None
-real url None
-error
url file://%(curdir)s/%(datadir)s/misc.html
cache key file://%(curdir)s/%(datadir)s/misc.html
real url file://%(curdir)s/%(datadir)s/misc.html
@@ -21,3 +17,8 @@ url favicon.ico (cached)
cache key file://%(curdir)s/%(datadir)s/favicon.ico
real url file://%(curdir)s/%(datadir)s/favicon.ico
valid
+
+url
+cache key None
+real url None
+error
diff --git a/linkcheck/checker/tests/test_http.py b/linkcheck/checker/tests/test_http.py
index 1ac0ad44..aa2662f8 100644
--- a/linkcheck/checker/tests/test_http.py
+++ b/linkcheck/checker/tests/test_http.py
@@ -36,8 +36,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
url = u"http://localhost:%d/linkcheck/checker/tests/data/" \
u"http.html" % self.port
resultlines = self.get_resultlines("http.html")
- self.direct(url, resultlines, recursionlevel=1, cmdline=True)
- self.redirect_http_test()
+ self.direct(url, resultlines, recursionlevel=1, assume_local=True)
+ self.redirect1_http_test()
+ self.redirect2_http_test()
self.noproxyfor_test()
finally:
self.stop_server()
@@ -64,9 +65,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
u"original URL was u'http://localhost:%d/redirect1'." % self.port,
u"valid",
]
- self.direct(url, resultlines, recursionlevel=0, cmdline=True)
+ self.direct(url, resultlines, recursionlevel=0, assume_local=True)
- def redirect_http_test (self):
+ def redirect1_http_test (self):
url = u"http://localhost:%d/redirect1" % self.port
nurl = url
rurl = url.replace("redirect", "newurl")
@@ -77,7 +78,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
u"info Redirected to %s." % rurl,
u"error",
]
- self.direct(url, resultlines, recursionlevel=0, cmdline=True)
+ self.direct(url, resultlines, recursionlevel=0, assume_local=True)
+
+ def redirect2_http_test (self):
url = u"http://localhost:%d/linkcheck/checker/tests/data/redirect.html" % \
self.port
nurl = url
@@ -94,7 +97,7 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
u"name Recursive Redirect",
u"valid",
]
- self.direct(url, resultlines, recursionlevel=99, cmdline=True)
+ self.direct(url, resultlines, recursionlevel=99, assume_local=True)
def noproxyfor_test (self):
"""
@@ -113,7 +116,7 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
u"valid",
]
self.direct(url, resultlines, recursionlevel=0,
- confargs=confargs, cmdline=True)
+ confargs=confargs, assume_local=True)
del os.environ["http_proxy"]
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index af092ed2..a1a4940a 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -32,6 +32,8 @@ import traceback
import linkcheck
import linkcheck.linkparse
import linkcheck.checker
+import errno
+import linkcheck.director
import linkcheck.strformat
import linkcheck.containers
import linkcheck.log
@@ -55,7 +56,7 @@ class UrlBase (object):
An URL with additional information like validity etc.
"""
- def __init__ (self, base_url, recursion_level, consumer,
+ def __init__ (self, base_url, recursion_level, aggregate,
parent_url = None, base_ref = None,
line = -1, column = -1, name = u""):
"""
@@ -63,7 +64,7 @@ class UrlBase (object):
@param base_url: unquoted and possibly unnormed url
@param recursion_level: on what check level lies the base url
- @param consumer: consumer instance
+ @param aggregate: aggregate instance
@param parent_url: quoted and normed url of parent or None
    @param base_ref: quoted and normed url of <base href=""> or None
@param line: line number of url in parent content
@@ -71,13 +72,13 @@ class UrlBase (object):
@param name: name of url or empty
"""
self.init(base_ref, base_url, parent_url, recursion_level,
- consumer, line, column, name)
+ aggregate, line, column, name)
self.reset()
self.check_syntax()
def init (self, base_ref, base_url, parent_url, recursion_level,
- consumer, line, column, name):
+ aggregate, line, column, name):
"""
Initialize internal data.
"""
@@ -86,7 +87,7 @@ class UrlBase (object):
self.base_url = base_url
self.parent_url = parent_url
self.recursion_level = recursion_level
- self.consumer = consumer
+ self.aggregate = aggregate
self.line = line
self.column = column
self.name = name
@@ -203,6 +204,7 @@ class UrlBase (object):
Fill attributes from cache data.
"""
self.result = cache_data["result"]
+ self.has_result = True
self.warnings.extend(cache_data["warnings"])
self.info.extend(cache_data["info"])
self.valid = cache_data["valid"]
@@ -240,8 +242,8 @@ class UrlBase (object):
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Content cache key %r", self.cache_content_key)
# construct cache key
- if self.consumer.config("anchorcaching") and \
- self.consumer.config("anchors"):
+ if self.aggregate.config["anchorcaching"] and \
+ self.aggregate.config["anchors"]:
# do not ignore anchor
parts = self.urlparts[:]
parts[4] = self.anchor
@@ -343,32 +345,28 @@ class UrlBase (object):
"""
Main check function for checking this URL.
"""
- if self.consumer.config("trace"):
+ if self.aggregate.config["trace"]:
linkcheck.trace.trace_on()
try:
self.local_check()
- self.consumer.checked(self)
except (socket.error, select.error):
- self.consumer.interrupted(self)
# on Unix, ctrl-c can raise
# error: (4, 'Interrupted system call')
etype, value = sys.exc_info()[:2]
- if etype == 4:
+ if value[0] == errno.EINTR:
raise KeyboardInterrupt(value)
else:
raise
except KeyboardInterrupt:
- self.consumer.interrupted(self)
raise
except:
- self.consumer.interrupted(self)
- linkcheck.checker.internal_error()
+ linkcheck.director.internal_error()
def add_country_info (self):
"""
Try to ask GeoIP database for country info.
"""
- country = self.consumer.get_country_name(self.host)
+ country = linkcheck.cache.geoip.get_country(self.host)
if country is not None:
self.add_info(_("URL is located in %s.") % _(country))
@@ -377,10 +375,11 @@ class UrlBase (object):
Local check function can be overridden in subclasses.
"""
assert linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
- if self.recursion_level and self.consumer.config('wait'):
+ wait = self.aggregate.config['wait']
+ if self.recursion_level and wait:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
- "sleeping for %d seconds", self.consumer.config('wait'))
- time.sleep(self.consumer.config('wait'))
+ "sleeping for %d seconds", wait)
+ time.sleep(wait)
t = time.time()
self.set_extern(self.url)
if self.extern[0] and self.extern[1]:
@@ -392,7 +391,7 @@ class UrlBase (object):
try:
self.check_connection()
self.add_country_info()
- if self.consumer.config("anchors"):
+ if self.aggregate.config["anchors"]:
self.check_anchors()
except tuple(linkcheck.checker.ExcList):
value = self.handle_exception()
@@ -406,7 +405,7 @@ class UrlBase (object):
valid=False)
# check content
- warningregex = self.consumer.config("warningregex")
+ warningregex = self.aggregate.config["warningregex"]
if warningregex and self.valid:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"checking content")
@@ -486,8 +485,8 @@ class UrlBase (object):
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"... no, cannot get content.")
return False
- if self.consumer.config("recursionlevel") >= 0 and \
- self.recursion_level >= self.consumer.config("recursionlevel"):
+ rec_level = self.aggregate.config["recursionlevel"]
+ if rec_level >= 0 and self.recursion_level >= rec_level:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"... no, maximum recursion level reached.")
return False
@@ -551,7 +550,7 @@ class UrlBase (object):
@return: None
"""
- for entry in self.consumer.config("externlinks"):
+ for entry in self.aggregate.config["externlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
@@ -559,7 +558,7 @@ class UrlBase (object):
"Extern URL %r", url)
self.extern = (1, entry['strict'])
return
- for entry in self.consumer.config("internlinks"):
+ for entry in self.aggregate.config["internlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
@@ -607,7 +606,7 @@ class UrlBase (object):
If a maximum size was given, call this function to check it
against the content size of this url.
"""
- maxbytes = self.consumer.config("warnsizebytes")
+ maxbytes = self.aggregate.config["warnsizebytes"]
if maxbytes is not None and self.dlsize >= maxbytes:
self.add_warning(_("Content size %s is larger than %s.") %
(linkcheck.strformat.strsize(self.dlsize),
@@ -626,7 +625,7 @@ class UrlBase (object):
Get tuple (user, password) from configured authentication.
Both user and password can be None if not specified.
"""
- for auth in self.consumer.config("authentication"):
+ for auth in self.aggregate.config["authentication"]:
if auth['pattern'].match(self.url):
return auth['user'], auth['password']
return None, None
@@ -651,10 +650,10 @@ class UrlBase (object):
else:
base_ref = h.base_ref
url_data = linkcheck.checker.get_url_from(url,
- self.recursion_level+1, self.consumer, parent_url=self.url,
+ self.recursion_level+1, self.aggregate, parent_url=self.url,
base_ref=base_ref, line=line, column=column, name=name,
- cmdline=False)
- self.consumer.append_url(url_data)
+ assume_local=False)
+ self.aggregate.urlqueue.put(url_data)
def parse_opera (self):
"""
@@ -674,10 +673,10 @@ class UrlBase (object):
url = line[4:]
if url:
url_data = linkcheck.checker.get_url_from(url,
- self.recursion_level+1, self.consumer,
+ self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, name=name,
- cmdline=False)
- self.consumer.append_url(url_data)
+ assume_local=False)
+ self.aggregate.urlqueue.put(url_data)
name = ""
def parse_text (self):
@@ -694,10 +693,10 @@ class UrlBase (object):
if not line or line.startswith('#'):
continue
url_data = linkcheck.checker.get_url_from(line,
- self.recursion_level+1, self.consumer,
+ self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno,
- cmdline=False)
- self.consumer.append_url(url_data)
+ assume_local=False)
+ self.aggregate.urlqueue.put(url_data)
def parse_css (self):
"""
@@ -712,10 +711,10 @@ class UrlBase (object):
column = mo.start("url")
url = linkcheck.strformat.unquote(mo.group("url").strip())
url_data = linkcheck.checker.get_url_from(url,
- self.recursion_level+1, self.consumer,
+ self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, column=column,
- cmdline=False)
- self.consumer.append_url(url_data)
+ assume_local=False)
+ self.aggregate.urlqueue.put(url_data)
def serialized (self):
"""
@@ -758,7 +757,7 @@ class UrlBase (object):
@rtype: string
"""
s = self.serialized()
- return self.consumer.config('logger').encode(s)
+ return self.aggregate.config['logger'].encode(s)
def __repr__ (self):
"""
diff --git a/linkcheck/configuration/__init__.py b/linkcheck/configuration/__init__.py
index 7056305b..d20fab46 100644
--- a/linkcheck/configuration/__init__.py
+++ b/linkcheck/configuration/__init__.py
@@ -28,11 +28,6 @@ import linkcheck
import linkcheck.log
import linkcheck.containers
import confparse
-try:
- import GeoIP
- _has_geoip = True
-except ImportError:
- _has_geoip = False
Version = _linkchecker_configdata.version
AppName = u"LinkChecker"
@@ -83,6 +78,7 @@ class Configuration (dict):
self["internlinks"] = []
self["noproxyfor"] = []
self["interactive"] = False
+ self["maxqueuesize"] = 0
    # on ftp, password is set by Python's ftplib
self["authentication"] = []
self["proxy"] = urllib.getproxies()
@@ -149,18 +145,6 @@ class Configuration (dict):
self["warnsizebytes"] = None
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
self["threads"] = 10
- self.init_geoip()
-
- def init_geoip (self):
- """
- If GeoIP.dat file is found, initialize a standard geoip DB and
- store it in self["geoip"]; else this value will be None.
- """
- geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
- if _has_geoip and os.path.exists(geoip_dat):
- self["geoip"] = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
- else:
- self["geoip"] = None
def init_logging (self, debug=None):
"""
diff --git a/linkcheck/configuration/tests/test_config.py b/linkcheck/configuration/tests/test_config.py
index 864d7a95..ef087527 100644
--- a/linkcheck/configuration/tests/test_config.py
+++ b/linkcheck/configuration/tests/test_config.py
@@ -23,12 +23,14 @@ import os
import linkcheck.configuration
-def get_file (filename):
+def get_file (filename=None):
"""
Get file name located within 'data' directory.
"""
- return unicode(os.path.join("linkcheck", "configuration", "tests",
- "data", filename))
+ directory = os.path.join("linkcheck", "configuration", "tests", "data")
+ if filename:
+ return unicode(os.path.join(directory, filename))
+ return unicode(directory)
class TestConfig (unittest.TestCase):
diff --git a/linkcheck/cookies.py b/linkcheck/cookies.py
index d41636ab..3e8b784b 100644
--- a/linkcheck/cookies.py
+++ b/linkcheck/cookies.py
@@ -297,40 +297,3 @@ class Rfc2965Cookie (HttpCookie):
# XXX more methods (equality test)
-
-class CookieJar (set):
- """
- Cookie storage, implementing the default cookie handling policy for
- LinkChecker.
- """
-
- def add_cookies (self, headers, scheme, host, path):
- """
- Parse cookie values, add to jar.
- """
- to_add = set()
- for h in headers.getallmatchingheaders("Set-Cookie"):
- # RFC 2109 (Netscape) cookie type
- try:
- to_add.add(NetscapeCookie(h, scheme, host, path))
- except CookieError:
- assert linkcheck.log.debug(linkcheck.LOG_CACHE,
- "Invalid cookie header for %s:%s%s: %r", scheme, host, path, h)
- for h in headers.getallmatchingheaders("Set-Cookie2"):
- # RFC 2965 cookie type
- try:
- to_add.add(Rfc2965Cookie(h, scheme, host, path))
- except CookieError:
- assert linkcheck.log.debug(linkcheck.LOG_CACHE,
- "Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h)
- for x in to_add:
- self.add(x)
- return to_add
-
- def remove_expired (self):
- """
- Remove expired cookies from jar.
- """
- to_remove = [x for x in self if not x.check_expired()]
- return self.difference_update(to_remove)
-
diff --git a/linkcheck/lc_cgi.py b/linkcheck/lc_cgi.py
index 29d7cc90..04c6cea2 100644
--- a/linkcheck/lc_cgi.py
+++ b/linkcheck/lc_cgi.py
@@ -31,8 +31,7 @@ import linkcheck.url
import linkcheck.i18n
import linkcheck.strformat
import linkcheck.checker
-import linkcheck.checker.cache
-import linkcheck.checker.consumer
+import linkcheck.director
_logfile = None
_supported_langs = ('de', 'fr', 'nl', 'C')
@@ -99,13 +98,16 @@ def checklink (out=sys.stdout, form=None, env=os.environ):
config["externlinks"].append(
linkcheck.get_link_pat("^%s$" % linkcheck.url.safe_url_pattern))
config["externlinks"].append(linkcheck.get_link_pat(".*", strict=True))
+ # start checking
+ aggregate = linkcheck.director.get_aggregate(config)
+
-    cache = linkcheck.checker.cache.Cache()
-    consumer = linkcheck.checker.consumer.Consumer(config, cache)
- # start checking
url = form["url"].value
- url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=False)
- consumer.append_url(url_data)
- linkcheck.checker.check_urls(consumer)
+ url_data = linkcheck.checker.get_url_from(url, 0, aggregate,
+ assume_local=False)
+ aggregate.urlqueue.put(url_data)
+ linkcheck.director.check_urls(aggregate)
def get_host_name (form):
diff --git a/linkcheck/tests/test_urlbuild.py b/linkcheck/tests/test_urlbuild.py
index b9ffb1a4..6f59d085 100644
--- a/linkcheck/tests/test_urlbuild.py
+++ b/linkcheck/tests/test_urlbuild.py
@@ -20,18 +20,16 @@ Test url build method from url data objects.
import unittest
import linkcheck.configuration
+import linkcheck.director
import linkcheck.checker.httpurl
-import linkcheck.checker.cache
-import linkcheck.checker.consumer
-def get_test_consumer ():
+def get_test_aggregate ():
"""
Initialize a test configuration object.
"""
config = linkcheck.configuration.Configuration()
config['logger'] = config.logger_new('none')
- cache = linkcheck.checker.cache.Cache()
- return linkcheck.checker.consumer.Consumer(config, cache)
+ return linkcheck.director.get_aggregate(config)
class TestUrlBuild (unittest.TestCase):
@@ -43,9 +41,9 @@ class TestUrlBuild (unittest.TestCase):
parent_url = "http://localhost:8001/linkcheck/checker/tests/data/http.html"
base_url = "http://"
recursion_level = 0
- consumer = get_test_consumer()
+ aggregate = get_test_aggregate()
o = linkcheck.checker.httpurl.HttpUrl(base_url, recursion_level,
- consumer, parent_url=parent_url)
+ aggregate, parent_url=parent_url)
o.build_url()
self.assertEquals(o.url, 'http://')
diff --git a/linkchecker b/linkchecker
index d4976d45..8d35683f 100755
--- a/linkchecker
+++ b/linkchecker
@@ -38,8 +38,7 @@ optparse._ = _
import linkcheck.log
import linkcheck.i18n
import linkcheck.checker
-import linkcheck.checker.cache
-import linkcheck.checker.consumer
+import linkcheck.director
import linkcheck.configuration
import linkcheck.fileutil
import linkcheck.strformat
@@ -654,14 +653,15 @@ if len(args) <= 0:
else:
linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or URLs given"))
-# initialize the cache and the consumer model
-cache = linkcheck.checker.cache.Cache()
-consumer = linkcheck.checker.consumer.Consumer(config, cache)
+# prepare checking queue
+aggregate = linkcheck.director.get_aggregate(config)
if options.trace:
config["trace"] = True
+ import linkcheck.trace
linkcheck.trace.trace_filter([r"^linkcheck"])
linkcheck.trace.trace_on()
# add urls to queue
+get_url_from = linkcheck.checker.get_url_from
for url in args:
if url.lower().startswith("www."):
# syntactic sugar
@@ -669,14 +669,14 @@ for url in args:
elif url.lower().startswith("ftp."):
# syntactic sugar
url = "ftp://%s" % url
- url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True)
- consumer.append_url(url_data)
-############################# check the URLs ################################
+ url_data = get_url_from(url, 0, aggregate, assume_local=True)
+ linkcheck.add_intern_pattern(url_data, config)
+ aggregate.urlqueue.put(url_data)
+# set up profiling/psyco
if do_profile and not has_profile:
linkcheck.log.warn(linkcheck.LOG_CMDLINE,
_("The `profile' Python module is not installed,"
" therefore the --profile option is disabled."))
-
if do_profile and has_profile:
run = True
if os.path.exists(_profile):
@@ -690,7 +690,7 @@ if do_profile and has_profile:
run = False
if run:
import profile
- profile.run("linkcheck.checker.check_urls(consumer)", _profile)
+ profile.run("manager.check_urls()", _profile)
elif options.psyco:
try:
import psyco
@@ -705,8 +705,8 @@ elif options.psyco:
except ImportError:
# no psyco available, just ignore
pass
-linkcheck.checker.check_urls(consumer)
-#############################################################################
+# start checking
+linkcheck.director.check_urls(aggregate)
# interactive input end
if config['interactive']:
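
Condensed from the script changes above, the new top-level checking flow is
roughly as follows (paraphrased, not the literal script; option handling,
profiling and error handling omitted):

    import linkcheck
    import linkcheck.checker
    import linkcheck.configuration
    import linkcheck.director

    config = linkcheck.configuration.Configuration()
    aggregate = linkcheck.director.get_aggregate(config)
    for url in (u"http://example.com/",):
        url_data = linkcheck.checker.get_url_from(
            url, 0, aggregate, assume_local=True)
        linkcheck.add_intern_pattern(url_data, config)
        aggregate.urlqueue.put(url_data)
    linkcheck.director.check_urls(aggregate)
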
diff --git a/setup.py b/setup.py
index c7434a03..7dfd397b 100755
--- a/setup.py
+++ b/setup.py
@@ -537,7 +537,8 @@ o a (Fast)CGI web interface (requires HTTP server)
'clean': MyClean,
},
packages = ['linkcheck', 'linkcheck.logger', 'linkcheck.checker',
- 'linkcheck.configuration',
+ 'linkcheck.director', 'linkcheck.configuration',
+ 'linkcheck.cache',
'linkcheck.dns', 'linkcheck.dns.rdtypes',
'linkcheck.dns.rdtypes.ANY', 'linkcheck.dns.rdtypes.IN',
'linkcheck.HtmlParser', 'linkcheck.ftpparse', ],