diff --git a/ChangeLog b/ChangeLog
index 474e387e..0ef48cc9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -14,6 +14,12 @@ Changed:
     linkcheck/HtmlParser/htmllex.[lc],
     linkcheck/tests/test_parser.py
 
+  * Revamp the threading algorithm by using a URL queue with a
+    constant number of consumer threads called 'workers'.
+    This fixes the remaining "deque mutated during iteration" errors.
+    Type: feature
+    Changed: *.py
+
 3.4 "The Chumscrubbers" (released 4.2.2006)
 
 * Ignore decoding errors when retrieving the robots.txt URL.
diff --git a/MANIFEST.in b/MANIFEST.in
index de62f0a0..077cbae1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -37,6 +37,7 @@ include doc/de/*.1
 include doc/fr/*.1
 include doc/Makefile doc/rest2htmlnav
 recursive-include linkcheck/checker/tests/data *.txt *.html *.result *.asc *.css *.ico
+recursive-include linkcheck/configuration/tests/data *.ini
 include linkcheck/tests/*.py
 include linkcheck/checker/tests/*.py
 include linkcheck/dns/tests/*.py
diff --git a/TODO b/TODO
index 1b78c670..0461182d 100644
--- a/TODO
+++ b/TODO
@@ -1,7 +1,11 @@
-http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/483752
-http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/475160
+- Improve print_status -- use format_time from quodlibet to format times.
+- Ctrl-C does not reliably interrupt the check run.
+
+- To limit memory usage, put a maximum size on the URL queue
+  (e.g. 20000 URLs). If the limit is reached, the worker calling
+  queue.put() will wait for another worker to call queue.get() before
+  continuing.
+  Problem: deadlock when all workers have called queue.put().
 
 - [FEATURE] postmortem debugging with pdb.pm()
diff --git a/linkcheck/__init__.py b/linkcheck/__init__.py
index 06330748..c6d9f56a 100644
--- a/linkcheck/__init__.py
+++ b/linkcheck/__init__.py
@@ -58,6 +58,17 @@ class LinkCheckerError (Exception):
     pass
 
 
+def add_intern_pattern (url_data, config):
+    """
+    Add intern URL regex to config.
+    """
+    pat = url_data.get_intern_pattern()
+    if pat:
+        assert linkcheck.log.debug(LOG_CHECK,
+            "Add intern pattern %r from command line", pat)
+        config['internlinks'].append(get_link_pat(pat))
+
+
 def get_link_pat (arg, strict=False):
     """
     Get a link pattern matcher for intern/extern links.
diff --git a/linkcheck/cache/__init__.py b/linkcheck/cache/__init__.py
new file mode 100644
index 00000000..318628da
--- /dev/null
+++ b/linkcheck/cache/__init__.py
@@ -0,0 +1,19 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2006 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Store and provide cached data during checking in a thread-safe manner.
+"""
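
The TODO entry's deadlock concern can be demonstrated with a plain bounded
Queue: if every worker thread blocks in put(), no consumer is left to call
get(). A minimal sketch (illustrative only, not LinkChecker code):

```python
import Queue

q = Queue.Queue(maxsize=2)   # bounded queue as proposed in the TODO
q.put(1)
q.put(2)
# A third blocking put() would wait for a get().  If every worker
# thread reaches this point at once, none is left to call get() and
# the program deadlocks; put_nowait() makes the situation visible:
try:
    q.put_nowait(3)
except Queue.Full:
    print "queue full -- a blocking put() would deadlock here"
```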
+""" diff --git a/linkcheck/cache/connection.py b/linkcheck/cache/connection.py new file mode 100644 index 00000000..8231a69d --- /dev/null +++ b/linkcheck/cache/connection.py @@ -0,0 +1,111 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2005-2006 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +""" +Store and retrieve open connections. +""" + +import time +import threading +from linkcheck.decorators import synchronized + +# lock for robots.txt caching +_lock = threading.Lock() + + +class ConnectionPool (object): + """ + Thread-safe cache, storing a set of connections for URL retrieval. + """ + + def __init__ (self): + """ + Initialize an empty connection dictionary which will have entries + of the form:: + key -> [connection, status, expiration time] + + Connection can be any open connection object (HTTP, FTP, ...). + Status is either 'available' or 'busy'. + Expiration time is the point of time in seconds when this + connection will be timed out. + + The identifier key is usually a tuple (type, host, user, pass), + but it can be any immutable Python object. + """ + # open connections + # {(type, host, user, pass) -> [connection, status, expiration time]} + self.connections = {} + + @synchronized(_lock) + def add (self, key, conn, timeout): + """ + Add connection to the pool with given identifier key and timeout + in seconds. + """ + self.connections[key] = [conn, 'available', time.time() + timeout] + + @synchronized(_lock) + def get (self, key): + """ + Get open connection if available, for at most 30 seconds. + + @return: Open connection object or None if no connection is available. + @rtype None or FTPConnection or HTTP(S)Connection + """ + if key not in self.connections: + # not found + return None + conn_data = self.connections[key] + t = time.time() + if t > conn_data[2]: + # timed out + try: + conn_data[1].close() + except: + # ignore close errors + pass + del self.connections[key] + return None + # wait at most 300*0.1=30 seconds for connection to become available + for dummy in xrange(300): + if conn_data[1] != 'busy': + conn_data[1] = 'busy' + conn_data[2] = t + return conn_data[0] + time.sleep(0.1) + # connection is in use + return None + + @synchronized(_lock) + def release (self, key): + """ + Mark an open and reusable connection as available. + """ + if key in self.connections: + self.connections[key][1] = 'available' + + @synchronized(_lock) + def expire_connections (self): + """ + Remove expired connections from this pool. 
+ """ + t = time.time() + to_delete = [] + for key, conn_data in self.connections.iteritems(): + if conn_data[1] == 'available' and t > conn_data[2]: + to_delete.append(key) + for key in to_delete: + del self.connections[key] diff --git a/linkcheck/cache/cookie.py b/linkcheck/cache/cookie.py new file mode 100644 index 00000000..f5a110bc --- /dev/null +++ b/linkcheck/cache/cookie.py @@ -0,0 +1,73 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2006 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +""" +Store and retrieve cookies. +""" +import threading +from linkcheck.decorators import synchronized +import linkcheck +import linkcheck.log +import linkcheck.cookies + +# lock for caching +_lock = threading.Lock() + + +class CookieJar (object): + """ + Cookie storage, implementing the default cookie handling policy for + LinkChecker. + """ + + def __init__ (self): + self.cache = {} + + @synchronized(_lock) + def add (self, headers, scheme, host, path): + """ + Parse cookie values, add to cache. + """ + jar = set() + for h in headers.getallmatchingheaders("Set-Cookie"): + # RFC 2109 (Netscape) cookie type + try: + c = linkcheck.cookies.NetscapeCookie(h, scheme, host, path) + jar.add(c) + except linkcheck.cookies.CookieError: + assert linkcheck.log.debug(linkcheck.LOG_CACHE, + "Invalid cookie header for %s:%s%s: %r", scheme, host, path, h) + for h in headers.getallmatchingheaders("Set-Cookie2"): + # RFC 2965 cookie type + try: + c = linkcheck.cookies.Rfc2965Cookie(h, scheme, host, path) + jar.add(c) + except linkcheck.cookies.CookieError: + assert linkcheck.log.debug(linkcheck.LOG_CACHE, + "Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h) + self.cache[host] = jar + return jar + + @synchronized(_lock) + def get (self, scheme, host, port, path): + """ + Cookie cache getter function. + """ + assert linkcheck.log.debug(linkcheck.LOG_CACHE, + "Get cookies for host %r path %r", host, path) + jar = self.cache.setdefault(host, set()) + return [x for x in jar if x.check_expired() and \ + x.is_valid_for(scheme, host, port, path)] diff --git a/linkcheck/checker/geoip.py b/linkcheck/cache/geoip.py similarity index 91% rename from linkcheck/checker/geoip.py rename to linkcheck/cache/geoip.py index 881cc4cc..73cb7546 100644 --- a/linkcheck/checker/geoip.py +++ b/linkcheck/cache/geoip.py @@ -15,23 +15,44 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. """ -GeoIP wrapper. +Store and retrieve country names for IPs. """ +import os +import threading +from linkcheck.decorators import synchronized -def get_country (gi, host): +# I don't know if the geoip library is already thread-safe, but +# we take no risks here. 
diff --git a/linkcheck/checker/geoip.py b/linkcheck/cache/geoip.py
similarity index 91%
rename from linkcheck/checker/geoip.py
rename to linkcheck/cache/geoip.py
index 881cc4cc..73cb7546 100644
--- a/linkcheck/checker/geoip.py
+++ b/linkcheck/cache/geoip.py
@@ -15,23 +15,44 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 """
-GeoIP wrapper.
+Store and retrieve country names for IPs.
 """
+import os
+import threading
+from linkcheck.decorators import synchronized
 
-def get_country (gi, host):
+# I don't know if the geoip library is already thread-safe, but
+# we take no risks here.
+_lock = threading.Lock()
+
+# initialize GeoIP database
+geoip = None
+try:
+    import GeoIP
+    geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
+    if os.name == 'posix' and os.path.exists(geoip_dat):
+        geoip = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
+    del geoip_dat
+except ImportError:
+    pass
+
+
+@synchronized(_lock)
+def get_country (host):
     """
     Get translated country name.
 
     @return: country string or None
     """
-    c = gi.country_code_by_name(host)
+    if geoip is None:
+        return None
+    c = geoip.country_code_by_name(host)
     if c and c in countries:
         return "%s, %s" % (c, countries[c])
     return None
 
 
 # GeoIP country map with {short name -> translated full name} entries
-
 countries = {
     "AP": "Asia/Pacific Region",
     "EU": "Europe",
diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py
new file mode 100644
index 00000000..ca1f1383
--- /dev/null
+++ b/linkcheck/cache/robots_txt.py
@@ -0,0 +1,52 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2006 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Cache robots.txt contents.
+"""
+import threading
+from linkcheck.decorators import synchronized
+import linkcheck.robotparser2
+import linkcheck.configuration
+
+
+# lock for caching
+_lock = threading.Lock()
+
+
+class RobotsTxt (object):
+    """
+    Thread-safe cache of downloaded robots.txt files.
+    format: {cache key (string) -> robots.txt content (RobotFileParser)}
+    """
+
+    def __init__ (self):
+        self.cache = {}
+
+    @synchronized(_lock)
+    def allows_url (self, roboturl, url, user, password):
+        """
+        Ask robots.txt allowance.
+        """
+        if roboturl not in self.cache:
+            rp = linkcheck.robotparser2.RobotFileParser(
+                user=user, password=password)
+            rp.set_url(roboturl)
+            rp.read()
+            self.cache[roboturl] = rp
+        else:
+            rp = self.cache[roboturl]
+        return rp.can_fetch(linkcheck.configuration.UserAgent, url)
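
Caching the parsed robots.txt means each host's file is downloaded once per
run instead of once per checked URL. LinkChecker uses its own robotparser2
with authentication support; with the Python 2 standard library the same
check looks like this (illustrative host):

```python
# Equivalent robots.txt check using the stdlib robotparser module
# (illustrative; LinkChecker's robotparser2 additionally supports auth).
import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("http://example.com/robots.txt")
rp.read()   # downloads and parses robots.txt once
print rp.can_fetch("LinkChecker", "http://example.com/private/page.html")
```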
+""" +import threading +import Queue +import time +import linkcheck +import linkcheck.log + + +class UrlQueue (Queue.Queue): + """ + A queue supporting several consumer tasks. The task_done() idea is + from the Python 2.5 Subversion repository. + """ + + def __init__ (self, maxsize=0): + """ + Initialize the queue state and task counters. + """ + Queue.Queue.__init__(self, maxsize=maxsize) + self.all_tasks_done = threading.Condition(self.mutex) + self.unfinished_tasks = 0 + self.finished_tasks = 0 + self.in_progress = {} + self.checked = {} + self.shutdown = False + + def get (self): + """ + Get first not-in-progress url from the queue and + return it. If no such url is available return None. The + url might be already cached. + """ + self.not_empty.acquire() + try: + while self._empty(): + self.not_empty.wait() + url_data = self._get() + key = url_data.cache_url_key + if url_data.has_result: + # Already checked and copied from cache. + pass + elif key in self.checked: + # Already checked; copy result. And even ignore + # the case where url happens to be in_progress. + url_data.copy_from_cache(self.checked[key]) + elif key in self.in_progress: + # It's being checked currently; put it back in the queue. + Queue.Queue._put(self, url_data) + url_data = None + else: + self.in_progress[key] = url_data + self.not_full.notify() + return url_data + finally: + self.not_empty.release() + + def _put (self, url_data): + """ + Put URL in queue, increase number of unfished tasks. + """ + if self.shutdown: + # don't accept more URLs + return + key = url_data.cache_url_key + if key in self.checked: + # Put at beginning of queue to get consumed quickly. + url_data.copy_from_cache(self.checked[key]) + self.queue.appendleft(url_data) + else: + self.queue.append(url_data) + self.unfinished_tasks += 1 + + def task_done (self, url_data): + """ + Indicate that a formerly enqueued task is complete. + + Used by Queue consumer threads. For each get() used to fetch a task, + a subsequent call to task_done() tells the queue that the processing + on the task is complete. + + If a join() is currently blocking, it will resume when all items + have been processed (meaning that a task_done() call was received + for every item that had been put() into the queue). + + Raises a ValueError if called more times than there were items + placed in the queue. + """ + self.all_tasks_done.acquire() + try: + if url_data is not None: + key = url_data.cache_url_key + if key is not None and key not in self.checked: + self._cache_url(key, url_data) + self.finished_tasks += 1 + unfinished = self.unfinished_tasks - 1 + if unfinished <= 0: + if unfinished < 0: + raise ValueError('task_done() called too many times') + self.all_tasks_done.notifyAll() + self.unfinished_tasks = unfinished + finally: + self.all_tasks_done.release() + + def _cache_url (self, key, url_data): + """ + Put URL result data into cache. + """ + assert linkcheck.log.debug(linkcheck.LOG_CACHE, + "Caching %r", key) + assert key in self.in_progress, \ + "%r not in %s" % (key, self.in_progress) + del self.in_progress[key] + data = url_data.get_cache_data() + self.checked[key] = data + # check for aliases (eg. 
+    def _cache_url (self, key, url_data):
+        """
+        Put URL result data into the cache.
+        """
+        assert linkcheck.log.debug(linkcheck.LOG_CACHE,
+            "Caching %r", key)
+        assert key in self.in_progress, \
+            "%r not in %s" % (key, self.in_progress)
+        del self.in_progress[key]
+        data = url_data.get_cache_data()
+        self.checked[key] = data
+        # check for aliases (e.g. through HTTP redirections)
+        if hasattr(url_data, "aliases"):
+            data = url_data.get_alias_cache_data()
+            for alias in url_data.aliases:
+                if alias in self.checked or alias in self.in_progress:
+                    continue
+                assert linkcheck.log.debug(linkcheck.LOG_CACHE,
+                    "Caching alias %r", alias)
+                self.checked[alias] = data
+
+    def join (self, timeout=None):
+        """
+        Block until all items in the Queue have been gotten and processed.
+
+        The count of unfinished tasks goes up whenever an item is added to
+        the queue. The count goes down whenever a consumer thread calls
+        task_done() to indicate the item was retrieved and all work on it
+        is complete.
+
+        When the count of unfinished tasks drops to zero, join() unblocks.
+        """
+        self.all_tasks_done.acquire()
+        try:
+            if timeout is None:
+                while self.unfinished_tasks:
+                    self.all_tasks_done.wait()
+            else:
+                if timeout < 0:
+                    raise ValueError("'timeout' must be a positive number")
+                endtime = time.time() + timeout
+                while self.unfinished_tasks:
+                    remaining = endtime - time.time()
+                    if remaining <= 0.0:
+                        return
+                    self.all_tasks_done.wait(remaining)
+        finally:
+            self.all_tasks_done.release()
+
+    def do_shutdown (self):
+        """
+        Shut down the queue by not accepting any more URLs.
+        """
+        self.mutex.acquire()
+        try:
+            unfinished = self.unfinished_tasks - len(self.queue)
+            self.queue.clear()
+            if unfinished <= 0:
+                if unfinished < 0:
+                    raise ValueError('shutdown is in error')
+                self.all_tasks_done.notifyAll()
+            self.unfinished_tasks = unfinished
+            self.shutdown = True
+        finally:
+            self.mutex.release()
+
+    def status (self):
+        """
+        Get the tuple (finished tasks, unfinished tasks, queue size).
+        """
+        self.mutex.acquire()
+        try:
+            return (self.finished_tasks, self.unfinished_tasks,
+                    len(self.queue))
+        finally:
+            self.mutex.release()
+
+    def checked_redirect (self, redirect, url_data):
+        """
+        Check if the redirect URL is already in the cache. Used for URL
+        redirections to avoid double checking of already cached URLs.
+        If the redirect URL is found in the cache, the result data is
+        copied right away.
+        """
+        self.mutex.acquire()
+        try:
+            if redirect in self.checked:
+                url_data.copy_from_cache(self.checked[redirect])
+                return True
+            return False
+        finally:
+            self.mutex.release()
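
The queue drives a standard producer-consumer pattern: workers get() a URL,
check it, and report back via task_done() so that join() can detect
completion. A minimal consumer sketch (illustrative; the real worker code
lives in linkcheck.director, and url_data.check() is an assumed entry point):

```python
# Sketch of how checker threads could drain the UrlQueue (assumptions
# noted above; not the actual linkcheck.director implementation).
import threading
from linkcheck.cache.urlqueue import UrlQueue

def run_worker (urlqueue):
    while not urlqueue.shutdown:
        url_data = urlqueue.get()
        if url_data is None:
            # every queued URL is currently being checked by another worker
            continue
        try:
            if not url_data.has_result:
                url_data.check()   # assumed per-URL check entry point
        finally:
            # always account for the get(), even on error, so join() finishes
            urlqueue.task_done(url_data)

def check_all (urlqueue, num_workers=10):
    for dummy in xrange(num_workers):
        t = threading.Thread(target=run_worker, args=(urlqueue,))
        t.setDaemon(True)
        t.start()
    urlqueue.join()   # blocks until every enqueued URL was task_done()'d
```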
diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py
index ec84bce2..d32d2a30 100644
--- a/linkcheck/checker/__init__.py
+++ b/linkcheck/checker/__init__.py
@@ -18,19 +18,14 @@
 Main functions for link checking.
 """
 
-import time
-import sys
 import os
 import cgi
 import socket
-import codecs
-import traceback
 import select
 import re
 import urllib
 import nntplib
 import ftplib
-
 import linkcheck.httplib2
 import linkcheck.strformat
 import linkcheck.dns.exception
@@ -153,110 +148,6 @@ acap # application configuration access protocol
 
 ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
 
-_encoding = linkcheck.i18n.default_encoding
-stderr = codecs.getwriter(_encoding)(sys.stderr, errors="ignore")
-
-def internal_error ():
-    """
-    Print internal error message to stderr.
-    """
-    print >> stderr, os.linesep
-    print >> stderr, _("""********** Oops, I did it again. *************
-
-You have found an internal error in LinkChecker. Please write a bug report
-at http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913
-or send mail to %s and include the following information:
-- the URL or file you are testing
-- your commandline arguments and/or configuration.
-- the output of a debug run with option "-Dall" of the executed command
-- the system information below.
-
-Disclosing some of the information above due to privacy reasons is ok.
-I will try to help you nonetheless, but you have to give me something
-I can work with ;) .
-""") % linkcheck.configuration.Email
-    etype, value = sys.exc_info()[:2]
-    print >> stderr, etype, value
-    traceback.print_exc()
-    print_app_info()
-    print >> stderr, os.linesep, \
-            _("******** LinkChecker internal error, over and out ********")
-    sys.exit(1)
-
-
-def print_app_info ():
-    """
-    Print system and application info to stderr.
-    """
-    print >> stderr, _("System info:")
-    print >> stderr, linkcheck.configuration.App
-    print >> stderr, _("Python %s on %s") % (sys.version, sys.platform)
-    for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"):
-        value = os.getenv(key)
-        if value is not None:
-            print >> stderr, key, "=", repr(value)
-
-
-def check_urls (consumer):
-    """
-    Main check function; checks all configured URLs until interrupted
-    with Ctrl-C. If you call this function more than once, you can specify
-    different configurations with the consumer parameter.
-
-    @param consumer: an object where all runtime-dependent options are
-        stored
-    @type consumer: linkcheck.consumer.Consumer
-    @return: None
-    """
-    try:
-        _check_urls(consumer)
-    except (KeyboardInterrupt, SystemExit):
-        consumer.abort()
-    except:
-        consumer.abort()
-        internal_error()
-
-
-def _check_urls (consumer):
-    """
-    Check all configured URLs. Prints status information, calls logger
-    methods.
-
-    @param consumer: an object where all runtime-dependent options are
-        stored
-    @type consumer: linkcheck.consumer.Consumer
-    @return: None
-    """
-    start_time = time.time()
-    status_time = start_time
-    while not consumer.finished():
-        url_data = consumer.incoming_get_url()
-        if url_data is None:
-            # wait for incoming queue to fill
-            time.sleep(0.1)
-        elif url_data.cached:
-            # was cached -> can be logged
-            consumer.log_url(url_data)
-        else:
-            # go check this url
-            if url_data.parent_url and not \
-               linkcheck.url.url_is_absolute(url_data.base_url):
-                name = url_data.parent_url
-            else:
-                name = u""
-            if url_data.base_url:
-                name += url_data.base_url
-            if not name:
-                name = None
-            consumer.check_url(url_data, name)
-        if consumer.config('status'):
-            curtime = time.time()
-            if (curtime - status_time) > 5:
-                consumer.print_status(curtime, start_time)
-                status_time = curtime
-    consumer.end_log_output()
-
-
 # file extensions we can parse recursively
 extensions = {
     "html": re.compile(r'(?i)\.s?html?$'),
@@ -298,9 +189,9 @@ def absolute_url (base_url, base_ref, parent_url):
     return u""
 
 
-def get_url_from (base_url, recursion_level, consumer,
+def get_url_from (base_url, recursion_level, aggregate,
                   parent_url=None, base_ref=None, line=0, column=0,
-                  name=u"", cmdline=False):
+                  name=u"", assume_local=False):
     """
     Get url data from given base data.
@@ -308,8 +199,8 @@ def get_url_from (base_url, recursion_level, consumer,
     @type base_url: string or None
     @param recursion_level: current recursion level
     @type recursion_level: number
-    @param consumer: consumer object
-    @type consumer: linkcheck.checker.consumer.Consumer
+    @param aggregate: aggregate object
+    @type aggregate: linkcheck.director.aggregator.Aggregate
     @param parent_url: parent url
     @type parent_url: string or None
     @param base_ref: base url from <base> tag
@@ -329,7 +220,14 @@ def get_url_from (base_url, recursion_level, consumer,
     base_ref = linkcheck.strformat.unicode_safe(base_ref)
     name = linkcheck.strformat.unicode_safe(name)
     url = absolute_url(base_url, base_ref, parent_url).lower()
-    # test scheme
+    klass = get_urlclass_from(url, assume_local)
+    return klass(base_url, recursion_level, aggregate,
+                 parent_url=parent_url, base_ref=base_ref,
+                 line=line, column=column, name=name)
+
+
+def get_urlclass_from (url, assume_local):
+    """Return checker class for given URL."""
     if url.startswith("http:"):
         klass = linkcheck.checker.httpurl.HttpUrl
     elif url.startswith("ftp:"):
         klass = linkcheck.checker.ftpurl.FtpUrl
@@ -351,24 +249,13 @@ def get_url_from (base_url, recursion_level, consumer,
     elif ignored_schemes_re.search(url):
         # ignored url
         klass = linkcheck.checker.ignoredurl.IgnoredUrl
-    elif cmdline:
-        # assume local file on command line
+    elif assume_local:
+        # assume local file
         klass = linkcheck.checker.fileurl.FileUrl
     else:
         # error url, no further checking, just log this
         klass = linkcheck.checker.errorurl.ErrorUrl
-    url_data = klass(base_url, recursion_level, consumer,
-                     parent_url=parent_url, base_ref=base_ref,
-                     line=line, column=column, name=name)
-    if cmdline:
-        # add intern URL regex to config for every URL that was given
-        # on the command line
-        pat = url_data.get_intern_pattern()
-        assert linkcheck.log.debug(linkcheck.LOG_CMDLINE,
-            "Add intern pattern %r from command line", pat)
-        if pat:
-            consumer.config_append('internlinks', linkcheck.get_link_pat(pat))
-    return url_data
+    return klass
 
 
 def get_index_html (urls):
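
Splitting the class choice into get_urlclass_from() makes the scheme
dispatch directly usable. An illustrative call pattern (URLs must already
be lowercased, as get_url_from() does before dispatching; only classes
shown in this patch are used):

```python
# Illustrative use of the scheme dispatch introduced above.
import linkcheck.checker

for url, assume_local in [
        (u"http://example.com/", False),    # -> httpurl.HttpUrl
        (u"ftp://example.com/pub", False),  # -> ftpurl.FtpUrl
        (u"readme.txt", True),              # -> fileurl.FileUrl
        (u"readme.txt", False)]:            # -> errorurl.ErrorUrl
    klass = linkcheck.checker.get_urlclass_from(url, assume_local)
    print url, "->", klass.__name__
```

Moving the intern-pattern registration out into linkcheck.add_intern_pattern()
also removes a config mutation from the URL factory, so callers decide
explicitly when command line URLs should widen the intern pattern list.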
diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py
index 22006618..53a0e218 100644
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@@ -23,11 +23,13 @@ import os
 import time
 import urlparse
 import urllib
+import urllib2
 
 import urlbase
 import linkcheck
 import linkcheck.log
 import linkcheck.checker
+import linkcheck.fileutil
 
 # if file extension lookup was unsuccessful, look at the content
 contents = {
@@ -83,7 +85,7 @@ class FileUrl (urlbase.UrlBase):
     """
 
     def init (self, base_ref, base_url, parent_url, recursion_level,
-              consumer, line, column, name):
+              aggregate, line, column, name):
         """
         Besides the usual initialization the URL is normed according
         to the platform:
@@ -91,7 +93,7 @@ class FileUrl (urlbase.UrlBase):
         - under Windows platform the drive specifier is normed
         """
         super(FileUrl, self).init(base_ref, base_url, parent_url,
-                            recursion_level, consumer, line, column, name)
+                            recursion_level, aggregate, line, column, name)
         if self.base_url is None:
             return
         base_url = self.base_url
@@ -129,7 +131,8 @@ class FileUrl (urlbase.UrlBase):
         if self.is_directory():
             self.set_result(_("directory"))
         else:
-            super(FileUrl, self).check_connection()
+            url = linkcheck.fileutil.pathencode(self.url)
+            self.url_connection = urllib2.urlopen(url)
         self.check_case_sensitivity()
 
     def check_case_sensitivity (self):
@@ -147,7 +150,6 @@ class FileUrl (urlbase.UrlBase):
                 "system path %r. You should always use "
                 "the system path in URLs.") % (path, realpath),
                 tag="file-system-path")
-            pass
 
     def get_content (self):
         """
@@ -208,7 +210,7 @@ class FileUrl (urlbase.UrlBase):
         path = self.urlparts[2]
         if os.name == 'nt':
             path = prepare_urlpath_for_nt(path)
-        return urllib.url2pathname(path)
+        return linkcheck.fileutil.pathencode(urllib.url2pathname(path))
 
     def is_directory (self):
         """
diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py
index a571a746..63f75964 100644
--- a/linkcheck/checker/ftpurl.py
+++ b/linkcheck/checker/ftpurl.py
@@ -53,12 +53,12 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         order: login, changing directory, list the file.
         """
         # proxy support (we support only http)
-        self.set_proxy(self.consumer.config("proxy").get(self.scheme))
+        self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
         if self.proxy:
             # using a (HTTP) proxy
             http = httpurl.HttpUrl(self.base_url,
                                    self.recursion_level,
-                                   self.consumer,
+                                   self.aggregate,
                                    parent_url=self.parent_url,
                                    base_ref=self.base_ref,
                                    line=self.line,
@@ -87,7 +87,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # ready to connect
         _user, _password = self.get_user_password()
         key = ("ftp", self.urlparts[1], _user, _password)
-        conn = self.consumer.get_connection(key)
+        conn = self.aggregate.connections.get(key)
         if conn is not None and conn.sock is not None:
             # reuse cached FTP connection
             self.url_connection = conn
@@ -248,6 +248,6 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # add to cached connections
         _user, _password = self.get_user_password()
         key = ("ftp", self.urlparts[1], _user, _password)
-        cache_add = self.consumer.add_connection
+        cache_add = self.aggregate.connections.add
         cache_add(key, self.url_connection, DEFAULT_TIMEOUT_SECS)
         self.url_connection = None
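
Connections are cached under a (scheme, host, user, password) key so that
several URLs on the same host can share one session. The pattern the FTP
checker follows, as a standalone sketch (illustrative host and timeout):

```python
# Sketch of the connection reuse pattern above (illustrative; uses the
# ConnectionPool from linkcheck/cache/connection.py).
import ftplib
from linkcheck.cache.connection import ConnectionPool

pool = ConnectionPool()
key = ("ftp", "ftp.example.com", "anonymous", "")
conn = pool.get(key)              # marks the connection 'busy' if found
if conn is None:
    conn = ftplib.FTP("ftp.example.com")
    conn.login()
try:
    conn.cwd("/pub")
finally:
    # hand the connection back for other workers, expiring in 60 seconds
    pool.add(key, conn, 60)
```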
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index 159ef2c4..58a5619d 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -129,8 +129,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         roboturl = self.get_robots_txt_url()
         user, password = self.get_user_password()
-        return self.consumer.robots_txt_allows_url(roboturl, url,
-                                                   user, password)
+        return self.aggregate.robots_txt.allows_url(roboturl, url,
+                                                    user, password)
 
     def check_connection (self):
         """
@@ -150,7 +150,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         valid request
         """
         # set the proxy, so a 407 status after this is an error
-        self.set_proxy(self.consumer.config("proxy").get(self.scheme))
+        self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
         # initialize check data
         self.headers = None
         self.auth = None
@@ -360,19 +360,19 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
                 tag="http-moved-permanent")
             self.has301status = True
         # check cache again on the changed URL
-        if self.consumer.checked_redirect(redirected, self):
+        if self.aggregate.urlqueue.checked_redirect(redirected, self):
             return -1, response
         # in case of changed scheme make new URL object
         if self.urlparts[0] != self.scheme:
             newobj = linkcheck.checker.get_url_from(
-                redirected, self.recursion_level, self.consumer,
+                redirected, self.recursion_level, self.aggregate,
                 parent_url=self.parent_url, base_ref=self.base_ref,
                 line=self.line, column=self.column, name=self.name,
-                cmdline=False)
+                assume_local=False)
             newobj.warnings = self.warnings
             newobj.info = self.info
             # append new object to queue
-            self.consumer.append_url(newobj)
+            self.aggregate.urlqueue.put(newobj)
             # pretend to be finished and logged
             return -1, response
         # new response data
@@ -406,14 +406,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
                 linkcheck.strformat.unicode_safe(response.reason),
                 tag="http-empty-content")
         # store cookies for valid links
-        if self.consumer.config('cookies'):
+        if self.aggregate.config['cookies']:
             for c in self.cookies:
                 self.add_info(_("Store cookie: %s.") % c)
             try:
-                out = self.consumer.store_cookies(self.headers,
-                                                  self.urlparts[0],
-                                                  self.urlparts[1],
-                                                  self.urlparts[2])
+                out = self.aggregate.cookies.add(self.headers,
+                                                 self.urlparts[0],
+                                                 self.urlparts[1],
+                                                 self.urlparts[2])
                 for h in out:
                     self.add_info(linkcheck.strformat.unicode_safe(h))
             except Cookie.CookieError, msg:
@@ -471,13 +471,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
             linkcheck.configuration.UserAgent)
         self.url_connection.putheader("Accept-Encoding",
                           "gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
-        if self.consumer.config('cookies'):
+        if self.aggregate.config['cookies']:
             scheme = self.urlparts[0]
             host = self.urlparts[1]
             port = linkcheck.url.default_ports.get(scheme, 80)
             host, port = urllib.splitnport(host, port)
             path = self.urlparts[2]
-            self.cookies = self.consumer.get_cookies(scheme, host, port, path)
+            self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
             for c in self.cookies:
                 name = c.client_header_name()
                 value = c.client_header_value()
@@ -505,7 +505,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         _user, _password = self.get_user_password()
         key = (scheme, self.urlparts[1], _user, _password)
-        conn = self.consumer.get_connection(key)
+        conn = self.aggregate.connections.get(key)
         if conn is not None:
             assert linkcheck.log.debug(linkcheck.LOG_CHECK,
                 "reuse cached HTTP(S) connection %s", conn)
@@ -634,7 +634,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # add to cached connections
         _user, _password = self.get_user_password()
         key = ("http", self.urlparts[1], _user, _password)
-        cache_add = self.consumer.add_connection
+        cache_add = self.aggregate.connections.add
         # note: only cache the connection when it is persistent
         # and all pending content has been received
         if not self.persistent or not self.has_content or \
diff --git a/linkcheck/checker/nntpurl.py b/linkcheck/checker/nntpurl.py
index 81673f69..0417ac1f 100644
--- a/linkcheck/checker/nntpurl.py
+++ b/linkcheck/checker/nntpurl.py
@@ -40,7 +40,7 @@ class NntpUrl (urlbase.UrlBase):
         Connect to NNTP server and try to request the URL article
         resource (if specified).
         """
-        nntpserver = self.host or self.consumer.config("nntpserver")
+        nntpserver = self.host or self.aggregate.config["nntpserver"]
         if not nntpserver:
             self.add_warning(
                 _("No NNTP server was specified, skipping this URL."),
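
The redirect path above uses checked_redirect() to avoid re-checking a
target whose result is already cached; only a cache miss produces a new URL
object. Reduced to its essentials (a sketch with illustrative names; the
real logic is the HttpUrl redirect handling shown in this hunk):

```python
# Sketch of the redirect short-circuit used above (illustrative).
import linkcheck.checker

def handle_redirect (aggregate, url_data, redirected):
    """Re-check a redirect target only if its result is not cached yet."""
    if aggregate.urlqueue.checked_redirect(redirected, url_data):
        return   # result was already copied from the cache
    newobj = linkcheck.checker.get_url_from(
        redirected, url_data.recursion_level, aggregate,
        parent_url=url_data.parent_url)
    aggregate.urlqueue.put(newobj)
```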
""" - for ro in self.consumer.config("noproxyfor"): + for ro in self.aggregate.config["noproxyfor"]: if ro.search(self.host): return True return False diff --git a/linkcheck/checker/telneturl.py b/linkcheck/checker/telneturl.py index b14e1ac5..b9d96cbc 100644 --- a/linkcheck/checker/telneturl.py +++ b/linkcheck/checker/telneturl.py @@ -59,7 +59,7 @@ class TelnetUrl (urlbase.UrlBase): label is "login: ", expected password label is "Password: ". """ self.url_connection = telnetlib.Telnet() - if self.consumer.config("debug"): + if self.aggregate.config["debug"]: self.url_connection.set_debuglevel(1) self.url_connection.open(self.host, self.port) if self.user: diff --git a/linkcheck/checker/tests/__init__.py b/linkcheck/checker/tests/__init__.py index edc140e9..cf6efd8e 100644 --- a/linkcheck/checker/tests/__init__.py +++ b/linkcheck/checker/tests/__init__.py @@ -25,8 +25,6 @@ import unittest import linkcheck import linkcheck.checker -import linkcheck.checker.cache -import linkcheck.checker.consumer import linkcheck.configuration import linkcheck.logger @@ -93,7 +91,17 @@ class TestLogger (linkcheck.logger.Logger): self.diff.append(line) -def get_test_consumer (confargs, logargs): +def get_file (filename=None): + """ + Get file name located within 'data' directory. + """ + directory = os.path.join("linkcheck", "checker", "tests", "data") + if filename: + return unicode(os.path.join(directory, filename)) + return unicode(directory) + + +def get_test_aggregate (confargs, logargs): """ Initialize a test configuration object. """ @@ -101,14 +109,15 @@ def get_test_consumer (confargs, logargs): config.logger_add('test', TestLogger) config['recursionlevel'] = 1 config['logger'] = config.logger_new('test', **logargs) + # uncomment for debugging + #config.init_logging(debug=["all"]) config["anchors"] = True config["verbose"] = True config['threads'] = 0 + config['status'] = False config['cookies'] = True - config['geoip'] = None config.update(confargs) - cache = linkcheck.checker.cache.Cache() - return linkcheck.checker.consumer.Consumer(config, cache) + return linkcheck.director.get_aggregate(config) class LinkCheckTest (unittest.TestCase): @@ -122,21 +131,14 @@ class LinkCheckTest (unittest.TestCase): """ return linkcheck.url.url_norm(url)[0] - def get_file (self, filename): - """ - Get file name located within 'data' directory. - """ - return unicode(os.path.join("linkcheck", "checker", "tests", - "data", filename)) - def get_resultlines (self, filename): """ Return contents of file, as list of lines without line endings, ignoring empty lines and lines starting with a hash sign (#). """ - resultfile = self.get_file(filename+".result") + resultfile = get_file(filename+".result") d = {'curdir': os.getcwd(), - 'datadir': 'linkcheck/checker/tests/data', + 'datadir': get_file(), } f = codecs.open(resultfile, "r", "iso-8859-15") resultlines = [line.rstrip('\r\n') % d for line in f \ @@ -144,27 +146,30 @@ class LinkCheckTest (unittest.TestCase): f.close() return resultlines - def file_test (self, filename, confargs=None, cmdline=True): + def file_test (self, filename, confargs=None, assume_local=True): """ Check with expected result in .result. 
""" - url = self.get_file(filename) + url = get_file(filename) if confargs is None: confargs = {} logargs = {'expected': self.get_resultlines(filename)} - consumer = get_test_consumer(confargs, logargs) + aggregate = get_test_aggregate(confargs, logargs) url_data = linkcheck.checker.get_url_from( - url, 0, consumer, cmdline=cmdline) - consumer.append_url(url_data) - linkcheck.checker.check_urls(consumer) - if consumer.config('logger').diff: + url, 0, aggregate, assume_local=assume_local) + if assume_local: + linkcheck.add_intern_pattern(url_data, aggregate.config) + aggregate.urlqueue.put(url_data) + linkcheck.director.check_urls(aggregate) + diff = aggregate.config['logger'].diff + if diff: sep = unicode(os.linesep) - l = [url] + consumer.config('logger').diff + l = [url] + diff l = sep.join(l) self.fail(l.encode("iso8859-1", "ignore")) def direct (self, url, resultlines, fields=None, recursionlevel=0, - confargs=None, cmdline=False): + confargs=None, assume_local=False): """ Check url with expected result. """ @@ -176,14 +181,17 @@ class LinkCheckTest (unittest.TestCase): logargs = {'expected': resultlines} if fields is not None: logargs['fields'] = fields - consumer = get_test_consumer(confargs, logargs) + aggregate = get_test_aggregate(confargs, logargs) url_data = linkcheck.checker.get_url_from( - url, 0, consumer, cmdline=cmdline) - consumer.append_url(url_data) - linkcheck.checker.check_urls(consumer) - if consumer.config('logger').diff: + url, 0, aggregate, assume_local=assume_local) + if assume_local: + linkcheck.add_intern_pattern(url_data, aggregate.config) + aggregate.urlqueue.put(url_data) + linkcheck.director.check_urls(aggregate) + diff = aggregate.config['logger'].diff + if diff: sep = unicode(os.linesep) l = [u"Differences found testing %s" % url] - l.extend(x.rstrip() for x in consumer.config('logger').diff[2:]) + l.extend(x.rstrip() for x in diff[2:]) self.fail(sep.join(l).encode("iso8859-1", "ignore")) diff --git a/linkcheck/checker/tests/data/misc.html.result b/linkcheck/checker/tests/data/misc.html.result index 12ca5b72..3c14e439 100644 --- a/linkcheck/checker/tests/data/misc.html.result +++ b/linkcheck/checker/tests/data/misc.html.result @@ -1,7 +1,3 @@ -url -cache key None -real url None -error url file://%(curdir)s/%(datadir)s/misc.html cache key file://%(curdir)s/%(datadir)s/misc.html real url file://%(curdir)s/%(datadir)s/misc.html @@ -21,3 +17,8 @@ url favicon.ico (cached) cache key file://%(curdir)s/%(datadir)s/favicon.ico real url file://%(curdir)s/%(datadir)s/favicon.ico valid + +url +cache key None +real url None +error diff --git a/linkcheck/checker/tests/test_http.py b/linkcheck/checker/tests/test_http.py index 1ac0ad44..aa2662f8 100644 --- a/linkcheck/checker/tests/test_http.py +++ b/linkcheck/checker/tests/test_http.py @@ -36,8 +36,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest): url = u"http://localhost:%d/linkcheck/checker/tests/data/" \ u"http.html" % self.port resultlines = self.get_resultlines("http.html") - self.direct(url, resultlines, recursionlevel=1, cmdline=True) - self.redirect_http_test() + self.direct(url, resultlines, recursionlevel=1, assume_local=True) + self.redirect1_http_test() + self.redirect2_http_test() self.noproxyfor_test() finally: self.stop_server() @@ -64,9 +65,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest): u"original URL was u'http://localhost:%d/redirect1'." 
diff --git a/linkcheck/checker/tests/test_http.py b/linkcheck/checker/tests/test_http.py
index 1ac0ad44..aa2662f8 100644
--- a/linkcheck/checker/tests/test_http.py
+++ b/linkcheck/checker/tests/test_http.py
@@ -36,8 +36,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             url = u"http://localhost:%d/linkcheck/checker/tests/data/" \
                   u"http.html" % self.port
             resultlines = self.get_resultlines("http.html")
-            self.direct(url, resultlines, recursionlevel=1, cmdline=True)
-            self.redirect_http_test()
+            self.direct(url, resultlines, recursionlevel=1, assume_local=True)
+            self.redirect1_http_test()
+            self.redirect2_http_test()
             self.noproxyfor_test()
         finally:
             self.stop_server()
@@ -64,9 +65,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             u"original URL was u'http://localhost:%d/redirect1'." % self.port,
             u"valid",
         ]
-        self.direct(url, resultlines, recursionlevel=0, cmdline=True)
+        self.direct(url, resultlines, recursionlevel=0, assume_local=True)
 
-    def redirect_http_test (self):
+    def redirect1_http_test (self):
         url = u"http://localhost:%d/redirect1" % self.port
         nurl = url
         rurl = url.replace("redirect", "newurl")
@@ -77,7 +78,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             u"info Redirected to %s." % rurl,
             u"error",
         ]
-        self.direct(url, resultlines, recursionlevel=0, cmdline=True)
+        self.direct(url, resultlines, recursionlevel=0, assume_local=True)
+
+    def redirect2_http_test (self):
         url = u"http://localhost:%d/linkcheck/checker/tests/data/redirect.html" % \
             self.port
         nurl = url
@@ -94,7 +97,7 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             u"name Recursive Redirect",
             u"valid",
         ]
-        self.direct(url, resultlines, recursionlevel=99, cmdline=True)
+        self.direct(url, resultlines, recursionlevel=99, assume_local=True)
 
     def noproxyfor_test (self):
         """
@@ -113,7 +116,7 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             u"valid",
         ]
         self.direct(url, resultlines, recursionlevel=0,
-                    confargs=confargs, cmdline=True)
+                    confargs=confargs, assume_local=True)
         del os.environ["http_proxy"]
""" self.result = cache_data["result"] + self.has_result = True self.warnings.extend(cache_data["warnings"]) self.info.extend(cache_data["info"]) self.valid = cache_data["valid"] @@ -240,8 +242,8 @@ class UrlBase (object): assert linkcheck.log.debug(linkcheck.LOG_CACHE, "Content cache key %r", self.cache_content_key) # construct cache key - if self.consumer.config("anchorcaching") and \ - self.consumer.config("anchors"): + if self.aggregate.config["anchorcaching"] and \ + self.aggregate.config["anchors"]: # do not ignore anchor parts = self.urlparts[:] parts[4] = self.anchor @@ -343,32 +345,28 @@ class UrlBase (object): """ Main check function for checking this URL. """ - if self.consumer.config("trace"): + if self.aggregate.config["trace"]: linkcheck.trace.trace_on() try: self.local_check() - self.consumer.checked(self) except (socket.error, select.error): - self.consumer.interrupted(self) # on Unix, ctrl-c can raise # error: (4, 'Interrupted system call') etype, value = sys.exc_info()[:2] - if etype == 4: + if etype == errno.EINTR: raise KeyboardInterrupt(value) else: raise except KeyboardInterrupt: - self.consumer.interrupted(self) raise except: - self.consumer.interrupted(self) - linkcheck.checker.internal_error() + linkcheck.director.internal_error() def add_country_info (self): """ Try to ask GeoIP database for country info. """ - country = self.consumer.get_country_name(self.host) + country = linkcheck.cache.geoip.get_country(self.host) if country is not None: self.add_info(_("URL is located in %s.") % _(country)) @@ -377,10 +375,11 @@ class UrlBase (object): Local check function can be overridden in subclasses. """ assert linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self) - if self.recursion_level and self.consumer.config('wait'): + wait = self.aggregate.config['wait'] + if self.recursion_level and wait: assert linkcheck.log.debug(linkcheck.LOG_CHECK, - "sleeping for %d seconds", self.consumer.config('wait')) - time.sleep(self.consumer.config('wait')) + "sleeping for %d seconds", wait) + time.sleep(wait) t = time.time() self.set_extern(self.url) if self.extern[0] and self.extern[1]: @@ -392,7 +391,7 @@ class UrlBase (object): try: self.check_connection() self.add_country_info() - if self.consumer.config("anchors"): + if self.aggregate.config["anchors"]: self.check_anchors() except tuple(linkcheck.checker.ExcList): value = self.handle_exception() @@ -406,7 +405,7 @@ class UrlBase (object): valid=False) # check content - warningregex = self.consumer.config("warningregex") + warningregex = self.aggregate.config["warningregex"] if warningregex and self.valid: assert linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content") @@ -486,8 +485,8 @@ class UrlBase (object): assert linkcheck.log.debug(linkcheck.LOG_CHECK, "... no, cannot get content.") return False - if self.consumer.config("recursionlevel") >= 0 and \ - self.recursion_level >= self.consumer.config("recursionlevel"): + rec_level = self.aggregate.config["recursionlevel"] + if rec_level >= 0 and self.recursion_level >= rec_level: assert linkcheck.log.debug(linkcheck.LOG_CHECK, "... 
@@ -551,7 +550,7 @@ class UrlBase (object):
 
         @return: None
         """
-        for entry in self.consumer.config("externlinks"):
+        for entry in self.aggregate.config["externlinks"]:
             match = entry['pattern'].search(url)
             if (entry['negate'] and not match) or \
                (match and not entry['negate']):
                 assert linkcheck.log.debug(linkcheck.LOG_CHECK,
                     "Extern URL %r", url)
                 self.extern = (1, entry['strict'])
                 return
@@ -559,7 +558,7 @@ class UrlBase (object):
-        for entry in self.consumer.config("internlinks"):
+        for entry in self.aggregate.config["internlinks"]:
             match = entry['pattern'].search(url)
             if (entry['negate'] and not match) or \
                (match and not entry['negate']):
@@ -607,7 +606,7 @@ class UrlBase (object):
         If a maximum size was given, call this function to check it
         against the content size of this url.
         """
-        maxbytes = self.consumer.config("warnsizebytes")
+        maxbytes = self.aggregate.config["warnsizebytes"]
         if maxbytes is not None and self.dlsize >= maxbytes:
             self.add_warning(_("Content size %s is larger than %s.") %
                          (linkcheck.strformat.strsize(self.dlsize),
@@ -626,7 +625,7 @@ class UrlBase (object):
         Get tuple (user, password) from configured authentication.
         Both user and password can be None if not specified.
         """
-        for auth in self.consumer.config("authentication"):
+        for auth in self.aggregate.config["authentication"]:
             if auth['pattern'].match(self.url):
                 return auth['user'], auth['password']
         return None, None
@@ -651,10 +650,10 @@ class UrlBase (object):
         else:
             base_ref = h.base_ref
         url_data = linkcheck.checker.get_url_from(url,
-            self.recursion_level+1, self.consumer, parent_url=self.url,
+            self.recursion_level+1, self.aggregate, parent_url=self.url,
             base_ref=base_ref, line=line, column=column, name=name,
-            cmdline=False)
-        self.consumer.append_url(url_data)
+            assume_local=False)
+        self.aggregate.urlqueue.put(url_data)
 
     def parse_opera (self):
         """
@@ -674,10 +673,10 @@ class UrlBase (object):
                 url = line[4:]
                 if url:
                     url_data = linkcheck.checker.get_url_from(url,
-                        self.recursion_level+1, self.consumer,
+                        self.recursion_level+1, self.aggregate,
                         parent_url=self.url, line=lineno, name=name,
-                        cmdline=False)
-                    self.consumer.append_url(url_data)
+                        assume_local=False)
+                    self.aggregate.urlqueue.put(url_data)
                 name = ""
 
     def parse_text (self):
@@ -694,10 +693,10 @@ class UrlBase (object):
             if not line or line.startswith('#'):
                 continue
             url_data = linkcheck.checker.get_url_from(line,
-                self.recursion_level+1, self.consumer,
+                self.recursion_level+1, self.aggregate,
                 parent_url=self.url, line=lineno,
-                cmdline=False)
-            self.consumer.append_url(url_data)
+                assume_local=False)
+            self.aggregate.urlqueue.put(url_data)
 
     def parse_css (self):
         """
@@ -712,10 +711,10 @@ class UrlBase (object):
             column = mo.start("url")
             url = linkcheck.strformat.unquote(mo.group("url").strip())
             url_data = linkcheck.checker.get_url_from(url,
-                self.recursion_level+1, self.consumer,
+                self.recursion_level+1, self.aggregate,
                 parent_url=self.url, line=lineno, column=column,
-                cmdline=False)
-            self.consumer.append_url(url_data)
+                assume_local=False)
+            self.aggregate.urlqueue.put(url_data)
 
     def serialized (self):
         """
@@ -758,7 +757,7 @@ class UrlBase (object):
         @rtype: string
         """
         s = self.serialized()
-        return self.consumer.config('logger').encode(s)
+        return self.aggregate.config['logger'].encode(s)
 
     def __repr__ (self):
         """
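
set_extern() classifies a URL by walking the externlinks patterns first and
then the internlinks patterns; each entry can be negated. The matching rule
in isolation (a self-contained sketch; the entry layout follows
linkcheck.get_link_pat from this patch):

```python
# Sketch of how intern/extern pattern entries classify a URL.
import re

def classify (url, externlinks, internlinks):
    """Return 'extern', 'intern' or 'unknown' for the given URL."""
    for entry in externlinks:
        match = entry['pattern'].search(url)
        if (entry['negate'] and not match) or (match and not entry['negate']):
            return 'extern'
    for entry in internlinks:
        match = entry['pattern'].search(url)
        if (entry['negate'] and not match) or (match and not entry['negate']):
            return 'intern'
    return 'unknown'

intern = [{'pattern': re.compile(r"^https?://example\.com"),
           'negate': False, 'strict': False}]
print classify("http://example.com/about.html", [], intern)   # 'intern'
print classify("http://other.example.org/", [], intern)       # 'unknown'
```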
linkcheck
diff --git a/linkcheck/configuration/__init__.py b/linkcheck/configuration/__init__.py
index 7056305b..d20fab46 100644
--- a/linkcheck/configuration/__init__.py
+++ b/linkcheck/configuration/__init__.py
@@ -28,11 +28,6 @@ import linkcheck
 import linkcheck.log
 import linkcheck.containers
 import confparse
-try:
-    import GeoIP
-    _has_geoip = True
-except ImportError:
-    _has_geoip = False
 
 Version = _linkchecker_configdata.version
 AppName = u"LinkChecker"
@@ -83,6 +78,7 @@ class Configuration (dict):
         self["internlinks"] = []
         self["noproxyfor"] = []
         self["interactive"] = False
+        self["maxqueuesize"] = 0
         # on ftp, password is set by Pythons ftplib
         self["authentication"] = []
         self["proxy"] = urllib.getproxies()
@@ -149,18 +145,6 @@ class Configuration (dict):
         self["warnsizebytes"] = None
         self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
         self["threads"] = 10
-        self.init_geoip()
-
-    def init_geoip (self):
-        """
-        If GeoIP.dat file is found, initialize a standard geoip DB and
-        store it in self["geoip"]; else this value will be None.
-        """
-        geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
-        if _has_geoip and os.path.exists(geoip_dat):
-            self["geoip"] = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
-        else:
-            self["geoip"] = None
 
     def init_logging (self, debug=None):
         """
diff --git a/linkcheck/configuration/tests/test_config.py b/linkcheck/configuration/tests/test_config.py
index 864d7a95..ef087527 100644
--- a/linkcheck/configuration/tests/test_config.py
+++ b/linkcheck/configuration/tests/test_config.py
@@ -23,12 +23,14 @@ import os
 import linkcheck.configuration
 
 
-def get_file (filename):
+def get_file (filename=None):
     """
     Get file name located within 'data' directory.
     """
-    return unicode(os.path.join("linkcheck", "configuration", "tests",
-                                "data", filename))
+    directory = os.path.join("linkcheck", "configuration", "tests", "data")
+    if filename:
+        return unicode(os.path.join(directory, filename))
+    return unicode(directory)
 
 
 class TestConfig (unittest.TestCase):
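
The new maxqueuesize option presumably feeds the maxsize parameter of
UrlQueue; the wiring is not shown in this patch, so the following is an
assumption for illustration. Queue.Queue treats maxsize <= 0 as unbounded,
so the default of 0 keeps the old behavior while a positive value caps
memory usage as proposed in the TODO:

```python
# How a maxqueuesize setting could bound the queue (assumed wiring).
from linkcheck.cache.urlqueue import UrlQueue

config = {"maxqueuesize": 20000}
urlqueue = UrlQueue(maxsize=config["maxqueuesize"])
```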
- """ - to_remove = [x for x in self if not x.check_expired()] - return self.difference_update(to_remove) - diff --git a/linkcheck/lc_cgi.py b/linkcheck/lc_cgi.py index 29d7cc90..04c6cea2 100644 --- a/linkcheck/lc_cgi.py +++ b/linkcheck/lc_cgi.py @@ -31,8 +31,7 @@ import linkcheck.url import linkcheck.i18n import linkcheck.strformat import linkcheck.checker -import linkcheck.checker.cache -import linkcheck.checker.consumer +import linkcheck.director _logfile = None _supported_langs = ('de', 'fr', 'nl', 'C') @@ -99,13 +98,16 @@ def checklink (out=sys.stdout, form=None, env=os.environ): config["externlinks"].append( linkcheck.get_link_pat("^%s$" % linkcheck.url.safe_url_pattern)) config["externlinks"].append(linkcheck.get_link_pat(".*", strict=True)) + # start checking + aggregate = linkcheck.director.get_aggregate(config) + cache = linkcheck.checker.cache.Cache() consumer = linkcheck.checker.consumer.Consumer(config, cache) - # start checking url = form["url"].value - url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=False) - consumer.append_url(url_data) - linkcheck.checker.check_urls(consumer) + url_data = linkcheck.checker.get_url_from(url, 0, aggregate, + assume_local=False) + aggregate.urlqueue.put(url_data) + linkcheck.director.check_urls(aggregate) def get_host_name (form): diff --git a/linkcheck/tests/test_urlbuild.py b/linkcheck/tests/test_urlbuild.py index b9ffb1a4..6f59d085 100644 --- a/linkcheck/tests/test_urlbuild.py +++ b/linkcheck/tests/test_urlbuild.py @@ -20,18 +20,16 @@ Test url build method from url data objects. import unittest import linkcheck.configuration +import linkcheck.director import linkcheck.checker.httpurl -import linkcheck.checker.cache -import linkcheck.checker.consumer -def get_test_consumer (): +def get_test_aggregate (): """ Initialize a test configuration object. 
""" config = linkcheck.configuration.Configuration() config['logger'] = config.logger_new('none') - cache = linkcheck.checker.cache.Cache() - return linkcheck.checker.consumer.Consumer(config, cache) + return linkcheck.director.get_aggregate(config) class TestUrlBuild (unittest.TestCase): @@ -43,9 +41,9 @@ class TestUrlBuild (unittest.TestCase): parent_url = "http://localhost:8001/linkcheck/checker/tests/data/http.html" base_url = "http://" recursion_level = 0 - consumer = get_test_consumer() + aggregate = get_test_aggregate() o = linkcheck.checker.httpurl.HttpUrl(base_url, recursion_level, - consumer, parent_url=parent_url) + aggregate, parent_url=parent_url) o.build_url() self.assertEquals(o.url, 'http://') diff --git a/linkchecker b/linkchecker index d4976d45..8d35683f 100755 --- a/linkchecker +++ b/linkchecker @@ -38,8 +38,7 @@ optparse._ = _ import linkcheck.log import linkcheck.i18n import linkcheck.checker -import linkcheck.checker.cache -import linkcheck.checker.consumer +import linkcheck.director import linkcheck.configuration import linkcheck.fileutil import linkcheck.strformat @@ -654,14 +653,15 @@ if len(args) <= 0: else: linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or URLs given")) -# initialize the cache and the consumer model -cache = linkcheck.checker.cache.Cache() -consumer = linkcheck.checker.consumer.Consumer(config, cache) +# prepare checking queue +aggregate = linkcheck.director.get_aggregate(config) if options.trace: config["trace"] = True + import linkcheck.trace linkcheck.trace.trace_filter([r"^linkcheck"]) linkcheck.trace.trace_on() # add urls to queue +get_url_from = linkcheck.checker.get_url_from for url in args: if url.lower().startswith("www."): # syntactic sugar @@ -669,14 +669,14 @@ for url in args: elif url.lower().startswith("ftp."): # syntactic sugar url = "ftp://%s" % url - url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True) - consumer.append_url(url_data) -############################# check the URLs ################################ + url_data = get_url_from(url, 0, aggregate, assume_local=True) + linkcheck.add_intern_pattern(url_data, config) + aggregate.urlqueue.put(url_data) +# set up profiling/psyco if do_profile and not has_profile: linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("The `profile' Python module is not installed," " therefore the --profile option is disabled.")) - if do_profile and has_profile: run = True if os.path.exists(_profile): @@ -690,7 +690,7 @@ if do_profile and has_profile: run = False if run: import profile - profile.run("linkcheck.checker.check_urls(consumer)", _profile) + profile.run("manager.check_urls()", _profile) elif options.psyco: try: import psyco @@ -705,8 +705,8 @@ elif options.psyco: except ImportError: # no psyco available, just ignore pass -linkcheck.checker.check_urls(consumer) -############################################################################# +# start checking +linkcheck.director.check_urls(aggregate) # interactive input end if config['interactive']: diff --git a/setup.py b/setup.py index c7434a03..7dfd397b 100755 --- a/setup.py +++ b/setup.py @@ -537,7 +537,8 @@ o a (Fast)CGI web interface (requires HTTP server) 'clean': MyClean, }, packages = ['linkcheck', 'linkcheck.logger', 'linkcheck.checker', - 'linkcheck.configuration', + 'linkcheck.director', 'linkcheck.configuration', + 'linkcheck.cache', 'linkcheck.dns', 'linkcheck.dns.rdtypes', 'linkcheck.dns.rdtypes.ANY', 'linkcheck.dns.rdtypes.IN', 'linkcheck.HtmlParser', 'linkcheck.ftpparse', ],