# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Url consumer class.
"""

import time
try:
    import thread
except ImportError:
    import dummy_thread as thread

import linkcheck.threader
import linkcheck.log
import linkcheck.lock
import linkcheck.strformat
import linkcheck.checker.geoip
from linkcheck.decorators import synchronized
from linkcheck.checker import stderr

# global lock for synchronizing all the checker threads
_lock = thread.allocate_lock()

# NOTE(review): _() and _n() are neither defined nor imported in this
# module; presumably they are installed as builtins by the application's
# i18n setup -- confirm before using this module stand-alone.


def print_tocheck (tocheck):
    """
    Print the number of queued URLs.

    The message is written to stderr without a trailing newline so that
    several status fragments end up on one line.
    """
    msg = _n("%5d URL queued,", "%5d URLs queued,", tocheck) % tocheck
    print >> stderr, msg,


def print_links (links):
    """
    Print the number of checked URLs.

    Written to stderr without a trailing newline.
    """
    msg = _n("%4d URL checked,", "%4d URLs checked,", links) % links
    print >> stderr, msg,


def print_active (active):
    """
    Print the number of active threads.

    Written to stderr without a trailing newline.
    """
    msg = _n("%2d active thread,", "%2d active threads,", active) % active
    print >> stderr, msg,


def print_duration (duration):
    """
    Print the run time.

    The duration is formatted with linkcheck.strformat.strduration() and
    written to stderr without a trailing newline.
    """
    msg = _("runtime %s") % linkcheck.strformat.strduration(duration)
    print >> stderr, msg,


class Consumer (object):
    """
    Consume URLs from the URL queue in a thread-safe manner.

    All public methods are synchronized, with the exception of
    abort() which calls locking methods itself, and check_url()
    which just spawns another checker thread.

    Additionally all public attributes of the Cache() object are reachable
    through __getattr__(). Only the attribute lookup itself is decorated
    with the lock in this class; whether a returned cache method is
    synchronized depends on the cache implementation (not visible here).

    The synchronization uses a global variable _lock defined above.
    """

    def __init__ (self, config, cache):
        """
        Initialize consumer data and threads.

        @param config: configuration mapping; this class reads the keys
          'threads', 'logger', 'fileoutput', 'ignorewarnings', 'verbose',
          'warnings' and 'geoip' from it
        @param cache: URL cache object that queue handling is delegated to
        """
        super(Consumer, self).__init__()
        # number of consumed URLs
        self._number = 0
        # number of wait rounds spent in _abort(); set here as well as in
        # abort() so that _abort() never hits a missing attribute
        self.num_waited = 0
        self._config = config
        self._cache = cache
        self._threader = linkcheck.threader.Threader(num=config['threads'])
        self.start_log_output()

    @synchronized(_lock)
    def config (self, key):
        """
        Get config value.
        """
        return self._config[key]

    @synchronized(_lock)
    def config_append (self, key, val):
        """
        Append config value.

        The value stored under key must support append().
        """
        self._config[key].append(val)

    @synchronized(_lock)
    def __getattr__ (self, name):
        """
        Delegate access to the internal cache if possible.

        Only called for attributes that normal lookup did not find.

        @raise AttributeError: if the cache is not set yet or does not
          have the requested attribute either
        """
        # Fetch the cache through the instance dictionary instead of
        # self._cache: if _cache is missing (instance created without
        # __init__, e.g. while being copied or unpickled) the plain
        # attribute access would re-enter __getattr__ and block forever
        # on the non-reentrant _lock that is already held here.
        attrs = self.__dict__
        if '_cache' not in attrs:
            raise AttributeError(name)
        cache = attrs['_cache']
        if hasattr(cache, name):
            return getattr(cache, name)
        raise AttributeError(name)

    @synchronized(_lock)
    def append_url (self, url_data):
        """
        Append url to incoming check list.

        If the cache does not accept the URL into the incoming list
        (incoming_add() returns a false value) it is logged right away.
        """
        if not self._cache.incoming_add(url_data):
            # can be logged
            self._log_url(url_data)

    def check_url (self, url_data, name):
        """
        Check given URL data, spawning a new thread with given name.
        This eventually calls either Consumer.checked() or
        Consumer.interrupted().
        This method is not thread safe (hence it should only be called
        from a single thread).
        """
        name = linkcheck.strformat.ascii_safe(name)
        self._threader.start_thread(url_data.check, (), name=name)

    @synchronized(_lock)
    def checked (self, url_data):
        """
        Put checked url in cache and log it.
        """
        # log before putting it in the cache (otherwise we would see
        # a "(cached)" after every url)
        self._log_url(url_data)
        if url_data.caching and not url_data.cached:
            self._cache.checked_add(url_data)
        else:
            self._cache.in_progress_remove(url_data)

    @synchronized(_lock)
    def interrupted (self, url_data):
        """
        Remove url from active list.

        A URL that is not in the list is silently ignored.
        """
        self._cache.in_progress_remove(url_data, ignore_missing=True)

    @synchronized(_lock)
    def finished (self):
        """
        Return True if checking is finished.

        Checking is finished when the threader reports finished() and the
        incoming queue of the cache is empty.
        """
        # NOTE(review): an older comment here claimed the cache data is
        # requested before locking; in fact the cache is queried while
        # _lock is held by the decorator.
        return self._threader.finished() and \
               self._cache.incoming_len() == 0

    @synchronized(_lock)
    def finish (self):
        """
        Finish consuming URLs.
        """
        self._threader.finish()

    @synchronized(_lock)
    def no_more_threads (self):
        """
        Return True if no more active threads are running.
        """
        return self._threader.finished()

    def abort (self):
        """
        Abort checking and send end-of-output message to logger.
        """
        # While loop to disable keyboard interrupts and system exits
        # (triggered from internal errors while finishing up) during abort.
        # The wait counter is reset once, outside the loop, so retries
        # after an interrupt do not extend the overall timeout.
        self.num_waited = 0
        while True:
            try:
                self._abort()
                break
            except (KeyboardInterrupt, SystemExit):
                pass

    def _abort (self):
        """
        Abort checking and send end-of-output message to logger.
        Private method not disabling exceptions.
        """
        # wait for threads to finish
        while not self.no_more_threads():
            # give up after more than 30 wait rounds of 2 seconds each
            if self.num_waited > 30:
                linkcheck.log.error(linkcheck.LOG_CHECK,
                                    "Thread wait timeout")
                self.end_log_output()
                return
            num = self.active_threads()
            msg = \
            _n("keyboard interrupt; waiting for %d active thread to finish",
               "keyboard interrupt; waiting for %d active threads to finish",
               num)
            linkcheck.log.warn(linkcheck.LOG_CHECK, msg, num)
            self.finish()
            self.num_waited += 1
            time.sleep(2)
        self.end_log_output()

    @synchronized(_lock)
    def print_status (self, curtime, start_time):
        """
        Print check status looking at url queues.

        @param curtime: current time
        @param start_time: time the check was started; the printed
          runtime is curtime - start_time
        """
        # NOTE(review): an older comment here claimed the cache data is
        # requested before locking; in fact the cache is queried while
        # _lock is held by the decorator.
        print >> stderr, _("Status:"),
        print_active(self._threader.active_threads())
        print_links(self._number)
        print_tocheck(self._cache.incoming_len())
        print_duration(curtime - start_time)
        # terminate the status line
        print >> stderr

    @synchronized(_lock)
    def start_log_output (self):
        """
        Start output of all configured loggers.
        """
        self._config['logger'].start_output()
        for logger in self._config['fileoutput']:
            logger.start_output()

    @synchronized(_lock)
    def log_url (self, url_data):
        """
        Log given URL.

        Locked public wrapper around _log_url().
        """
        self._log_url(url_data)

    def _log_url (self, url_data):
        """
        Send new url to all configured loggers.

        Does not take the lock itself; the callers in this class already
        hold it. Increments the counter of consumed URLs.
        """
        self._number += 1
        # a URL counts as having warnings only if at least one of its
        # warning tags is not configured to be ignored
        has_warnings = False
        for tag, content in url_data.warnings:
            if tag not in self._config["ignorewarnings"]:
                has_warnings = True
                break
        do_print = self._config["verbose"] or not url_data.valid or \
                   (has_warnings and self._config["warnings"])
        self._config['logger'].log_filter_url(url_data, do_print)
        for log in self._config['fileoutput']:
            log.log_filter_url(url_data, do_print)

    @synchronized(_lock)
    def end_log_output (self):
        """
        End output of all configured loggers.
        """
        self._config['logger'].end_output()
        for logger in self._config['fileoutput']:
            logger.end_output()

    @synchronized(_lock)
    def active_threads (self):
        """
        Return number of active threads.
        """
        return self._threader.active_threads()

    @synchronized(_lock)
    def get_country_name (self, host):
        """
        Return what linkcheck.checker.geoip.get_country() reports for
        host, or None if no GeoIP object is configured.
        """
        gi = self._config["geoip"]
        if gi:
            return linkcheck.checker.geoip.get_country(gi, host)
        return None