linkchecker/linkcheck/director/aggregator.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Aggregate needed object instances for checker threads.
"""
import threading
import thread
import requests
import time
import random
from .. import log, LOG_CHECK, strformat, cookies
from ..decorators import synchronized
from ..cache import urlqueue
from . import logger, status, checker, interrupt


# Module-level re-entrant locks; being module globals they are shared by
# every Aggregate instance.
# Used by the Aggregate methods that touch the thread list and the
# per-thread request sessions.
_threads_lock = threading.RLock()
# Used by Aggregate.wait_for_host() for the per-host timing table.
_hosts_lock = threading.RLock()
# Used by Aggregate.add_downloaded_bytes() for the byte counter.
_downloadedbytes_lock = threading.RLock()

def new_request_session(config):
    """Build and return a requests session set up from the configuration.

    The redirect limit is taken from config["maxhttpredirects"]. When
    config["cookiefile"] is set, every entry produced by
    cookies.from_file() for that file is merged into the session cookies.
    """
    new_session = requests.Session()
    new_session.max_redirects = config["maxhttpredirects"]
    # XXX proxies
    cookie_path = config["cookiefile"]
    if not cookie_path:
        # No cookie file configured: the session is ready as it is.
        return new_session
    # NOTE(review): merge_cookies() is documented for dict or CookieJar
    # input -- confirm that the entries yielded by cookies.from_file() are
    # of one of those types, otherwise the merge may not add anything.
    for entry in cookies.from_file(cookie_path):
        merged = requests.cookies.merge_cookies(new_session.cookies, entry)
        new_session.cookies = merged
    return new_session


class Aggregate (object):
    """Store thread-safe data collections for checker threads.

    An instance bundles what the threads of one check run share: the
    configuration, the URL queue, the logger, one request session per
    thread, the per-host throttling table and the downloaded-bytes
    counter.

    NOTE(review): methods decorated with synchronized(lock) are presumably
    executed with that lock held -- see the decorators module to confirm.
    """

    # Name prefix identifying URL checker threads in self.threads.
    _check_thread_prefix = "CheckThread-"

    def __init__ (self, config, urlqueue, robots_txt, plugin_manager,
                  result_cache):
        """Store given link checking objects.

        The config mapping is read for "maxrequestspersecond" here and
        for "status", "status_wait_seconds", "maxrunseconds", "threads"
        and "aborttimeout" by the other methods. A value of 0 for
        "maxrequestspersecond" raises ZeroDivisionError.
        """
        self.config = config
        self.urlqueue = urlqueue
        self.logger = logger.Logger(config)
        self.threads = []
        # Maps thread identifier -> request session of that thread.
        self.request_sessions = {}
        self.robots_txt = robots_txt
        self.plugin_manager = plugin_manager
        self.result_cache = result_cache
        # Maps host -> earliest time.time() value for the next request.
        self.times = {}
        requests_per_second = config["maxrequestspersecond"]
        # The pause between two requests to one host is drawn from
        # [wait_time_min, wait_time_max] in wait_for_host().
        self.wait_time_min = 1.0 / requests_per_second
        self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
        self.downloaded_bytes = 0

    @synchronized(_threads_lock)
    def start_threads (self):
        """Spawn threads for URL checking and status printing.

        A status thread is started when config["status"] is set and an
        interrupt thread when config["maxrunseconds"] is set. If
        config["threads"] is positive that many checker threads are
        started; otherwise a request session is registered for the
        calling thread and the URLs are checked in the calling thread.
        """
        if self.config["status"]:
            t = status.Status(self, self.config["status_wait_seconds"])
            t.start()
            self.threads.append(t)
        if self.config["maxrunseconds"]:
            t = interrupt.Interrupt(self.config["maxrunseconds"])
            t.start()
            self.threads.append(t)
        num = self.config["threads"]
        if num > 0:
            for dummy in range(num):
                # Each checker gets add_request_session as a callback.
                t = checker.Checker(self.urlqueue, self.logger, self.add_request_session)
                self.threads.append(t)
                t.start()
        else:
            # No worker threads: do all checking in the calling thread.
            self.request_sessions[thread.get_ident()] = new_request_session(self.config)
            checker.check_urls(self.urlqueue, self.logger)

    @synchronized(_threads_lock)
    def add_request_session(self):
        """Add a request session for current thread."""
        session = new_request_session(self.config)
        self.request_sessions[thread.get_ident()] = session

    @synchronized(_threads_lock)
    def get_request_session(self):
        """Get the request session for current thread.

        Raises KeyError when no session was registered for the calling
        thread.
        """
        return self.request_sessions[thread.get_ident()]

    def wait_for_host(self, host):
        """Throttle requests to one host.

        Blocks the calling thread until the host's next request slot is
        due, and reserves the following slot a random pause between
        wait_time_min and wait_time_max later.

        The slot is reserved while holding the hosts lock, but the sleep
        happens after the lock is released, so a thread waiting for one
        host does not hold up threads that want to contact other hosts.
        """
        with _hosts_lock:
            now = time.time()
            # Start of this request's slot: the stored due time, or right
            # now when the host is unknown or its due time has passed.
            start = max(self.times.get(host, now), now)
            pause = random.uniform(self.wait_time_min, self.wait_time_max)
            self.times[host] = start + pause
        delay = start - now
        if delay > 0:
            time.sleep(delay)

    @synchronized(_threads_lock)
    def print_active_threads (self):
        """Log all currently active threads.

        With debugging enabled for LOG_CHECK the checker thread names
        (without their common prefix) are logged one per line; the number
        of checker threads and the abort timeout are always logged.
        """
        debug = log.is_debug(LOG_CHECK)
        names = list(self.get_check_threads())
        if debug and names:
            log.info(LOG_CHECK, _("These URLs are still active:"))
            prefix_len = len(self._check_thread_prefix)
            for name in names:
                log.info(LOG_CHECK, name[prefix_len:])
        args = dict(
            num=len(names),
            timeout=strformat.strduration_long(self.config["aborttimeout"]),
        )
        log.info(LOG_CHECK, _("%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop.") % args)

    @synchronized(_threads_lock)
    def get_check_threads(self):
        """Return iterator of checker thread names.

        The names are collected eagerly, so the thread list is only read
        inside this call and not later while the caller iterates.
        """
        prefix = self._check_thread_prefix
        names = [t.getName() for t in self.threads]
        return iter([name for name in names if name.startswith(prefix)])

    def cancel (self):
        """Shut down the URL queue by calling its do_shutdown() method."""
        self.urlqueue.do_shutdown()

    def abort (self):
        """Print still-active URLs and shut down the URL queue.

        Afterwards the queue is joined for at most config["aborttimeout"]
        seconds. Raises KeyboardInterrupt when that join times out.
        """
        self.print_active_threads()
        self.cancel()
        timeout = self.config["aborttimeout"]
        try:
            self.urlqueue.join(timeout=timeout)
        except urlqueue.Timeout:
            log.warn(LOG_CHECK, "Abort timed out after %d seconds, stopping application." % timeout)
            raise KeyboardInterrupt()

    @synchronized(_threads_lock)
    def remove_stopped_threads (self):
        """Remove the stopped threads from the internal thread list."""
        self.threads = [t for t in self.threads if t.is_alive()]

    @synchronized(_threads_lock)
    def finish (self):
        """Shut down a non-empty URL queue and stop all known threads.

        This calls stop() on each thread in the internal list; it does
        not join them.
        """
        if not self.urlqueue.empty():
            # This happens when all checker threads died.
            self.cancel()
        for t in self.threads:
            t.stop()

    @synchronized(_threads_lock)
    def is_finished (self):
        """Determine if checking is finished.

        Returns True when the URL queue is empty and, after dropping the
        stopped threads, no thread is left in the internal list.
        """
        self.remove_stopped_threads()
        return self.urlqueue.empty() and not self.threads

    @synchronized(_downloadedbytes_lock)
    def add_downloaded_bytes(self, numbytes):
        """Add to number of downloaded bytes."""
        self.downloaded_bytes += numbytes

    def end_log_output(self):
        """Print ending output to log.

        Passes the downloaded byte count and the number of entries in the
        result cache on to the logger.
        """
        kwargs = dict(
            downloaded_bytes=self.downloaded_bytes,
            num_urls=len(self.result_cache),
        )
        self.logger.end_log_output(**kwargs)