linkchecker/linkcheck/director/aggregator.py
2014-03-14 23:46:17 +01:00

182 lines
6.6 KiB
Python

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Aggregate needed object instances for checker threads.
"""
import threading
import thread
import requests
import time
import random
from .. import log, LOG_CHECK, strformat, cookies
from ..decorators import synchronized
from ..cache import urlqueue
from . import logger, status, checker, interrupt
_threads_lock = threading.RLock()
_hosts_lock = threading.RLock()
_downloadedbytes_lock = threading.RLock()
def new_request_session(config):
"""Create a new request session."""
session = requests.Session()
session.max_redirects = config["maxhttpredirects"]
# XXX proxies
if config["cookiefile"]:
for cookie in cookies.from_file(config["cookiefile"]):
session.cookies = requests.cookies.merge_cookies(session.cookies, cookie)
return session
class Aggregate (object):
"""Store thread-safe data collections for checker threads."""
def __init__ (self, config, urlqueue, robots_txt, plugin_manager,
result_cache):
"""Store given link checking objects."""
self.config = config
self.urlqueue = urlqueue
self.logger = logger.Logger(config)
self.threads = []
self.request_sessions = {}
self.robots_txt = robots_txt
self.plugin_manager = plugin_manager
self.result_cache = result_cache
self.times = {}
requests_per_second = config["maxrequestspersecond"]
self.wait_time_min = 1.0 / requests_per_second
self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
self.downloaded_bytes = 0
@synchronized(_threads_lock)
def start_threads (self):
"""Spawn threads for URL checking and status printing."""
if self.config["status"]:
t = status.Status(self, self.config["status_wait_seconds"])
t.start()
self.threads.append(t)
if self.config["maxrunseconds"]:
t = interrupt.Interrupt(self.config["maxrunseconds"])
t.start()
self.threads.append(t)
num = self.config["threads"]
if num > 0:
for dummy in range(num):
t = checker.Checker(self.urlqueue, self.logger, self.add_request_session)
self.threads.append(t)
t.start()
else:
self.request_sessions[thread.get_ident()] = new_request_session(self.config)
checker.check_urls(self.urlqueue, self.logger)
@synchronized(_threads_lock)
def add_request_session(self):
"""Add a request session for current thread."""
session = new_request_session(self.config)
self.request_sessions[thread.get_ident()] = session
@synchronized(_threads_lock)
def get_request_session(self):
"""Get the request session for current thread."""
return self.request_sessions[thread.get_ident()]
@synchronized(_hosts_lock)
def wait_for_host(self, host):
"""Throttle requests to one host."""
t = time.time()
if host in self.times:
due_time = self.times[host]
if due_time > t:
wait = due_time - t
time.sleep(wait)
t = time.time()
wait_time = random.uniform(self.wait_time_min, self.wait_time_max)
self.times[host] = t + wait_time
@synchronized(_threads_lock)
def print_active_threads (self):
"""Log all currently active threads."""
debug = log.is_debug(LOG_CHECK)
if debug:
first = True
for name in self.get_check_threads():
if first:
log.info(LOG_CHECK, _("These URLs are still active:"))
first = False
log.info(LOG_CHECK, name[12:])
args = dict(
num=len([x for x in self.threads if x.getName().startswith("CheckThread-")]),
timeout=strformat.strduration_long(self.config["aborttimeout"]),
)
log.info(LOG_CHECK, _("%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop.") % args)
@synchronized(_threads_lock)
def get_check_threads(self):
"""Return iterator of checker threads."""
for t in self.threads:
name = t.getName()
if name.startswith("CheckThread-"):
yield name
def cancel (self):
"""Empty the URL queue."""
self.urlqueue.do_shutdown()
def abort (self):
"""Print still-active URLs and empty the URL queue."""
self.print_active_threads()
self.cancel()
timeout = self.config["aborttimeout"]
try:
self.urlqueue.join(timeout=timeout)
except urlqueue.Timeout:
log.warn(LOG_CHECK, "Abort timed out after %d seconds, stopping application." % timeout)
raise KeyboardInterrupt()
@synchronized(_threads_lock)
def remove_stopped_threads (self):
"""Remove the stopped threads from the internal thread list."""
self.threads = [t for t in self.threads if t.is_alive()]
@synchronized(_threads_lock)
def finish (self):
"""Wait for checker threads to finish."""
if not self.urlqueue.empty():
# This happens when all checker threads died.
self.cancel()
for t in self.threads:
t.stop()
@synchronized(_threads_lock)
def is_finished (self):
"""Determine if checking is finished."""
self.remove_stopped_threads()
return self.urlqueue.empty() and not self.threads
@synchronized(_downloadedbytes_lock)
def add_downloaded_bytes(self, numbytes):
"""Add to number of downloaded bytes."""
self.downloaded_bytes += numbytes
def end_log_output(self):
"""Print ending output to log."""
kwargs = dict(
downloaded_bytes=self.downloaded_bytes,
num_urls = len(self.result_cache),
)
self.logger.end_log_output(**kwargs)