2014-01-08 21:33:04 +00:00
|
|
|
# Copyright (C) 2006-2014 Bastian Kleineidam
|
2006-05-13 18:07:46 +00:00
|
|
|
#
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
# (at your option) any later version.
|
|
|
|
|
#
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
#
|
2009-07-24 21:58:20 +00:00
|
|
|
# You should have received a copy of the GNU General Public License along
|
|
|
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
2006-05-13 18:07:46 +00:00
|
|
|
"""
|
|
|
|
|
Management of checking a queue of links with several threads.
|
|
|
|
|
"""
|
2008-04-27 11:39:21 +00:00
|
|
|
import os
|
2014-02-28 23:12:34 +00:00
|
|
|
import time
|
2020-05-15 18:37:04 +00:00
|
|
|
|
2020-06-23 16:28:31 +00:00
|
|
|
from .. import log, LOG_CHECK, LinkCheckerError, LinkCheckerInterrupt, plugins
|
2014-03-03 22:29:45 +00:00
|
|
|
from ..cache import urlqueue, robots_txt, results
|
2008-05-09 06:16:03 +00:00
|
|
|
from . import aggregator, console
|
2010-10-14 16:36:11 +00:00
|
|
|
|
|
|
|
|
|
2020-05-16 19:19:42 +00:00
|
|
|
def check_urls(aggregate):
    """Main check function; checks all configured URLs until interrupted
    with Ctrl-C.

    @return: None
    """
    # Optional pre-check login step.  A LinkCheckerError is treated as a
    # user/configuration problem: warn and give up on the whole run.  Any
    # other exception is unexpected: warn and re-raise to the caller.
    try:
        aggregate.visit_loginurl()
    except LinkCheckerError as msg:
        log.warn(LOG_CHECK, _("Problem using login URL: %(msg)s.") % dict(msg=msg))
        return
    except Exception as msg:
        log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % dict(msg=msg))
        raise
    # Logging must be up before any checking starts; failure here is fatal.
    try:
        aggregate.logger.start_log_output()
    except Exception as msg:
        log.error(LOG_CHECK, _("Error starting log output: %(msg)s.") % dict(msg=msg))
        raise
    try:
        # Only spin up worker threads when there is actually work queued.
        if not aggregate.urlqueue.empty():
            aggregate.start_threads()
        # Wait until the queue is drained (see check_url below).
        check_url(aggregate)
        aggregate.finish()
        aggregate.end_log_output()
    except LinkCheckerInterrupt:
        # Internal interrupt: must propagate past the generic handlers below.
        raise
    except KeyboardInterrupt:
        # First Ctrl-C: shut down gracefully, ignoring further interrupts.
        interrupt(aggregate)
    except RuntimeError:
        # Thread creation failed -- presumably an OS per-user thread limit;
        # warn and fall through to a clean shutdown.
        log.warn(
            LOG_CHECK,
            _(
                "Could not start a new thread. Check that the current user"
                " is allowed to start new threads."
            ),
        )
        abort(aggregate)
    except Exception:
        # Catching "Exception" is intentionally done. This saves the program
        # from libraries that raise all kinds of strange exceptions.
        console.internal_error()
        aggregate.logger.log_internal_error()
        abort(aggregate)
    # Not caught exceptions at this point are SystemExit and GeneratorExit,
    # and both should be handled by the calling layer.
|
|
2020-05-16 19:19:42 +00:00
|
|
|
def check_url(aggregate):
    """Block until the URL queue has been fully processed.

    Waits on the queue in 30-second slices; after each timeout the stopped
    checker threads are pruned, and waiting ends early once no checker
    threads are left running.
    """
    done = False
    while not done:
        try:
            aggregate.urlqueue.join(timeout=30)
            done = True
        except urlqueue.Timeout:
            # Periodic housekeeping every 30 seconds: drop finished threads
            # and stop waiting if nothing is still checking.
            aggregate.remove_stopped_threads()
            done = not any(aggregate.get_check_threads())
2006-09-21 13:43:54 +00:00
|
|
|
|
2020-05-16 19:19:42 +00:00
|
|
|
def interrupt(aggregate):
    """Interrupt execution and shut down, ignoring any subsequent
    interrupts.

    Retries the orderly shutdown until it completes without being hit by
    another Ctrl-C.
    """
    finished = False
    while not finished:
        try:
            log.warn(LOG_CHECK, _("interrupt; waiting for active threads to finish"))
            log.warn(LOG_CHECK, _("another interrupt will exit immediately"))
            abort(aggregate)
            finished = True
        except KeyboardInterrupt:
            # Swallow repeated interrupts while shutdown is in progress.
            pass
|
|
2020-05-16 19:19:42 +00:00
|
|
|
def abort(aggregate):
    """Ensure a clean shutdown, escalating to a forced exit on Ctrl-C."""
    while True:
        try:
            # Orderly teardown: stop the checks, finish up, close log output.
            aggregate.abort()
            aggregate.finish()
            aggregate.end_log_output(interrupt=True)
        except KeyboardInterrupt:
            # The user interrupted the cleanup itself: close the logs and
            # terminate the process the hard way.
            log.warn(LOG_CHECK, _("user abort; force shutdown"))
            aggregate.end_log_output(interrupt=True)
            abort_now()
        else:
            return
|
|
2020-05-16 19:19:42 +00:00
|
|
|
def abort_now():
    """Terminate the current process immediately, skipping all cleanup."""
    if os.name == 'posix':
        # POSIX: request termination with SIGTERM, give handlers a second
        # to run, then kill unconditionally with SIGKILL.
        import signal

        pid = os.getpid()
        os.kill(pid, signal.SIGTERM)
        time.sleep(1)
        os.kill(pid, signal.SIGKILL)
        return
    if os.name == 'nt':
        # Windows: os.abort() ends the process right away.
        os.abort()
        return
    # Any other platform: os._exit() is the best available hard exit.
    os._exit(3)
|
2020-05-16 19:19:42 +00:00
|
|
|
def get_aggregate(config):
    """Get an aggregator instance with given configuration.

    Builds the URL queue, robots.txt handler, plugin manager and result
    cache from the configuration and bundles them into one Aggregate.
    """
    queue = urlqueue.UrlQueue(max_allowed_urls=config["maxnumurls"])
    robots = robots_txt.RobotsTxt(config["useragent"])
    manager = plugins.PluginManager(config)
    cache = results.ResultCache(config["resultcachesize"])
    return aggregator.Aggregate(config, queue, robots, manager, cache)