# linkchecker/linkcheck/director/aggregator.py
# Copyright (C) 2006-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Aggregate needed object instances for checker threads.
"""
import threading
import time
import random
import urllib.parse

import requests

from .. import log, LOG_CHECK, strformat, LinkCheckerError
from ..decorators import synchronized
from ..cache import urlqueue
from ..htmlutil import loginformsearch
from ..cookies import from_file
from . import logger, status, checker, interrupter

# Module-level re-entrant locks guarding Aggregate's shared mutable state:
# the thread list / per-thread request sessions, the per-host throttle
# timestamps, and the downloaded-bytes counter respectively.
_threads_lock = threading.RLock()
_hosts_lock = threading.RLock()
_downloadedbytes_lock = threading.RLock()
def new_request_session(config, cookies):
    """Create a new ``requests.Session`` configured for link checking.

    @param config: configuration mapping; must provide the keys
        ``maxhttpredirects``, ``useragent`` and ``cookiefile``
    @param cookies: a cookie jar to install on the session, or a false
        value to keep the session's default jar
    @return: the configured session
    """
    session = requests.Session()
    if cookies:
        # Reuse cookies collected earlier (e.g. from a login visit).
        session.cookies = cookies
    session.max_redirects = config["maxhttpredirects"]
    session.headers.update({"User-Agent": config["useragent"]})
    if config["cookiefile"]:
        # Add user-supplied cookies from the configured cookie file.
        for cookie in from_file(config["cookiefile"]):
            session.cookies.set_cookie(cookie)
    return session
class Aggregate:
    """Store thread-safe data collections for checker threads.

    Holds the URL queue, logger, checker threads, per-thread request
    sessions and per-host throttling timestamps. Access to shared mutable
    state is serialized through the module-level locks.
    """

    def __init__(self, config, urlqueue, robots_txt, plugin_manager, result_cache):
        """Store given link checking objects.

        @param config: the configuration mapping
        @param urlqueue: the queue of URLs to check
        @param robots_txt: shared robots.txt cache
        @param plugin_manager: manager for checker plugins
        @param result_cache: cache of already-checked URL results
        """
        self.config = config
        self.urlqueue = urlqueue
        self.logger = logger.Logger(config)
        self.threads = []
        # Maps thread ident -> requests session (one session per thread).
        self.request_sessions = {}
        self.robots_txt = robots_txt
        self.plugin_manager = plugin_manager
        self.result_cache = result_cache
        # Maps host -> earliest time the next request to it may start.
        self.times = {}
        # Cookie jar shared with new sessions once visit_loginurl() ran.
        self.cookies = None
        requests_per_second = config["maxrequestspersecond"]
        # Randomized per-host wait interval [min, max] between requests.
        self.wait_time_min = 1.0 / requests_per_second
        self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
        self.downloaded_bytes = 0

    def visit_loginurl(self):
        """Check for a login URL and visit it, storing resulting cookies.

        @raise LinkCheckerError: if user/password are missing, the login
            form cannot be found, or the login set no cookies
        """
        url = self.config["loginurl"]
        if not url:
            # No login URL configured; nothing to do.
            return
        user, password = self.config.get_user_password(url)
        if not user and not password:
            raise LinkCheckerError(
                "loginurl is configured but neither user nor password are set"
            )
        session = new_request_session(self.config, self.cookies)
        log.debug(LOG_CHECK, "Getting login form %s", url)
        kwargs = dict(timeout=self.config["timeout"])
        # XXX: sslverify? can we reuse HttpUrl.get_request_kwargs()
        # somehow?
        response = session.get(url, **kwargs)
        response.raise_for_status()
        # Only search for the form fields we have credentials for.
        cgiuser = self.config["loginuserfield"] if user else None
        cgipassword = self.config["loginpasswordfield"] if password else None
        form = loginformsearch.search_form(response.text, cgiuser, cgipassword)
        if not form:
            raise LinkCheckerError("Login form not found at %s" % url)
        if user:
            form.data[cgiuser] = user
        if password:
            form.data[cgipassword] = password
        # Merge any extra user-configured form fields.
        for key, value in self.config["loginextrafields"].items():
            form.data[key] = value
        formurl = urllib.parse.urljoin(url, form.url)
        log.debug(LOG_CHECK, "Posting login data to %s", formurl)
        response = session.post(formurl, data=form.data, **kwargs)
        response.raise_for_status()
        # Keep the login cookies for all sessions created afterwards.
        self.cookies = session.cookies
        if len(self.cookies) == 0:
            raise LinkCheckerError("No cookies set by login URL %s" % url)

    @synchronized(_threads_lock)
    def start_threads(self):
        """Spawn threads for URL checking and status printing.

        With a thread count of zero, checking runs synchronously in the
        calling thread instead.
        """
        if self.config["status"]:
            t = status.Status(self, self.config["status_wait_seconds"])
            t.start()
            self.threads.append(t)
        if self.config["maxrunseconds"]:
            # Interrupter aborts the run after the configured time limit.
            t = interrupter.Interrupt(self.config["maxrunseconds"])
            t.start()
            self.threads.append(t)
        num = self.config["threads"]
        if num > 0:
            for dummy in range(num):
                t = checker.Checker(
                    self.urlqueue, self.logger, self.add_request_session
                )
                self.threads.append(t)
                t.start()
        else:
            # Single-threaded mode: check URLs in the current thread.
            self.request_sessions[threading.get_ident()] = new_request_session(
                self.config, self.cookies
            )
            checker.check_urls(self.urlqueue, self.logger)

    @synchronized(_threads_lock)
    def add_request_session(self):
        """Add a request session for the current thread."""
        session = new_request_session(self.config, self.cookies)
        self.request_sessions[threading.get_ident()] = session

    @synchronized(_threads_lock)
    def get_request_session(self):
        """Get the request session for the current thread."""
        return self.request_sessions[threading.get_ident()]

    @synchronized(_hosts_lock)
    def wait_for_host(self, host):
        """Throttle requests to one host.

        Sleeps until the host's next allowed request time, then schedules
        a new randomized due time. Note the sleep happens while holding
        the hosts lock, serializing throttled threads.
        """
        t = time.time()
        if host in self.times:
            due_time = self.times[host]
            if due_time > t:
                wait = due_time - t
                time.sleep(wait)
                t = time.time()
        wait_time = random.uniform(self.wait_time_min, self.wait_time_max)
        self.times[host] = t + wait_time

    @synchronized(_threads_lock)
    def print_active_threads(self):
        """Log all currently active checker threads."""
        debug = log.is_debug(LOG_CHECK)
        if debug:
            first = True
            for name in self.get_check_threads():
                if first:
                    log.info(LOG_CHECK, _("These URLs are still active:"))
                    first = False
                # Strip the "CheckThread-" prefix to log just the URL.
                log.info(LOG_CHECK, name[12:])
        args = dict(
            num=len(
                [x for x in self.threads if x.name.startswith("CheckThread-")]
            ),
            timeout=strformat.strduration_long(self.config["aborttimeout"]),
        )
        log.info(
            LOG_CHECK,
            _(
                "%(num)d URLs are still active. After a timeout of %(timeout)s"
                " the active URLs will stop."
            )
            % args,
        )

    @synchronized(_threads_lock)
    def get_check_threads(self):
        """Return iterator of checker thread names."""
        for t in self.threads:
            if t.name.startswith("CheckThread-"):
                yield t.name

    def cancel(self):
        """Empty the URL queue."""
        self.urlqueue.do_shutdown()

    def abort(self):
        """Print still-active URLs and empty the URL queue.

        @raise KeyboardInterrupt: if the queue does not drain within the
            configured abort timeout
        """
        self.print_active_threads()
        self.cancel()
        timeout = self.config["aborttimeout"]
        try:
            self.urlqueue.join(timeout=timeout)
        except urlqueue.Timeout:
            log.warn(
                LOG_CHECK,
                "Abort timed out after %d seconds, stopping application." % timeout,
            )
            raise KeyboardInterrupt()

    @synchronized(_threads_lock)
    def remove_stopped_threads(self):
        """Remove the stopped threads from the internal thread list."""
        self.threads = [t for t in self.threads if t.is_alive()]

    @synchronized(_threads_lock)
    def finish(self):
        """Wait for checker threads to finish."""
        if not self.urlqueue.empty():
            # This happens when all checker threads died.
            self.cancel()
        for t in self.threads:
            t.stop()
        for t in self.threads:
            t.join(timeout=1.0)

    @synchronized(_threads_lock)
    def is_finished(self):
        """Determine if checking is finished."""
        self.remove_stopped_threads()
        return self.urlqueue.empty() and not self.threads

    @synchronized(_downloadedbytes_lock)
    def add_downloaded_bytes(self, numbytes):
        """Add to number of downloaded bytes."""
        self.downloaded_bytes += numbytes

    def end_log_output(self, **kwargs):
        """Print ending output to log."""
        kwargs.update(
            dict(
                downloaded_bytes=self.downloaded_bytes,
                num_urls=len(self.result_cache),
            )
        )
        self.logger.end_log_output(**kwargs)