# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Management of checking a queue of links with several threads.
"""
import time
import os
import thread
import urlparse
from cStringIO import StringIO
from .. import log, LOG_CHECK, LinkCheckerInterrupt, cookies, dummy
from ..cache import urlqueue, robots_txt, cookie, connection
from . import aggregator, console
from ..httplib2 import HTTPMessage


def visit_loginurl (aggregate):
    """Check for a login URL and visit it."""
    config = aggregate.config
    url = config["loginurl"]
    if not url:
        return
    try:
        from twill import commands as tc
    except ImportError:
        log.warn(LOG_CHECK, _("Could not import twill for login URL visit"))
        return
    log.debug(LOG_CHECK, u"Visiting login URL %s", url)
    configure_twill(tc)
    tc.go(url)
    if tc.get_browser().get_code() != 200:
        log.warn(LOG_CHECK, _("Error visiting login URL %(url)s.") % \
            {"url": url})
        return
    submit_login_form(config, url, tc)
    if tc.get_browser().get_code() != 200:
        log.warn(LOG_CHECK, _("Error posting form at login URL %(url)s.") % \
            {"url": url})
        return
    store_cookies(tc.get_browser().cj, aggregate.cookies, url)
    resulturl = tc.get_browser().get_url()
    log.debug(LOG_CHECK, u"URL after POST is %s" % resulturl)
    # add result URL to check list
    from ..checker import get_url_from
    aggregate.urlqueue.put(get_url_from(resulturl, 0, aggregate))


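# Added note: the login handling above is driven entirely by configuration
# values: "loginurl" (the page containing the login form), "loginuserfield"
# and "loginpasswordfield" (the CGI field names filled with the credentials
# from config.get_user_password()), and "loginextrafields" (extra name/value
# pairs submitted with the form). A hedged linkcheckerrc sketch follows; the
# option names are taken from this module, the section name is an assumption:
#
#   [authentication]
#   loginurl=http://example.com/login
#   loginuserfield=username
#   loginpasswordfield=password

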
def configure_twill (tc):
    """Configure twill to be used by LinkChecker.
    Note that there is no need to set a proxy since twill uses the same
    proxy settings (provided by urllib) as LinkChecker does.
    """
    # make sure readonly controls are writeable (might be needed)
    tc.config("readonly_controls_writeable", True)
    # fake IE 6.0 to talk sense into some sites (e.g. SourceForge)
    tc.agent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)")
    # tell twill to shut up
    tc.OUT = dummy.Dummy()
    from twill import browser
    browser.OUT = dummy.Dummy()
    # set debug level
    if log.is_debug(LOG_CHECK):
        tc.debug("http", 1)


def submit_login_form (config, url, tc):
    """Fill and submit login form."""
    user, password = config.get_user_password(url)
    cgiuser = config["loginuserfield"]
    cgipassword = config["loginpasswordfield"]
    formname = search_formname((cgiuser, cgipassword), tc)
    tc.formvalue(formname, cgiuser, user)
    tc.formvalue(formname, cgipassword, password)
    for key, value in config["loginextrafields"].items():
        tc.formvalue(formname, key, value)
    tc.submit()


def search_formname (fieldnames, tc):
    """Search for a form that has all the given CGI field names."""
    browser = tc.get_browser()
    for form in browser.get_all_forms():
        for name in fieldnames:
            try:
                browser.get_form_field(form, name)
            except tc.TwillException:
                break
        else:
            return form.name or form.attrs.get('id')
    # none found
    return None


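# Added note: search_formname() uses Python's for/else construct; the else
# branch runs only when the inner loop finishes without a break, i.e. when
# every requested field name was found in the current form. It returns the
# form's name attribute, its id attribute as a fallback, or None when no
# form matches.

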
def store_cookies (cookiejar, cookiecache, url):
    """Store cookies from the cookiejar into the cookiecache."""
    cookielst = []
    for c in cookiejar:
        cookielst.append("Set-Cookie2: %s" % cookies.cookie_str(c))
    log.debug(LOG_CHECK, "Store cookies %s", cookielst)
    headers = HTTPMessage(StringIO("\r\n".join(cookielst)))
    urlparts = urlparse.urlsplit(url)
    scheme, host, path = urlparts[0:3]
    cookiecache.add(headers, scheme, host, path)


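# Added note: store_cookies() serializes each cookie as a "Set-Cookie2:"
# header line and parses the joined lines back through HTTPMessage, so the
# cookie cache receives ordinary header objects. An illustrative example of
# one such line (cookie name and values assumed, exact attribute formatting
# depends on cookies.cookie_str()):
#
#   Set-Cookie2: session="abc123"; domain="example.com"; path="/"

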
def check_urls (aggregate):
    """Main check function; checks all configured URLs until interrupted
    with Ctrl-C.
    @return: None
    """
    try:
        visit_loginurl(aggregate)
    except Exception, msg:
        log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % \
            {'msg': str(msg)})
        raise
    try:
        aggregate.logger.start_log_output()
        if not aggregate.urlqueue.empty():
            aggregate.start_threads()
        check_url(aggregate)
        aggregate.finish()
        aggregate.logger.end_log_output()
    except LinkCheckerInterrupt:
        raise
    except KeyboardInterrupt:
        interrupt(aggregate)
    except thread.error:
        log.warn(LOG_CHECK,
            _("Could not start a new thread. Check that the current user" \
              " is allowed to start new threads."))
        abort(aggregate)
    except Exception:
        # Catching "Exception" is intentional. It protects the program
        # from badly-programmed libraries that raise all kinds of strange
        # exceptions.
        console.internal_error()
        abort(aggregate)
    # Exceptions not caught at this point are SystemExit and GeneratorExit;
    # both should be handled by the calling layer.


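# Added summary of check_urls(), derived from the code above:
#   1. visit the optional login URL (visit_loginurl),
#   2. start log output and the checker threads,
#   3. wait for the URL queue to drain (check_url),
#   4. finish the aggregate and close the log output,
#   5. on errors or user interrupts, shut down via interrupt() or abort().

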
def check_url (aggregate):
    """Helper function waiting for URL queue."""
    while True:
        try:
            aggregate.urlqueue.join(timeout=0.5)
            break
        except urlqueue.Timeout:
            # Since urlqueue.join() is not interruptible, add a timeout
            # and a one-second slumber.
            time.sleep(1)
            aggregate.remove_stopped_threads()
            if not aggregate.threads:
                break
        if aggregate.wanted_stop:
            # some other thread wants us to stop
            raise KeyboardInterrupt


def interrupt (aggregate):
    """Interrupt execution and shut down, ignoring any subsequent
    interrupts."""
    while True:
        try:
            log.warn(LOG_CHECK,
                _("user interrupt; waiting for active threads to finish"))
            log.warn(LOG_CHECK,
                _("another interrupt will exit immediately"))
            abort(aggregate)
            break
        except KeyboardInterrupt:
            pass


def abort (aggregate):
    """Helper function to ensure a clean shutdown."""
    while True:
        try:
            aggregate.abort()
            aggregate.finish()
            aggregate.logger.end_log_output()
            break
        except KeyboardInterrupt:
            log.warn(LOG_CHECK, _("user abort; force shutdown"))
            abort_now()


def abort_now ():
    """Force exit of current process without cleanup."""
    if os.name == 'posix':
        # Unix systems can use SIGKILL
        import signal
        os.kill(os.getpid(), signal.SIGKILL)
    elif os.name == 'nt':
        # NT has os.abort()
        os.abort()
    else:
        # All other systems have os._exit() as best shot.
        os._exit(3)


def get_aggregate (config):
    """Get an aggregator instance with given configuration."""
    _urlqueue = urlqueue.UrlQueue()
    connections = connection.ConnectionPool(wait=config["wait"])
    cookies = cookie.CookieJar()
    _robots_txt = robots_txt.RobotsTxt()
    return aggregator.Aggregate(config, _urlqueue, connections,
        cookies, _robots_txt)
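

# A minimal usage sketch (an assumption about the calling layer, not part of
# this module): a caller builds a configuration, creates the aggregate,
# queues the start URLs and runs the check. The get_url_from() signature is
# taken from visit_loginurl() above; the Configuration class name is assumed.
#
#   from linkcheck import configuration, checker
#   from linkcheck.director import get_aggregate, check_urls
#
#   config = configuration.Configuration()
#   aggregate = get_aggregate(config)
#   aggregate.urlqueue.put(
#       checker.get_url_from("http://example.com/", 0, aggregate))
#   check_urls(aggregate)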