split off cache and url consumer routines into separate classes

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1432 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-08-19 21:35:47 +00:00
parent c49ac001d1
commit f2e7ca6040
12 changed files with 303 additions and 168 deletions

View file

@ -137,55 +137,42 @@ acap # application configuration access protocol
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
def print_status (config, curtime, start_time):
    """Write a one-line progress summary of the url queues to stderr."""
    queued = len(config.urls)
    checked = config['linknumber']
    running = config.threader.active_threads()
    elapsed = linkcheck.strformat.strduration(curtime - start_time)
    msg = _("%5d urls queued, %4d links checked, %2d active threads, runtime %s")
    print >> sys.stderr, msg % (queued, checked, running, elapsed)
# main check function
def check_urls (config):
def check_urls (consumer):
"""Gets a complete configuration object as parameter where all
runtime-dependent options are stored. If you call this function
more than once, you can specify different configurations.
In the config object there are functions to get new URLs to check,
and to perform the actual checking.
"""
config.logger_start_output()
try:
start_time = time.time()
status_time = start_time
while True:
if config.has_more_urls():
config.check_url(config.get_url())
elif config.finished():
break
else:
# active connections are downloading/parsing, so
# wait a little
time.sleep(0.1)
if config['status']:
curtime = time.time()
if (curtime - status_time) > 5:
print_status(config, curtime, start_time)
status_time = curtime
config.logger_end_output()
_check_urls(consumer)
except KeyboardInterrupt:
config.finish()
config.logger_end_output()
active = config.threader.active_threads()
consumer.finish()
linkcheck.log.warn(linkcheck.LOG_CHECK,
_("keyboard interrupt; waiting for %d active threads to finish"),
active)
consumer.active_threads())
raise
def _check_urls (consumer):
consumer.logger_start_output()
start_time = time.time()
status_time = start_time
while not consumer.finished():
url = consumer.get_url()
if url is not None:
consumer.check_url(url)
else:
# active connections are downloading/parsing, so
# wait a little
time.sleep(0.1)
if consumer.config['status']:
curtime = time.time()
if (curtime - status_time) > 5:
consumer.print_status(curtime, start_time)
status_time = curtime
consumer.logger_end_output()
# file extensions we can parse recursively
extensions = {
"html": re.compile(r'(?i)\.s?html?$'),
@ -237,9 +224,9 @@ def absolute_url (base_url, base_ref, parent_url):
return ""
def get_url_from (base_url, recursion_level, config, parent_url=None,
base_ref=None, line=0, column=0, name=None,
cmdline=None):
def get_url_from (base_url, recursion_level, consumer,
parent_url=None, base_ref=None, line=0, column=0,
name=None, cmdline=None):
"""get url data from given base data"""
if cmdline and linkcheck.url.url_needs_quoting(base_url):
base_url = linkcheck.url.url_quote(base_url)
@ -269,9 +256,10 @@ def get_url_from (base_url, recursion_level, config, parent_url=None,
# assume local file
else:
klass = linkcheck.checker.fileurl.FileUrl
if cmdline and url and config['strict'] and \
not (config['internlinks'] or config['externlinks']):
if cmdline and url and consumer.config['strict'] and \
not (consumer.config['internlinks'] or consumer.config['externlinks']):
# set automatic intern/extern stuff if no filter was given
set_intern_url(url, klass, config)
return klass(base_url, recursion_level, config, parent_url, base_ref,
set_intern_url(url, klass, consumer.config)
return klass(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)

View file

@ -0,0 +1,183 @@
# -*- coding: iso-8859-1 -*-
"""url consumer class"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import sys
try:
import threading
except ImportError:
import dummy_threading as threading
import linkcheck.threader
from linkcheck.i18n import _
class Consumer (object):
    """Consume urls from the url queue in a threaded manner.

    Holds the incoming url queue, the checker thread pool, the cache
    and the configured loggers.  All mutable state is guarded by a
    single lock.
    """

    def __init__ (self, config, cache):
        """initialize consumer data and threads"""
        self.config = config
        self.cache = cache
        # queue of url data objects waiting to be checked
        self.urls = []
        self.threader = linkcheck.threader.Threader()
        self._set_threads(config['threads'])
        self.logger = config['logger']
        self.fileoutput = config['fileoutput']
        # number of urls announced to the loggers so far
        self.linknumber = 0
        # one lock for the data
        self.lock = threading.Lock()

    def filter_url_queue (self):
        """remove already cached urls from queue"""
        # XXX disabled: calling cache/logger methods while holding
        # self.lock deadlocks (logger_new_url acquires the same lock)
        pass
        #self.lock.acquire()
        #try:
        #    urls = []
        #    for url_data in self.urls:
        #        if self.cache.check_cache(url_data):
        #            self.logger_new_url(url_data)
        #        else:
        #            urls.append(url_data)
        #    removed = len(self.urls) - len(urls)
        #    self.urls = urls
        #    print >> sys.stderr, \
        #      _("removed %d cached urls from incoming queue") % removed
        #finally:
        #    self.lock.release()

    def _set_threads (self, num):
        """set number of checker threads to start"""
        linkcheck.log.debug(linkcheck.LOG_CHECK,
                            "set threading with %d threads", num)
        self.threader.threads_max = num
        # with threads enabled, let the interpreter switch more often
        if num > 0:
            sys.setcheckinterval(50)
        else:
            sys.setcheckinterval(100)

    def check_url (self, url_data):
        """start new thread checking the given url"""
        self.threader.start_thread(url_data.check, ())

    def append_url (self, url_data):
        """add new url to list of urls to check"""
        # check syntax
        if not url_data.check_syntax():
            # wrong syntax, do not check any further
            return
        # check the cache
        if self.cache.check_cache(url_data):
            # already cached; log the cached result immediately
            self.logger_new_url(url_data)
            return
        self.lock.acquire()
        try:
            self.urls.append(url_data)
        finally:
            self.lock.release()

    def finished (self):
        """return True if checking is finished"""
        self.lock.acquire()
        try:
            return self.threader.finished() and len(self.urls) <= 0
        finally:
            self.lock.release()

    def get_url (self):
        """get first url in queue and return it, or None if queue is empty"""
        self.lock.acquire()
        try:
            if not self.urls:
                return None
            u = self.urls[0]
            del self.urls[0]
            return u
        finally:
            self.lock.release()

    def finish (self):
        """finish checking and send end-of-output message to logger"""
        self.lock.acquire()
        try:
            self.threader.finish()
        finally:
            self.lock.release()
        # note: called outside the lock to avoid deadlock with
        # logger_end_output which acquires the lock itself
        self.logger_end_output()

    def print_status (self, curtime, start_time):
        """print check status looking at url queues"""
        self.lock.acquire()
        try:
            active = self.threader.active_threads()
            links = self.linknumber
            tocheck = len(self.urls)
            duration = linkcheck.strformat.strduration(curtime - start_time)
            print >> sys.stderr, _("%5d urls queued, %4d links checked, "\
                              "%2d active threads, runtime %s")\
                              % (tocheck, links, active, duration)
        finally:
            self.lock.release()

    def logger_start_output (self):
        """start output of all configured loggers"""
        self.lock.acquire()
        try:
            if not self.config['quiet']:
                self.logger.start_output()
            for logger in self.fileoutput:
                logger.start_output()
        finally:
            self.lock.release()

    def logger_new_url (self, url_data):
        """send new url to all configured loggers"""
        self.lock.acquire()
        try:
            self.linknumber += 1
            # do_filter is consumed only by the disabled filtering below
            do_filter = (self.linknumber % 1000) == 0
            if not self.config['quiet'] and \
               (self.config["verbose"] or not url_data.valid or
                (url_data.warning and self.config["warnings"])):
                self.logger.new_url(url_data)
            for log in self.fileoutput:
                log.new_url(url_data)
        finally:
            self.lock.release()
        # XXX deadlock!
        #if do_filter:
        #    self.filter_url_queue()

    def logger_end_output (self):
        """end output of all configured loggers"""
        self.lock.acquire()
        try:
            if not self.config['quiet']:
                self.logger.end_output(linknumber=self.linknumber)
            for logger in self.fileoutput:
                logger.end_output(linknumber=self.linknumber)
        finally:
            self.lock.release()

    def active_threads (self):
        """return number of active threads"""
        self.lock.acquire()
        try:
            return self.threader.active_threads()
        finally:
            self.lock.release()

View file

@ -53,15 +53,12 @@ def get_index_html (dirname):
class FileUrl (urlbase.UrlBase):
"Url link with file scheme"
def __init__ (self,
base_url,
config,
recursion_level,
def __init__ (self, base_url, recursion_level, consumer,
parent_url = None,
base_ref = None, line=0, column=0, name=""):
super(FileUrl, self).__init__(base_url, config, recursion_level,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
super(FileUrl, self).__init__(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
if not (parent_url or base_ref or self.base_url.startswith("file:")):
self.base_url = os.path.expanduser(self.base_url)
if not self.base_url.startswith("/"):

View file

@ -32,11 +32,11 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
def check_connection (self):
# proxy support (we support only http)
self.set_proxy(self.config["proxy"].get(self.scheme))
self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
if self.proxy:
http = httpurl.HttpUrl(self.base_url,
self.recursion_level,
self.config,
self.consumer.config,
parent_url=self.parent_url,
base_ref=self.base_ref,
line=self.line,
@ -80,7 +80,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
# ready to connect
try:
self.url_connection = ftplib.FTP()
if self.config.get("debug"):
if self.consumer.config.get("debug"):
self.url_connection.set_debuglevel(1)
self.url_connection.connect(self.urlparts[1])
self.url_connection.login(_user, _password)

View file

@ -28,4 +28,4 @@ class HttpsUrl (httpurl.HttpUrl):
super(HttpsUrl, self).local_check()
else:
self.add_warning(_("%s url ignored")%self.scheme.capitalize())
self.log_me()
self.consumer.logger_new_url(self)

View file

@ -46,11 +46,11 @@ _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
"Url link with http scheme"
def __init__ (self, base_url, recursion_level, config, parent_url=None,
base_ref=None, line=0, column=0, name=""):
super(HttpUrl, self).__init__(base_url, recursion_level, config,
parent_url=parent_url, base_ref=base_ref, line=line,
column=column, name=name)
def __init__ (self, base_url, recursion_level, consumer,
parent_url=None, base_ref=None, line=0, column=0, name=""):
super(HttpUrl, self).__init__(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref, line=line,
column=column, name=name)
self.aliases = []
self.max_redirects = 5
self.has301status = False
@ -109,13 +109,13 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
| extension-code
"""
# set the proxy, so a 407 status after this is an error
self.set_proxy(self.config["proxy"].get(self.scheme))
self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
if self.proxy:
self.add_info(_("Using Proxy %r") % self.proxy)
self.headers = None
self.auth = None
self.cookies = []
if not self.robots_txt_allows_url():
if not self.consumer.cache.robots_txt_allows_url(self):
self.add_warning(
_("Access denied by robots.txt, checked only syntax"))
return
@ -235,6 +235,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
self.set_result(
_("recursive redirection encountered:\n %s") % \
"\n => ".join(redirect_cache), valid=False)
self.consumer.logger_new_url(self)
return -1, response
redirect_cache.append(redirected)
# remember this alias
@ -252,11 +253,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
self.has301status = True
self.aliases.append(redirected)
# check cache again on possibly changed URL
key = self.get_cache_key()
if self.config.url_cache_has_key(key):
self.copy_from_cache(self.config.url_cache_get(key))
self.cached = True
self.log_me()
if self.consumer.cache.check_cache(self):
self.consumer.logger_new_url(self)
return -1, response
# check if we still have a http url, it could be another
# scheme, eg https or news
@ -266,15 +264,14 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
"the original url was %r.") % self.url)
# make new Url object
newobj = linkcheck.checker.get_url_from(
redirected, self.recursion_level, self.config,
redirected, self.recursion_level, self.consumer,
parent_url=self.parent_url, base_ref=self.base_ref,
line=self.line, column=self.column, name=self.name)
newobj.warning = self.warning
newobj.info = self.info
# append new object to queue
self.config.append_url(newobj)
self.consumer.append_url(newobj)
# pretend to be finished and logged
self.cached = True
return -1, response
# new response data
response = self._get_http_response()
@ -302,10 +299,10 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
# no content
self.add_warning(response.reason)
# store cookies for valid links
if self.config['cookies']:
if self.consumer.config['cookies']:
for c in self.cookies:
self.add_info("Cookie: %s" % c)
out = self.config.storeCookies(self.headers, self.urlparts[1])
out = self.consumer.config.storeCookies(self.headers, self.urlparts[1])
for h in out:
self.add_info(h)
if response.status >= 200:
@ -335,14 +332,16 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
if self.url_connection:
self.close_connection()
self.url_connection = self.get_http_object(host, scheme)
url = urlparse.urlunsplit(self.urlparts)
if self.no_anchor:
qurlparts[4] = ''
anchor = ''
else:
anchor = self.urlparts[4]
if self.proxy:
path = urlparse.urlunsplit(self.urlparts)
path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
self.urlparts[2], self.urlparts[3], anchor))
else:
path = urlparse.urlunsplit(('', '', self.urlparts[2],
self.urlparts[3], self.urlparts[4]))
self.urlparts[3], anchor))
self.url_connection.putrequest(self.method, path, skip_host=True)
self.url_connection.putheader("Host", host)
# userinfo is from http://user@pass:host/
@ -360,8 +359,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
linkcheck.configuration.UserAgent)
self.url_connection.putheader("Accept-Encoding",
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
if self.config['cookies']:
self.cookies = self.config.getCookies(self.urlparts[1],
if self.consumer.config['cookies']:
self.cookies = self.consumer.config.getCookies(self.urlparts[1],
self.urlparts[2])
for c in self.cookies:
self.url_connection.putheader("Cookie", c)
@ -375,7 +374,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
h = linkcheck.httplib2.HTTPSConnection(host)
else:
raise linkcheck.LinkCheckerError("invalid url scheme %s" % scheme)
if self.config.get("debug"):
if self.consumer.config.get("debug"):
h.set_debuglevel(1)
h.connect()
return h
@ -447,15 +446,3 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
def get_robots_txt_url (self):
return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])
def robots_txt_allows_url (self):
roboturl = self.get_robots_txt_url()
linkcheck.log.debug(linkcheck.LOG_CHECK, "robots.txt url %r",
roboturl)
linkcheck.log.debug(linkcheck.LOG_CHECK, "url %r", self.url)
if not self.config.robots_txt_cache_has_key(roboturl):
rp = linkcheck.robotparser2.RobotFileParser()
rp.set_url(roboturl)
rp.read()
self.config.robots_txt_cache_set(roboturl, rp)
rp = self.config.robots_txt_cache_get(roboturl)
return rp.can_fetch(linkcheck.configuration.UserAgent, self.url)

View file

@ -25,7 +25,7 @@ class IgnoredUrl (urlbase.UrlBase):
def local_check (self):
self.add_warning(_("%s url ignored")%self.scheme.capitalize())
self.log_me()
self.consumer.logger_new_url(self)
def can_get_content (self):
return False

View file

@ -16,7 +16,6 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import re
import sys
import cgi
import urllib

View file

@ -46,7 +46,7 @@ class NntpUrl (urlbase.UrlBase):
linkcheck.log.debug(linkcheck.LOG_CHECK, self.urlparts)
def check_connection (self):
nntpserver = self.urlparts[1] or self.config["nntpserver"]
nntpserver = self.urlparts[1] or self.consumer.config["nntpserver"]
if not nntpserver:
self.add_warning(_("No NNTP server specified, skipping this URL"))
return

View file

@ -49,14 +49,14 @@ class TelnetUrl (urlconnect.UrlConnect):
def local_check (self):
if not self.host:
self.set_result(_("Host is empty"), valid=False)
self.log_me()
self.consumer.logger_new_url(self)
return
super(TelnetUrl, self).local_check()
def check_connection (self):
super(TelnetUrl, self).check_connection()
self.url_connection = telnetlib.Telnet()
if self.config.get("debug"):
if self.consumer.config.get("debug"):
self.url_connection.set_debuglevel(1)
self.url_connection.open(self.host, self.port)
if self.user:

View file

@ -79,7 +79,7 @@ def print_app_info ():
class UrlBase (object):
"""An URL with additional information like validity etc."""
def __init__ (self, base_url, recursion_level, config,
def __init__ (self, base_url, recursion_level, consumer,
parent_url = None, base_ref = None,
line = 0, column = 0, name = ""):
"""Initialize check data, and store given variables.
@ -100,8 +100,9 @@ class UrlBase (object):
self.parent_url = parent_url
self.anchor = None
self.recursion_level = recursion_level
self.config = config
self.consumer = consumer
self.result = ""
self.cached = False
self.valid = True
self.warning = linkcheck.containers.SetList()
self.info = linkcheck.containers.SetList()
@ -111,7 +112,6 @@ class UrlBase (object):
self.dltime = -1
self.dlsize = -1
self.checktime = 0
self.cached = False
self.url_connection = None
self.extern = (1, 0)
self.data = None
@ -169,6 +169,7 @@ class UrlBase (object):
self.info.extend(cache_data["info"])
self.valid = cache_data["valid"]
self.dltime = cache_data["dltime"]
self.cached = True
def get_cache_data (self):
"""return all data values that should be put in the cache"""
@ -186,13 +187,12 @@ class UrlBase (object):
return [key]
def is_cached (self):
key = self.get_cache_key()
return self.cached or self.config.url_seen_has_key(key)
return self.consumer.cache.url_is_cached(self.get_cache_key())
def get_cache_key (self):
# note: the host is already lowercase
if self.urlparts:
if self.config["anchorcaching"]:
if self.consumer.config["anchorcaching"]:
# do not ignore anchor
return urlparse.urlunsplit(self.urlparts)
else:
@ -200,16 +200,6 @@ class UrlBase (object):
return urlparse.urlunsplit(self.urlparts[:4]+[''])
return None
def put_in_cache (self):
"""put url data into cache"""
if self.is_cached():
# another thread was faster and cached this url already
return
data = self.get_cache_data()
for key in self.get_cache_keys():
self.config.url_cache_set(key, data)
self.config.url_seen_set(key)
def build_url (self):
# make url absolute
if self.base_ref:
@ -236,14 +226,6 @@ class UrlBase (object):
# safe anchor for later checking
self.anchor = self.urlparts[4]
def log_me (self):
"""announce the url data as checked to the configured loggers"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "logging url")
self.config.increment_linknumber()
if self.config["verbose"] or not self.valid or \
(self.warning and self.config["warnings"]):
self.config.logger_new_url(self)
def check (self):
try:
self.local_check()
@ -260,28 +242,30 @@ class UrlBase (object):
def local_check (self):
linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
if self.recursion_level and self.config['wait']:
if self.recursion_level and self.consumer.config['wait']:
linkcheck.log.debug(linkcheck.LOG_CHECK,
"sleeping for %d seconds", self.config['wait'])
time.sleep(self.config['wait'])
"sleeping for %d seconds", self.consumer.config['wait'])
time.sleep(self.consumer.config['wait'])
t = time.time()
if not self.check_cache():
if self.consumer.cache.check_cache(self):
# was cached from previous queue member
self.consumer.logger_new_url(self)
return
# apply filter
linkcheck.log.debug(linkcheck.LOG_CHECK, "extern=%s", self.extern)
if self.extern[0] and (self.config["strict"] or self.extern[1]):
if self.extern[0] and (self.consumer.config["strict"] or self.extern[1]):
self.add_warning(
_("outside of domain filter, checked only syntax"))
self.log_me()
self.consumer.logger_new_url(self)
return
# check connection
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking connection")
try:
self.check_connection()
if self.cached:
if self.is_cached():
return
if self.config["anchors"]:
if self.consumer.config["anchors"]:
self.check_anchors()
except tuple(linkcheck.checker.ExcList):
etype, evalue, etb = sys.exc_info()
@ -296,7 +280,7 @@ class UrlBase (object):
self.set_result(str(evalue), valid=False)
# check content
warningregex = self.config["warningregex"]
warningregex = self.consumer.config["warningregex"]
if warningregex and self.valid:
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content")
try:
@ -323,40 +307,37 @@ class UrlBase (object):
valid=False)
# close
self.close_connection()
self.log_me()
self.consumer.logger_new_url(self)
linkcheck.log.debug(linkcheck.LOG_CHECK, "caching")
self.put_in_cache()
self.consumer.cache.url_data_cache_add(self)
def check_syntax (self):
"""Called before self.check(), this function inspects the
url syntax. Success enables further checking, failure
immediately logs this url. This syntax check must not
use any network resources.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
if not self.base_url:
self.set_result(_("URL is empty"), valid=False)
self.log_me()
self.consumer.logger_new_url(self)
return False
if ws_at_start_or_end(self.base_url):
# leading or trailing whitespace is common, so make a
# separate error message for this
self.set_result(_("URL has whitespace at beginning or end"),
valid=False)
self.log_me()
self.consumer.logger_new_url(self)
return False
try:
self.build_url()
self.extern = self._get_extern()
except linkcheck.LinkCheckerError, msg:
self.set_result(str(msg), valid=False)
self.log_me()
self.consumer.logger_new_url(self)
return False
return True
def check_cache (self):
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking cache")
for key in self.get_cache_keys():
if self.config.url_cache_has_key(key):
self.copy_from_cache(self.config.url_cache_get(key))
self.cached = True
self.log_me()
return False
return True
def close_connection (self):
"""close an opened url connection"""
# brute force closing
@ -379,8 +360,8 @@ class UrlBase (object):
self.is_parseable() and \
self.can_get_content() and \
not self.is_cached() and \
(self.config["recursionlevel"] < 0 or
self.recursion_level < self.config["recursionlevel"]) and \
(self.consumer.config["recursionlevel"] < 0 or
self.recursion_level < self.consumer.config["recursionlevel"]) and \
not self.extern[0] and self.content_allows_robots()
def content_allows_robots (self):
@ -418,19 +399,19 @@ class UrlBase (object):
self.add_warning(_("anchor #%s not found") % self.anchor)
def _get_extern (self):
if not (self.config["externlinks"] or self.config["internlinks"]):
if not (self.consumer.config["externlinks"] or self.consumer.config["internlinks"]):
return (0, 0)
# deny and allow external checking
linkcheck.log.debug(linkcheck.LOG_CHECK, "Url %r", self.url)
if self.config["denyallow"]:
for entry in self.config["externlinks"]:
if self.consumer.config["denyallow"]:
for entry in self.consumer.config["externlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
entry)
match = entry['pattern'].search(self.url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return (1, entry['strict'])
for entry in self.config["internlinks"]:
for entry in self.consumer.config["internlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
entry)
match = entry['pattern'].search(self.url)
@ -439,14 +420,14 @@ class UrlBase (object):
return (0, 0)
return (0, 0)
else:
for entry in self.config["internlinks"]:
for entry in self.consumer.config["internlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
entry)
match = entry['pattern'].search(self.url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return (0, 0)
for entry in self.config["externlinks"]:
for entry in self.consumer.config["externlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
entry)
match = entry['pattern'].search(self.url)
@ -482,7 +463,7 @@ class UrlBase (object):
def check_size (self):
"""if a maximum size was given, call this function to check it
against the content size of this url"""
maxbytes = self.config["warnsizebytes"]
maxbytes = self.consumer.config["warnsizebytes"]
if maxbytes is not None and self.dlsize >= maxbytes:
self.add_warning(_("Content size %s is larger than %s") % \
(linkcheck.strformat.strsize(self.dlsize),
@ -497,7 +478,7 @@ class UrlBase (object):
self.parse_html()
def get_user_password (self):
for auth in self.config["authentication"]:
for auth in self.consumer.config["authentication"]:
if auth['pattern'].match(self.url):
return auth['user'], auth['password']
return None, None
@ -535,10 +516,10 @@ class UrlBase (object):
base = base_ref
linkcheck.log.debug(linkcheck.LOG_CHECK, "Put url %r in queue",
url)
self.config.append_url(linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.config,
parent_url=self.url, base_ref=base,
line=line, column=column, name=name))
self.consumer.append_url(linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.consumer,
parent_url=self.url, base_ref=base,
line=line, column=column, name=name))
def parse_opera (self):
"""parse an opera bookmark file"""
@ -553,8 +534,9 @@ class UrlBase (object):
elif line.startswith("URL="):
url = line[4:]
if url:
self.config.append_url(linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.config, self.url, None, lineno, name))
self.consumer.append_url(linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.consumer,
self.url, None, lineno, name))
name = ""
def parse_text (self):
@ -567,9 +549,9 @@ class UrlBase (object):
lineno += 1
line = line.strip()
if not line or line.startswith('#'): continue
self.config.append_url(
self.consumer.append_url(
linkcheck.checker.get_url_from(line, self.recursion_level+1,
self.config, parent_url=self.url, line=lineno))
self.consumer, parent_url=self.url, line=lineno))
def parse_css (self):
"""parse a CSS file for url() patterns"""
@ -578,9 +560,9 @@ class UrlBase (object):
lineno += 1
for mo in linkcheck.linkparse.css_url_re.finditer(line):
column = mo.start("url")
self.config.append_url(
self.consumer.append_url(
linkcheck.checker.get_url_from(mo.group("url"),
self.recursion_level+1, self.config,
self.recursion_level+1, self.consumer,
parent_url=self.url, line=lineno, column=column))
def __str__ (self):
@ -590,7 +572,6 @@ class UrlBase (object):
"base_url=%s" % self.base_url,
"parent_url=%s" % self.parent_url,
"base_ref=%s" % self.base_ref,
"cached=%s" % self.cached,
"recursion_level=%s" % self.recursion_level,
"url_connection=%s" % self.url_connection,
"line=%s" % self.line,

View file

@ -27,9 +27,9 @@ from linkcheck.i18n import _
class UrlConnect (urlbase.UrlBase):
"""Url link for which we have to connect to a specific host"""
def __init__ (self, base_url, recursion_level, config, parent_url=None,
base_ref=None, line=0, column=0, name=""):
super(UrlConnect, self).__init__(base_url, recursion_level, config,
def __init__ (self, base_url, recursion_level, consumer,
parent_url=None, base_ref=None, line=0, column=0, name=""):
super(UrlConnect, self).__init__(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
self.host = None