From f2e7ca6040aeaa6fc182dce83eb1fbbd9a592d8f Mon Sep 17 00:00:00 2001 From: calvin Date: Thu, 19 Aug 2004 21:35:47 +0000 Subject: [PATCH] split off cache and url consumer routines into separate classes git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1432 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/checker/__init__.py | 76 ++++++------- linkcheck/checker/consumer.py | 183 ++++++++++++++++++++++++++++++++ linkcheck/checker/fileurl.py | 11 +- linkcheck/checker/ftpurl.py | 6 +- linkcheck/checker/httpsurl.py | 2 +- linkcheck/checker/httpurl.py | 59 ++++------ linkcheck/checker/ignoredurl.py | 2 +- linkcheck/checker/mailtourl.py | 1 - linkcheck/checker/nntpurl.py | 2 +- linkcheck/checker/telneturl.py | 4 +- linkcheck/checker/urlbase.py | 119 +++++++++------------ linkcheck/checker/urlconnect.py | 6 +- 12 files changed, 303 insertions(+), 168 deletions(-) create mode 100644 linkcheck/checker/consumer.py diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 6aa53af8..7f5da75c 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -137,55 +137,42 @@ acap # application configuration access protocol ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE) -def print_status (config, curtime, start_time): - """print check status looking at url queues""" - tocheck = len(config.urls) - links = config['linknumber'] - active = config.threader.active_threads() - duration = linkcheck.strformat.strduration(curtime - start_time) - print >> sys.stderr, \ - _("%5d urls queued, %4d links checked, %2d active threads, runtime %s")\ - % (tocheck, links, active, duration) - - # main check function -def check_urls (config): +def check_urls (consumer): """Gets a complete configuration object as parameter where all runtime-dependent options are stored. If you call this function more than once, you can specify different configurations. - - In the config object there are functions to get new URLs to check, - and to perform the actual checking. """ - config.logger_start_output() try: - start_time = time.time() - status_time = start_time - while True: - if config.has_more_urls(): - config.check_url(config.get_url()) - elif config.finished(): - break - else: - # active connections are downloading/parsing, so - # wait a little - time.sleep(0.1) - if config['status']: - curtime = time.time() - if (curtime - status_time) > 5: - print_status(config, curtime, start_time) - status_time = curtime - config.logger_end_output() + _check_urls(consumer) except KeyboardInterrupt: - config.finish() - config.logger_end_output() - active = config.threader.active_threads() + consumer.finish() linkcheck.log.warn(linkcheck.LOG_CHECK, _("keyboard interrupt; waiting for %d active threads to finish"), - active) + consumer.active_threads()) raise +def _check_urls (consumer): + consumer.logger_start_output() + start_time = time.time() + status_time = start_time + while not consumer.finished(): + url = consumer.get_url() + if url is not None: + consumer.check_url(url) + else: + # active connections are downloading/parsing, so + # wait a little + time.sleep(0.1) + if consumer.config['status']: + curtime = time.time() + if (curtime - status_time) > 5: + consumer.print_status(curtime, start_time) + status_time = curtime + consumer.logger_end_output() + + # file extensions we can parse recursively extensions = { "html": re.compile(r'(?i)\.s?html?$'), @@ -237,9 +224,9 @@ def absolute_url (base_url, base_ref, parent_url): return "" -def get_url_from (base_url, recursion_level, config, parent_url=None, - base_ref=None, line=0, column=0, name=None, - cmdline=None): +def get_url_from (base_url, recursion_level, consumer, + parent_url=None, base_ref=None, line=0, column=0, + name=None, cmdline=None): """get url data from given base data""" if cmdline and linkcheck.url.url_needs_quoting(base_url): base_url = linkcheck.url.url_quote(base_url) @@ -269,9 +256,10 @@ def get_url_from (base_url, recursion_level, config, parent_url=None, # assume local file else: klass = linkcheck.checker.fileurl.FileUrl - if cmdline and url and config['strict'] and \ - not (config['internlinks'] or config['externlinks']): + if cmdline and url and consumer.config['strict'] and \ + not (consumer.config['internlinks'] or consumer.config['externlinks']): # set automatic intern/extern stuff if no filter was given - set_intern_url(url, klass, config) - return klass(base_url, recursion_level, config, parent_url, base_ref, + set_intern_url(url, klass, consumer.config) + return klass(base_url, recursion_level, consumer, + parent_url=parent_url, base_ref=base_ref, line=line, column=column, name=name) diff --git a/linkcheck/checker/consumer.py b/linkcheck/checker/consumer.py new file mode 100644 index 00000000..df2f353e --- /dev/null +++ b/linkcheck/checker/consumer.py @@ -0,0 +1,183 @@ +# -*- coding: iso-8859-1 -*- +"""url consumer class""" +# Copyright (C) 2000-2004 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +import sys +try: + import threading +except ImportError: + import dummy_threading as threading + +import linkcheck.threader + +from linkcheck.i18n import _ + +class Consumer (object): + """consume urls from the url queue in a threaded manner""" + + def __init__ (self, config, cache): + """initialize consumer data and threads""" + self.config = config + self.cache = cache + self.urls = [] + self.threader = linkcheck.threader.Threader() + self._set_threads(config['threads']) + self.logger = config['logger'] + self.fileoutput = config['fileoutput'] + self.linknumber = 0 + # one lock for the data + self.lock = threading.Lock() + + def filter_url_queue (self): + """remove already cached urls from queue""" + pass # deadlock! + #self.lock.acquire() + #try: + # urls = [] + # for url_data in self.urls: + # if self.cache.check_cache(url_data): + # self.logger_new_url(url_data) + # else: + # urls.append(url_data) + # self.urls = urls + # print >> sys.stderr, \ + # _("removed %d cached urls from incoming queue") % len(removed) + #finally: + # self.lock.release() + + def _set_threads (self, num): + """set number of checker threads to start""" + linkcheck.log.debug(linkcheck.LOG_CHECK, + "set threading with %d threads", num) + self.threader.threads_max = num + if num > 0: + sys.setcheckinterval(50) + else: + sys.setcheckinterval(100) + + def check_url (self, url_data): + """start new thread checking the given url""" + self.threader.start_thread(url_data.check, ()) + + def append_url (self, url_data): + """add new url to list of urls to check""" + # check syntax + if not url_data.check_syntax(): + # wrong syntax, do not check any further + return + # check the cache + if self.cache.check_cache(url_data): + # already cached + self.logger_new_url(url_data) + return + self.lock.acquire() + try: + self.urls.append(url_data) + finally: + self.lock.release() + + def finished (self): + """return True if checking is finished""" + self.lock.acquire() + try: + return self.threader.finished() and len(self.urls) <= 0 + finally: + self.lock.release() + + def get_url (self): + """get first url in queue and return it""" + self.lock.acquire() + try: + if not self.urls: + return None + u = self.urls[0] + del self.urls[0] + return u + finally: + self.lock.release() + + def finish (self): + """finish checking and send of-of-output message to logger""" + self.lock.acquire() + try: + self.threader.finish() + finally: + self.lock.release() + self.logger_end_output() + + def print_status (self, curtime, start_time): + """print check status looking at url queues""" + self.lock.acquire() + try: + active = self.threader.active_threads() + links = self.linknumber + tocheck = len(self.urls) + duration = linkcheck.strformat.strduration(curtime - start_time) + print >> sys.stderr, _("%5d urls queued, %4d links checked, "\ + "%2d active threads, runtime %s")\ + % (tocheck, links, active, duration) + finally: + self.lock.release() + + def logger_start_output (self): + """start output of all configured loggers""" + self.lock.acquire() + try: + if not self.config['quiet']: + self.logger.start_output() + for logger in self.fileoutput: + logger.start_output() + finally: + self.lock.release() + + def logger_new_url (self, url_data): + """send new url to all configured loggers""" + self.lock.acquire() + try: + self.linknumber += 1 + do_filter = (self.linknumber % 1000) == 0 + if not self.config['quiet'] and \ + (self.config["verbose"] or not url_data.valid or + (url_data.warning and self.config["warnings"])): + self.logger.new_url(url_data) + for log in self.fileoutput: + log.new_url(url_data) + finally: + self.lock.release() + # XXX deadlock! + #if do_filter: + # self.filter_queue(self) + + def logger_end_output (self): + """end output of all configured loggers""" + self.lock.acquire() + try: + if not self.config['quiet']: + self.logger.end_output(linknumber=self.linknumber) + for logger in self.fileoutput: + logger.end_output(linknumber=self.linknumber) + finally: + self.lock.release() + + def active_threads (self): + """return number of active threads""" + self.lock.acquire() + try: + return self.threader.active_threads() + finally: + self.lock.release() + diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index bd19ce6d..7ead2496 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -53,15 +53,12 @@ def get_index_html (dirname): class FileUrl (urlbase.UrlBase): "Url link with file scheme" - def __init__ (self, - base_url, - config, - recursion_level, + def __init__ (self, base_url, recursion_level, consumer, parent_url = None, base_ref = None, line=0, column=0, name=""): - super(FileUrl, self).__init__(base_url, config, recursion_level, - parent_url=parent_url, base_ref=base_ref, - line=line, column=column, name=name) + super(FileUrl, self).__init__(base_url, recursion_level, consumer, + parent_url=parent_url, base_ref=base_ref, + line=line, column=column, name=name) if not (parent_url or base_ref or self.base_url.startswith("file:")): self.base_url = os.path.expanduser(self.base_url) if not self.base_url.startswith("/"): diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py index 1f0fe37d..97f80b2a 100644 --- a/linkcheck/checker/ftpurl.py +++ b/linkcheck/checker/ftpurl.py @@ -32,11 +32,11 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport): def check_connection (self): # proxy support (we support only http) - self.set_proxy(self.config["proxy"].get(self.scheme)) + self.set_proxy(self.consumer.config["proxy"].get(self.scheme)) if self.proxy: http = httpurl.HttpUrl(self.base_url, self.recursion_level, - self.config, + self.consumer.config, parent_url=self.parent_url, base_ref=self.base_ref, line=self.line, @@ -80,7 +80,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport): # ready to connect try: self.url_connection = ftplib.FTP() - if self.config.get("debug"): + if self.consumer.config.get("debug"): self.url_connection.set_debuglevel(1) self.url_connection.connect(self.urlparts[1]) self.url_connection.login(_user, _password) diff --git a/linkcheck/checker/httpsurl.py b/linkcheck/checker/httpsurl.py index b02e798e..5510880f 100644 --- a/linkcheck/checker/httpsurl.py +++ b/linkcheck/checker/httpsurl.py @@ -28,4 +28,4 @@ class HttpsUrl (httpurl.HttpUrl): super(HttpsUrl, self).local_check() else: self.add_warning(_("%s url ignored")%self.scheme.capitalize()) - self.log_me() + self.consumer.logger_new_url(self) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 57daee50..54fae783 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -46,11 +46,11 @@ _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): "Url link with http scheme" - def __init__ (self, base_url, recursion_level, config, parent_url=None, - base_ref=None, line=0, column=0, name=""): - super(HttpUrl, self).__init__(base_url, recursion_level, config, - parent_url=parent_url, base_ref=base_ref, line=line, - column=column, name=name) + def __init__ (self, base_url, recursion_level, consumer, + parent_url=None, base_ref=None, line=0, column=0, name=""): + super(HttpUrl, self).__init__(base_url, recursion_level, consumer, + parent_url=parent_url, base_ref=base_ref, line=line, + column=column, name=name) self.aliases = [] self.max_redirects = 5 self.has301status = False @@ -109,13 +109,13 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): | extension-code """ # set the proxy, so a 407 status after this is an error - self.set_proxy(self.config["proxy"].get(self.scheme)) + self.set_proxy(self.consumer.config["proxy"].get(self.scheme)) if self.proxy: self.add_info(_("Using Proxy %r") % self.proxy) self.headers = None self.auth = None self.cookies = [] - if not self.robots_txt_allows_url(): + if not self.consumer.cache.robots_txt_allows_url(self): self.add_warning( _("Access denied by robots.txt, checked only syntax")) return @@ -235,6 +235,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): self.set_result( _("recursive redirection encountered:\n %s") % \ "\n => ".join(redirect_cache), valid=False) + self.consumer.logger_new_url(self) return -1, response redirect_cache.append(redirected) # remember this alias @@ -252,11 +253,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): self.has301status = True self.aliases.append(redirected) # check cache again on possibly changed URL - key = self.get_cache_key() - if self.config.url_cache_has_key(key): - self.copy_from_cache(self.config.url_cache_get(key)) - self.cached = True - self.log_me() + if self.consumer.cache.check_cache(self): + self.consumer.logger_new_url(self) return -1, response # check if we still have a http url, it could be another # scheme, eg https or news @@ -266,15 +264,14 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): "the original url was %r.") % self.url) # make new Url object newobj = linkcheck.checker.get_url_from( - redirected, self.recursion_level, self.config, + redirected, self.recursion_level, self.consumer, parent_url=self.parent_url, base_ref=self.base_ref, line=self.line, column=self.column, name=self.name) newobj.warning = self.warning newobj.info = self.info # append new object to queue - self.config.append_url(newobj) + self.consumer.append_url(newobj) # pretend to be finished and logged - self.cached = True return -1, response # new response data response = self._get_http_response() @@ -302,10 +299,10 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): # no content self.add_warning(response.reason) # store cookies for valid links - if self.config['cookies']: + if self.consumer.config['cookies']: for c in self.cookies: self.add_info("Cookie: %s" % c) - out = self.config.storeCookies(self.headers, self.urlparts[1]) + out = self.consumer.config.storeCookies(self.headers, self.urlparts[1]) for h in out: self.add_info(h) if response.status >= 200: @@ -335,14 +332,16 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): if self.url_connection: self.close_connection() self.url_connection = self.get_http_object(host, scheme) - url = urlparse.urlunsplit(self.urlparts) if self.no_anchor: - qurlparts[4] = '' + anchor = '' + else: + anchor = self.urlparts[4] if self.proxy: - path = urlparse.urlunsplit(self.urlparts) + path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1], + self.urlparts[2], self.urlparts[3], anchor)) else: path = urlparse.urlunsplit(('', '', self.urlparts[2], - self.urlparts[3], self.urlparts[4])) + self.urlparts[3], anchor)) self.url_connection.putrequest(self.method, path, skip_host=True) self.url_connection.putheader("Host", host) # userinfo is from http://user@pass:host/ @@ -360,8 +359,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): linkcheck.configuration.UserAgent) self.url_connection.putheader("Accept-Encoding", "gzip;q=1.0, deflate;q=0.9, identity;q=0.5") - if self.config['cookies']: - self.cookies = self.config.getCookies(self.urlparts[1], + if self.consumer.config['cookies']: + self.cookies = self.consumer.config.getCookies(self.urlparts[1], self.urlparts[2]) for c in self.cookies: self.url_connection.putheader("Cookie", c) @@ -375,7 +374,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): h = linkcheck.httplib2.HTTPSConnection(host) else: raise linkcheck.LinkCheckerError("invalid url scheme %s" % scheme) - if self.config.get("debug"): + if self.consumer.config.get("debug"): h.set_debuglevel(1) h.connect() return h @@ -447,15 +446,3 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport): def get_robots_txt_url (self): return "%s://%s/robots.txt" % tuple(self.urlparts[0:2]) - def robots_txt_allows_url (self): - roboturl = self.get_robots_txt_url() - linkcheck.log.debug(linkcheck.LOG_CHECK, "robots.txt url %r", - roboturl) - linkcheck.log.debug(linkcheck.LOG_CHECK, "url %r", self.url) - if not self.config.robots_txt_cache_has_key(roboturl): - rp = linkcheck.robotparser2.RobotFileParser() - rp.set_url(roboturl) - rp.read() - self.config.robots_txt_cache_set(roboturl, rp) - rp = self.config.robots_txt_cache_get(roboturl) - return rp.can_fetch(linkcheck.configuration.UserAgent, self.url) diff --git a/linkcheck/checker/ignoredurl.py b/linkcheck/checker/ignoredurl.py index 6c7d58f3..2bf8323c 100644 --- a/linkcheck/checker/ignoredurl.py +++ b/linkcheck/checker/ignoredurl.py @@ -25,7 +25,7 @@ class IgnoredUrl (urlbase.UrlBase): def local_check (self): self.add_warning(_("%s url ignored")%self.scheme.capitalize()) - self.log_me() + self.consumer.logger_new_url(self) def can_get_content (self): return False diff --git a/linkcheck/checker/mailtourl.py b/linkcheck/checker/mailtourl.py index 7e289e9b..770445b5 100644 --- a/linkcheck/checker/mailtourl.py +++ b/linkcheck/checker/mailtourl.py @@ -16,7 +16,6 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -import re import sys import cgi import urllib diff --git a/linkcheck/checker/nntpurl.py b/linkcheck/checker/nntpurl.py index 12c03b5d..9fc77c13 100644 --- a/linkcheck/checker/nntpurl.py +++ b/linkcheck/checker/nntpurl.py @@ -46,7 +46,7 @@ class NntpUrl (urlbase.UrlBase): linkcheck.log.debug(linkcheck.LOG_CHECK, self.urlparts) def check_connection (self): - nntpserver = self.urlparts[1] or self.config["nntpserver"] + nntpserver = self.urlparts[1] or self.consumer.config["nntpserver"] if not nntpserver: self.add_warning(_("No NNTP server specified, skipping this URL")) return diff --git a/linkcheck/checker/telneturl.py b/linkcheck/checker/telneturl.py index 2781ee43..fb8ed8dd 100644 --- a/linkcheck/checker/telneturl.py +++ b/linkcheck/checker/telneturl.py @@ -49,14 +49,14 @@ class TelnetUrl (urlconnect.UrlConnect): def local_check (self): if not self.host: self.set_result(_("Host is empty"), valid=False) - self.log_me() + self.consumer.logger_new_url(self) return super(TelnetUrl, self).local_check() def check_connection (self): super(TelnetUrl, self).check_connection() self.url_connection = telnetlib.Telnet() - if self.config.get("debug"): + if self.consumer.config.get("debug"): self.url_connection.set_debuglevel(1) self.url_connection.open(self.host, self.port) if self.user: diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index ffed9da9..2e29c058 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -79,7 +79,7 @@ def print_app_info (): class UrlBase (object): """An URL with additional information like validity etc.""" - def __init__ (self, base_url, recursion_level, config, + def __init__ (self, base_url, recursion_level, consumer, parent_url = None, base_ref = None, line = 0, column = 0, name = ""): """Initialize check data, and store given variables. @@ -100,8 +100,9 @@ class UrlBase (object): self.parent_url = parent_url self.anchor = None self.recursion_level = recursion_level - self.config = config + self.consumer = consumer self.result = "" + self.cached = False self.valid = True self.warning = linkcheck.containers.SetList() self.info = linkcheck.containers.SetList() @@ -111,7 +112,6 @@ class UrlBase (object): self.dltime = -1 self.dlsize = -1 self.checktime = 0 - self.cached = False self.url_connection = None self.extern = (1, 0) self.data = None @@ -169,6 +169,7 @@ class UrlBase (object): self.info.extend(cache_data["info"]) self.valid = cache_data["valid"] self.dltime = cache_data["dltime"] + self.cached = True def get_cache_data (self): """return all data values that should be put in the cache""" @@ -186,13 +187,12 @@ class UrlBase (object): return [key] def is_cached (self): - key = self.get_cache_key() - return self.cached or self.config.url_seen_has_key(key) + return self.consumer.cache.url_is_cached(self.get_cache_key()) def get_cache_key (self): # note: the host is already lowercase if self.urlparts: - if self.config["anchorcaching"]: + if self.consumer.config["anchorcaching"]: # do not ignore anchor return urlparse.urlunsplit(self.urlparts) else: @@ -200,16 +200,6 @@ class UrlBase (object): return urlparse.urlunsplit(self.urlparts[:4]+['']) return None - def put_in_cache (self): - """put url data into cache""" - if self.is_cached(): - # another thread was faster and cached this url already - return - data = self.get_cache_data() - for key in self.get_cache_keys(): - self.config.url_cache_set(key, data) - self.config.url_seen_set(key) - def build_url (self): # make url absolute if self.base_ref: @@ -236,14 +226,6 @@ class UrlBase (object): # safe anchor for later checking self.anchor = self.urlparts[4] - def log_me (self): - """announce the url data as checked to the configured loggers""" - linkcheck.log.debug(linkcheck.LOG_CHECK, "logging url") - self.config.increment_linknumber() - if self.config["verbose"] or not self.valid or \ - (self.warning and self.config["warnings"]): - self.config.logger_new_url(self) - def check (self): try: self.local_check() @@ -260,28 +242,30 @@ class UrlBase (object): def local_check (self): linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self) - if self.recursion_level and self.config['wait']: + if self.recursion_level and self.consumer.config['wait']: linkcheck.log.debug(linkcheck.LOG_CHECK, - "sleeping for %d seconds", self.config['wait']) - time.sleep(self.config['wait']) + "sleeping for %d seconds", self.consumer.config['wait']) + time.sleep(self.consumer.config['wait']) t = time.time() - if not self.check_cache(): + if self.consumer.cache.check_cache(self): + # was cached from previous queue member + self.consumer.logger_new_url(self) return # apply filter linkcheck.log.debug(linkcheck.LOG_CHECK, "extern=%s", self.extern) - if self.extern[0] and (self.config["strict"] or self.extern[1]): + if self.extern[0] and (self.consumer.config["strict"] or self.extern[1]): self.add_warning( _("outside of domain filter, checked only syntax")) - self.log_me() + self.consumer.logger_new_url(self) return # check connection linkcheck.log.debug(linkcheck.LOG_CHECK, "checking connection") try: self.check_connection() - if self.cached: + if self.is_cached(): return - if self.config["anchors"]: + if self.consumer.config["anchors"]: self.check_anchors() except tuple(linkcheck.checker.ExcList): etype, evalue, etb = sys.exc_info() @@ -296,7 +280,7 @@ class UrlBase (object): self.set_result(str(evalue), valid=False) # check content - warningregex = self.config["warningregex"] + warningregex = self.consumer.config["warningregex"] if warningregex and self.valid: linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content") try: @@ -323,40 +307,37 @@ class UrlBase (object): valid=False) # close self.close_connection() - self.log_me() + self.consumer.logger_new_url(self) linkcheck.log.debug(linkcheck.LOG_CHECK, "caching") - self.put_in_cache() + self.consumer.cache.url_data_cache_add(self) def check_syntax (self): + """Called before self.check(), this function inspects the + url syntax. Success enables further checking, failure + immediately logs this url. This syntax check must not + use any network resources. + """ linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax") if not self.base_url: self.set_result(_("URL is empty"), valid=False) - self.log_me() + self.consumer.logger_new_url(self) return False if ws_at_start_or_end(self.base_url): + # leading or trailing whitespace is common, so make a + # separate error message for this self.set_result(_("URL has whitespace at beginning or end"), valid=False) - self.log_me() + self.consumer.logger_new_url(self) return False try: self.build_url() self.extern = self._get_extern() except linkcheck.LinkCheckerError, msg: self.set_result(str(msg), valid=False) - self.log_me() + self.consumer.logger_new_url(self) return False return True - def check_cache (self): - linkcheck.log.debug(linkcheck.LOG_CHECK, "checking cache") - for key in self.get_cache_keys(): - if self.config.url_cache_has_key(key): - self.copy_from_cache(self.config.url_cache_get(key)) - self.cached = True - self.log_me() - return False - return True - def close_connection (self): """close an opened url connection""" # brute force closing @@ -379,8 +360,8 @@ class UrlBase (object): self.is_parseable() and \ self.can_get_content() and \ not self.is_cached() and \ - (self.config["recursionlevel"] < 0 or - self.recursion_level < self.config["recursionlevel"]) and \ + (self.consumer.config["recursionlevel"] < 0 or + self.recursion_level < self.consumer.config["recursionlevel"]) and \ not self.extern[0] and self.content_allows_robots() def content_allows_robots (self): @@ -418,19 +399,19 @@ class UrlBase (object): self.add_warning(_("anchor #%s not found") % self.anchor) def _get_extern (self): - if not (self.config["externlinks"] or self.config["internlinks"]): + if not (self.consumer.config["externlinks"] or self.consumer.config["internlinks"]): return (0, 0) # deny and allow external checking linkcheck.log.debug(linkcheck.LOG_CHECK, "Url %r", self.url) - if self.config["denyallow"]: - for entry in self.config["externlinks"]: + if self.consumer.config["denyallow"]: + for entry in self.consumer.config["externlinks"]: linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r", entry) match = entry['pattern'].search(self.url) if (entry['negate'] and not match) or \ (match and not entry['negate']): return (1, entry['strict']) - for entry in self.config["internlinks"]: + for entry in self.consumer.config["internlinks"]: linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r", entry) match = entry['pattern'].search(self.url) @@ -439,14 +420,14 @@ class UrlBase (object): return (0, 0) return (0, 0) else: - for entry in self.config["internlinks"]: + for entry in self.consumer.config["internlinks"]: linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r", entry) match = entry['pattern'].search(self.url) if (entry['negate'] and not match) or \ (match and not entry['negate']): return (0, 0) - for entry in self.config["externlinks"]: + for entry in self.consumer.config["externlinks"]: linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r", entry) match = entry['pattern'].search(self.url) @@ -482,7 +463,7 @@ class UrlBase (object): def check_size (self): """if a maximum size was given, call this function to check it against the content size of this url""" - maxbytes = self.config["warnsizebytes"] + maxbytes = self.consumer.config["warnsizebytes"] if maxbytes is not None and self.dlsize >= maxbytes: self.add_warning(_("Content size %s is larger than %s") % \ (linkcheck.strformat.strsize(self.dlsize), @@ -497,7 +478,7 @@ class UrlBase (object): self.parse_html() def get_user_password (self): - for auth in self.config["authentication"]: + for auth in self.consumer.config["authentication"]: if auth['pattern'].match(self.url): return auth['user'], auth['password'] return None, None @@ -535,10 +516,10 @@ class UrlBase (object): base = base_ref linkcheck.log.debug(linkcheck.LOG_CHECK, "Put url %r in queue", url) - self.config.append_url(linkcheck.checker.get_url_from(url, - self.recursion_level+1, self.config, - parent_url=self.url, base_ref=base, - line=line, column=column, name=name)) + self.consumer.append_url(linkcheck.checker.get_url_from(url, + self.recursion_level+1, self.consumer, + parent_url=self.url, base_ref=base, + line=line, column=column, name=name)) def parse_opera (self): """parse an opera bookmark file""" @@ -553,8 +534,9 @@ class UrlBase (object): elif line.startswith("URL="): url = line[4:] if url: - self.config.append_url(linkcheck.checker.get_url_from(url, - self.recursion_level+1, self.config, self.url, None, lineno, name)) + self.consumer.append_url(linkcheck.checker.get_url_from(url, + self.recursion_level+1, self.consumer, + self.url, None, lineno, name)) name = "" def parse_text (self): @@ -567,9 +549,9 @@ class UrlBase (object): lineno += 1 line = line.strip() if not line or line.startswith('#'): continue - self.config.append_url( + self.consumer.append_url( linkcheck.checker.get_url_from(line, self.recursion_level+1, - self.config, parent_url=self.url, line=lineno)) + self.consumer, parent_url=self.url, line=lineno)) def parse_css (self): """parse a CSS file for url() patterns""" @@ -578,9 +560,9 @@ class UrlBase (object): lineno += 1 for mo in linkcheck.linkparse.css_url_re.finditer(line): column = mo.start("url") - self.config.append_url( + self.consumer.append_url( linkcheck.checker.get_url_from(mo.group("url"), - self.recursion_level+1, self.config, + self.recursion_level+1, self.consumer, parent_url=self.url, line=lineno, column=column)) def __str__ (self): @@ -590,7 +572,6 @@ class UrlBase (object): "base_url=%s" % self.base_url, "parent_url=%s" % self.parent_url, "base_ref=%s" % self.base_ref, - "cached=%s" % self.cached, "recursion_level=%s" % self.recursion_level, "url_connection=%s" % self.url_connection, "line=%s" % self.line, diff --git a/linkcheck/checker/urlconnect.py b/linkcheck/checker/urlconnect.py index b90d2431..35070340 100644 --- a/linkcheck/checker/urlconnect.py +++ b/linkcheck/checker/urlconnect.py @@ -27,9 +27,9 @@ from linkcheck.i18n import _ class UrlConnect (urlbase.UrlBase): """Url link for which we have to connect to a specific host""" - def __init__ (self, base_url, recursion_level, config, parent_url=None, - base_ref=None, line=0, column=0, name=""): - super(UrlConnect, self).__init__(base_url, recursion_level, config, + def __init__ (self, base_url, recursion_level, consumer, + parent_url=None, base_ref=None, line=0, column=0, name=""): + super(UrlConnect, self).__init__(base_url, recursion_level, consumer, parent_url=parent_url, base_ref=base_ref, line=line, column=column, name=name) self.host = None