mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-29 04:00:34 +00:00
split off cache and url consumer routines into separate classes
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1432 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
c49ac001d1
commit
f2e7ca6040
12 changed files with 303 additions and 168 deletions
|
|
@ -137,55 +137,42 @@ acap # application configuration access protocol
|
|||
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
|
||||
|
||||
|
||||
def print_status (config, curtime, start_time):
|
||||
"""print check status looking at url queues"""
|
||||
tocheck = len(config.urls)
|
||||
links = config['linknumber']
|
||||
active = config.threader.active_threads()
|
||||
duration = linkcheck.strformat.strduration(curtime - start_time)
|
||||
print >> sys.stderr, \
|
||||
_("%5d urls queued, %4d links checked, %2d active threads, runtime %s")\
|
||||
% (tocheck, links, active, duration)
|
||||
|
||||
|
||||
# main check function
|
||||
def check_urls (config):
|
||||
def check_urls (consumer):
|
||||
"""Gets a url consumer object as parameter, whose config attribute
|
||||
stores the runtime-dependent options. If you call this function
|
||||
more than once, you can specify different consumers.
|
||||
|
||||
The consumer object has functions to get new URLs to check,
|
||||
and to perform the actual checking.
|
||||
"""
|
||||
config.logger_start_output()
|
||||
try:
|
||||
start_time = time.time()
|
||||
status_time = start_time
|
||||
while True:
|
||||
if config.has_more_urls():
|
||||
config.check_url(config.get_url())
|
||||
elif config.finished():
|
||||
break
|
||||
else:
|
||||
# active connections are downloading/parsing, so
|
||||
# wait a little
|
||||
time.sleep(0.1)
|
||||
if config['status']:
|
||||
curtime = time.time()
|
||||
if (curtime - status_time) > 5:
|
||||
print_status(config, curtime, start_time)
|
||||
status_time = curtime
|
||||
config.logger_end_output()
|
||||
_check_urls(consumer)
|
||||
except KeyboardInterrupt:
|
||||
config.finish()
|
||||
config.logger_end_output()
|
||||
active = config.threader.active_threads()
|
||||
consumer.finish()
|
||||
linkcheck.log.warn(linkcheck.LOG_CHECK,
|
||||
_("keyboard interrupt; waiting for %d active threads to finish"),
|
||||
active)
|
||||
consumer.active_threads())
|
||||
raise
|
||||
|
||||
|
||||
def _check_urls (consumer):
|
||||
consumer.logger_start_output()
|
||||
start_time = time.time()
|
||||
status_time = start_time
|
||||
while not consumer.finished():
|
||||
url = consumer.get_url()
|
||||
if url is not None:
|
||||
consumer.check_url(url)
|
||||
else:
|
||||
# active connections are downloading/parsing, so
|
||||
# wait a little
|
||||
time.sleep(0.1)
|
||||
if consumer.config['status']:
|
||||
curtime = time.time()
|
||||
if (curtime - status_time) > 5:
|
||||
consumer.print_status(curtime, start_time)
|
||||
status_time = curtime
|
||||
consumer.logger_end_output()
|
||||
|
||||
|
||||
# file extensions we can parse recursively
|
||||
extensions = {
|
||||
"html": re.compile(r'(?i)\.s?html?$'),
|
||||
|
|
@ -237,9 +224,9 @@ def absolute_url (base_url, base_ref, parent_url):
|
|||
return ""
|
||||
|
||||
|
||||
def get_url_from (base_url, recursion_level, config, parent_url=None,
|
||||
base_ref=None, line=0, column=0, name=None,
|
||||
cmdline=None):
|
||||
def get_url_from (base_url, recursion_level, consumer,
|
||||
parent_url=None, base_ref=None, line=0, column=0,
|
||||
name=None, cmdline=None):
|
||||
"""get url data from given base data"""
|
||||
if cmdline and linkcheck.url.url_needs_quoting(base_url):
|
||||
base_url = linkcheck.url.url_quote(base_url)
|
||||
|
|
@ -269,9 +256,10 @@ def get_url_from (base_url, recursion_level, config, parent_url=None,
|
|||
# assume local file
|
||||
else:
|
||||
klass = linkcheck.checker.fileurl.FileUrl
|
||||
if cmdline and url and config['strict'] and \
|
||||
not (config['internlinks'] or config['externlinks']):
|
||||
if cmdline and url and consumer.config['strict'] and \
|
||||
not (consumer.config['internlinks'] or consumer.config['externlinks']):
|
||||
# set automatic intern/extern stuff if no filter was given
|
||||
set_intern_url(url, klass, config)
|
||||
return klass(base_url, recursion_level, config, parent_url, base_ref,
|
||||
set_intern_url(url, klass, consumer.config)
|
||||
return klass(base_url, recursion_level, consumer,
|
||||
parent_url=parent_url, base_ref=base_ref,
|
||||
line=line, column=column, name=name)
|
||||
|
|
|
|||
183
linkcheck/checker/consumer.py
Normal file
183
linkcheck/checker/consumer.py
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""url consumer class"""
|
||||
# Copyright (C) 2000-2004 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
import sys
|
||||
try:
|
||||
import threading
|
||||
except ImportError:
|
||||
import dummy_threading as threading
|
||||
|
||||
import linkcheck.threader
|
||||
|
||||
from linkcheck.i18n import _
|
||||
|
||||
class Consumer (object):
|
||||
"""consume urls from the url queue in a threaded manner"""
|
||||
|
||||
def __init__ (self, config, cache):
|
||||
"""initialize consumer data and threads"""
|
||||
self.config = config
|
||||
self.cache = cache
|
||||
self.urls = []
|
||||
self.threader = linkcheck.threader.Threader()
|
||||
self._set_threads(config['threads'])
|
||||
self.logger = config['logger']
|
||||
self.fileoutput = config['fileoutput']
|
||||
self.linknumber = 0
|
||||
# one lock for the data
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def filter_url_queue (self):
|
||||
"""remove already cached urls from queue"""
|
||||
pass # deadlock!
|
||||
#self.lock.acquire()
|
||||
#try:
|
||||
# urls = []
|
||||
# for url_data in self.urls:
|
||||
# if self.cache.check_cache(url_data):
|
||||
# self.logger_new_url(url_data)
|
||||
# else:
|
||||
# urls.append(url_data)
|
||||
# self.urls = urls
|
||||
# print >> sys.stderr, \
|
||||
# _("removed %d cached urls from incoming queue") % len(removed)
|
||||
#finally:
|
||||
# self.lock.release()
|
||||
|
||||
def _set_threads (self, num):
|
||||
"""set number of checker threads to start"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"set threading with %d threads", num)
|
||||
self.threader.threads_max = num
|
||||
if num > 0:
|
||||
sys.setcheckinterval(50)
|
||||
else:
|
||||
sys.setcheckinterval(100)
|
||||
|
||||
def check_url (self, url_data):
|
||||
"""start new thread checking the given url"""
|
||||
self.threader.start_thread(url_data.check, ())
|
||||
|
||||
def append_url (self, url_data):
|
||||
"""add new url to list of urls to check"""
|
||||
# check syntax
|
||||
if not url_data.check_syntax():
|
||||
# wrong syntax, do not check any further
|
||||
return
|
||||
# check the cache
|
||||
if self.cache.check_cache(url_data):
|
||||
# already cached
|
||||
self.logger_new_url(url_data)
|
||||
return
|
||||
self.lock.acquire()
|
||||
try:
|
||||
self.urls.append(url_data)
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
def finished (self):
|
||||
"""return True if checking is finished"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
return self.threader.finished() and len(self.urls) <= 0
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
def get_url (self):
|
||||
"""get first url in queue and return it"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
if not self.urls:
|
||||
return None
|
||||
u = self.urls[0]
|
||||
del self.urls[0]
|
||||
return u
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
def finish (self):
|
||||
"""finish checking and send end-of-output message to logger"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
self.threader.finish()
|
||||
finally:
|
||||
self.lock.release()
|
||||
self.logger_end_output()
|
||||
|
||||
def print_status (self, curtime, start_time):
|
||||
"""print check status looking at url queues"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
active = self.threader.active_threads()
|
||||
links = self.linknumber
|
||||
tocheck = len(self.urls)
|
||||
duration = linkcheck.strformat.strduration(curtime - start_time)
|
||||
print >> sys.stderr, _("%5d urls queued, %4d links checked, "\
|
||||
"%2d active threads, runtime %s")\
|
||||
% (tocheck, links, active, duration)
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
def logger_start_output (self):
|
||||
"""start output of all configured loggers"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
if not self.config['quiet']:
|
||||
self.logger.start_output()
|
||||
for logger in self.fileoutput:
|
||||
logger.start_output()
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
def logger_new_url (self, url_data):
|
||||
"""send new url to all configured loggers"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
self.linknumber += 1
|
||||
do_filter = (self.linknumber % 1000) == 0
|
||||
if not self.config['quiet'] and \
|
||||
(self.config["verbose"] or not url_data.valid or
|
||||
(url_data.warning and self.config["warnings"])):
|
||||
self.logger.new_url(url_data)
|
||||
for log in self.fileoutput:
|
||||
log.new_url(url_data)
|
||||
finally:
|
||||
self.lock.release()
|
||||
# XXX deadlock!
|
||||
#if do_filter:
|
||||
# self.filter_url_queue()
|
||||
|
||||
def logger_end_output (self):
|
||||
"""end output of all configured loggers"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
if not self.config['quiet']:
|
||||
self.logger.end_output(linknumber=self.linknumber)
|
||||
for logger in self.fileoutput:
|
||||
logger.end_output(linknumber=self.linknumber)
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
def active_threads (self):
|
||||
"""return number of active threads"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
return self.threader.active_threads()
|
||||
finally:
|
||||
self.lock.release()
|
||||
|
||||
|
|
@ -53,15 +53,12 @@ def get_index_html (dirname):
|
|||
class FileUrl (urlbase.UrlBase):
|
||||
"Url link with file scheme"
|
||||
|
||||
def __init__ (self,
|
||||
base_url,
|
||||
config,
|
||||
recursion_level,
|
||||
def __init__ (self, base_url, recursion_level, consumer,
|
||||
parent_url = None,
|
||||
base_ref = None, line=0, column=0, name=""):
|
||||
super(FileUrl, self).__init__(base_url, config, recursion_level,
|
||||
parent_url=parent_url, base_ref=base_ref,
|
||||
line=line, column=column, name=name)
|
||||
super(FileUrl, self).__init__(base_url, recursion_level, consumer,
|
||||
parent_url=parent_url, base_ref=base_ref,
|
||||
line=line, column=column, name=name)
|
||||
if not (parent_url or base_ref or self.base_url.startswith("file:")):
|
||||
self.base_url = os.path.expanduser(self.base_url)
|
||||
if not self.base_url.startswith("/"):
|
||||
|
|
|
|||
|
|
@ -32,11 +32,11 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
|
||||
def check_connection (self):
|
||||
# proxy support (we support only http)
|
||||
self.set_proxy(self.config["proxy"].get(self.scheme))
|
||||
self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
|
||||
if self.proxy:
|
||||
http = httpurl.HttpUrl(self.base_url,
|
||||
self.recursion_level,
|
||||
self.config,
|
||||
self.consumer.config,
|
||||
parent_url=self.parent_url,
|
||||
base_ref=self.base_ref,
|
||||
line=self.line,
|
||||
|
|
@ -80,7 +80,7 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
# ready to connect
|
||||
try:
|
||||
self.url_connection = ftplib.FTP()
|
||||
if self.config.get("debug"):
|
||||
if self.consumer.config.get("debug"):
|
||||
self.url_connection.set_debuglevel(1)
|
||||
self.url_connection.connect(self.urlparts[1])
|
||||
self.url_connection.login(_user, _password)
|
||||
|
|
|
|||
|
|
@ -28,4 +28,4 @@ class HttpsUrl (httpurl.HttpUrl):
|
|||
super(HttpsUrl, self).local_check()
|
||||
else:
|
||||
self.add_warning(_("%s url ignored")%self.scheme.capitalize())
|
||||
self.log_me()
|
||||
self.consumer.logger_new_url(self)
|
||||
|
|
|
|||
|
|
@ -46,11 +46,11 @@ _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
|
|||
class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
||||
"Url link with http scheme"
|
||||
|
||||
def __init__ (self, base_url, recursion_level, config, parent_url=None,
|
||||
base_ref=None, line=0, column=0, name=""):
|
||||
super(HttpUrl, self).__init__(base_url, recursion_level, config,
|
||||
parent_url=parent_url, base_ref=base_ref, line=line,
|
||||
column=column, name=name)
|
||||
def __init__ (self, base_url, recursion_level, consumer,
|
||||
parent_url=None, base_ref=None, line=0, column=0, name=""):
|
||||
super(HttpUrl, self).__init__(base_url, recursion_level, consumer,
|
||||
parent_url=parent_url, base_ref=base_ref, line=line,
|
||||
column=column, name=name)
|
||||
self.aliases = []
|
||||
self.max_redirects = 5
|
||||
self.has301status = False
|
||||
|
|
@ -109,13 +109,13 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
| extension-code
|
||||
"""
|
||||
# set the proxy, so a 407 status after this is an error
|
||||
self.set_proxy(self.config["proxy"].get(self.scheme))
|
||||
self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
|
||||
if self.proxy:
|
||||
self.add_info(_("Using Proxy %r") % self.proxy)
|
||||
self.headers = None
|
||||
self.auth = None
|
||||
self.cookies = []
|
||||
if not self.robots_txt_allows_url():
|
||||
if not self.consumer.cache.robots_txt_allows_url(self):
|
||||
self.add_warning(
|
||||
_("Access denied by robots.txt, checked only syntax"))
|
||||
return
|
||||
|
|
@ -235,6 +235,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
self.set_result(
|
||||
_("recursive redirection encountered:\n %s") % \
|
||||
"\n => ".join(redirect_cache), valid=False)
|
||||
self.consumer.logger_new_url(self)
|
||||
return -1, response
|
||||
redirect_cache.append(redirected)
|
||||
# remember this alias
|
||||
|
|
@ -252,11 +253,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
self.has301status = True
|
||||
self.aliases.append(redirected)
|
||||
# check cache again on possibly changed URL
|
||||
key = self.get_cache_key()
|
||||
if self.config.url_cache_has_key(key):
|
||||
self.copy_from_cache(self.config.url_cache_get(key))
|
||||
self.cached = True
|
||||
self.log_me()
|
||||
if self.consumer.cache.check_cache(self):
|
||||
self.consumer.logger_new_url(self)
|
||||
return -1, response
|
||||
# check if we still have a http url, it could be another
|
||||
# scheme, eg https or news
|
||||
|
|
@ -266,15 +264,14 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
"the original url was %r.") % self.url)
|
||||
# make new Url object
|
||||
newobj = linkcheck.checker.get_url_from(
|
||||
redirected, self.recursion_level, self.config,
|
||||
redirected, self.recursion_level, self.consumer,
|
||||
parent_url=self.parent_url, base_ref=self.base_ref,
|
||||
line=self.line, column=self.column, name=self.name)
|
||||
newobj.warning = self.warning
|
||||
newobj.info = self.info
|
||||
# append new object to queue
|
||||
self.config.append_url(newobj)
|
||||
self.consumer.append_url(newobj)
|
||||
# pretend to be finished and logged
|
||||
self.cached = True
|
||||
return -1, response
|
||||
# new response data
|
||||
response = self._get_http_response()
|
||||
|
|
@ -302,10 +299,10 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
# no content
|
||||
self.add_warning(response.reason)
|
||||
# store cookies for valid links
|
||||
if self.config['cookies']:
|
||||
if self.consumer.config['cookies']:
|
||||
for c in self.cookies:
|
||||
self.add_info("Cookie: %s" % c)
|
||||
out = self.config.storeCookies(self.headers, self.urlparts[1])
|
||||
out = self.consumer.config.storeCookies(self.headers, self.urlparts[1])
|
||||
for h in out:
|
||||
self.add_info(h)
|
||||
if response.status >= 200:
|
||||
|
|
@ -335,14 +332,16 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
if self.url_connection:
|
||||
self.close_connection()
|
||||
self.url_connection = self.get_http_object(host, scheme)
|
||||
url = urlparse.urlunsplit(self.urlparts)
|
||||
if self.no_anchor:
|
||||
qurlparts[4] = ''
|
||||
anchor = ''
|
||||
else:
|
||||
anchor = self.urlparts[4]
|
||||
if self.proxy:
|
||||
path = urlparse.urlunsplit(self.urlparts)
|
||||
path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
|
||||
self.urlparts[2], self.urlparts[3], anchor))
|
||||
else:
|
||||
path = urlparse.urlunsplit(('', '', self.urlparts[2],
|
||||
self.urlparts[3], self.urlparts[4]))
|
||||
self.urlparts[3], anchor))
|
||||
self.url_connection.putrequest(self.method, path, skip_host=True)
|
||||
self.url_connection.putheader("Host", host)
|
||||
# userinfo is from http://user:pass@host/
|
||||
|
|
@ -360,8 +359,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
linkcheck.configuration.UserAgent)
|
||||
self.url_connection.putheader("Accept-Encoding",
|
||||
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
|
||||
if self.config['cookies']:
|
||||
self.cookies = self.config.getCookies(self.urlparts[1],
|
||||
if self.consumer.config['cookies']:
|
||||
self.cookies = self.consumer.config.getCookies(self.urlparts[1],
|
||||
self.urlparts[2])
|
||||
for c in self.cookies:
|
||||
self.url_connection.putheader("Cookie", c)
|
||||
|
|
@ -375,7 +374,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
h = linkcheck.httplib2.HTTPSConnection(host)
|
||||
else:
|
||||
raise linkcheck.LinkCheckerError("invalid url scheme %s" % scheme)
|
||||
if self.config.get("debug"):
|
||||
if self.consumer.config.get("debug"):
|
||||
h.set_debuglevel(1)
|
||||
h.connect()
|
||||
return h
|
||||
|
|
@ -447,15 +446,3 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
def get_robots_txt_url (self):
|
||||
return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])
|
||||
|
||||
def robots_txt_allows_url (self):
|
||||
roboturl = self.get_robots_txt_url()
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "robots.txt url %r",
|
||||
roboturl)
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "url %r", self.url)
|
||||
if not self.config.robots_txt_cache_has_key(roboturl):
|
||||
rp = linkcheck.robotparser2.RobotFileParser()
|
||||
rp.set_url(roboturl)
|
||||
rp.read()
|
||||
self.config.robots_txt_cache_set(roboturl, rp)
|
||||
rp = self.config.robots_txt_cache_get(roboturl)
|
||||
return rp.can_fetch(linkcheck.configuration.UserAgent, self.url)
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ class IgnoredUrl (urlbase.UrlBase):
|
|||
|
||||
def local_check (self):
|
||||
self.add_warning(_("%s url ignored")%self.scheme.capitalize())
|
||||
self.log_me()
|
||||
self.consumer.logger_new_url(self)
|
||||
|
||||
def can_get_content (self):
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@
|
|||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import urllib
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ class NntpUrl (urlbase.UrlBase):
|
|||
linkcheck.log.debug(linkcheck.LOG_CHECK, self.urlparts)
|
||||
|
||||
def check_connection (self):
|
||||
nntpserver = self.urlparts[1] or self.config["nntpserver"]
|
||||
nntpserver = self.urlparts[1] or self.consumer.config["nntpserver"]
|
||||
if not nntpserver:
|
||||
self.add_warning(_("No NNTP server specified, skipping this URL"))
|
||||
return
|
||||
|
|
|
|||
|
|
@ -49,14 +49,14 @@ class TelnetUrl (urlconnect.UrlConnect):
|
|||
def local_check (self):
|
||||
if not self.host:
|
||||
self.set_result(_("Host is empty"), valid=False)
|
||||
self.log_me()
|
||||
self.consumer.logger_new_url(self)
|
||||
return
|
||||
super(TelnetUrl, self).local_check()
|
||||
|
||||
def check_connection (self):
|
||||
super(TelnetUrl, self).check_connection()
|
||||
self.url_connection = telnetlib.Telnet()
|
||||
if self.config.get("debug"):
|
||||
if self.consumer.config.get("debug"):
|
||||
self.url_connection.set_debuglevel(1)
|
||||
self.url_connection.open(self.host, self.port)
|
||||
if self.user:
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ def print_app_info ():
|
|||
class UrlBase (object):
|
||||
"""An URL with additional information like validity etc."""
|
||||
|
||||
def __init__ (self, base_url, recursion_level, config,
|
||||
def __init__ (self, base_url, recursion_level, consumer,
|
||||
parent_url = None, base_ref = None,
|
||||
line = 0, column = 0, name = ""):
|
||||
"""Initialize check data, and store given variables.
|
||||
|
|
@ -100,8 +100,9 @@ class UrlBase (object):
|
|||
self.parent_url = parent_url
|
||||
self.anchor = None
|
||||
self.recursion_level = recursion_level
|
||||
self.config = config
|
||||
self.consumer = consumer
|
||||
self.result = ""
|
||||
self.cached = False
|
||||
self.valid = True
|
||||
self.warning = linkcheck.containers.SetList()
|
||||
self.info = linkcheck.containers.SetList()
|
||||
|
|
@ -111,7 +112,6 @@ class UrlBase (object):
|
|||
self.dltime = -1
|
||||
self.dlsize = -1
|
||||
self.checktime = 0
|
||||
self.cached = False
|
||||
self.url_connection = None
|
||||
self.extern = (1, 0)
|
||||
self.data = None
|
||||
|
|
@ -169,6 +169,7 @@ class UrlBase (object):
|
|||
self.info.extend(cache_data["info"])
|
||||
self.valid = cache_data["valid"]
|
||||
self.dltime = cache_data["dltime"]
|
||||
self.cached = True
|
||||
|
||||
def get_cache_data (self):
|
||||
"""return all data values that should be put in the cache"""
|
||||
|
|
@ -186,13 +187,12 @@ class UrlBase (object):
|
|||
return [key]
|
||||
|
||||
def is_cached (self):
|
||||
key = self.get_cache_key()
|
||||
return self.cached or self.config.url_seen_has_key(key)
|
||||
return self.consumer.cache.url_is_cached(self.get_cache_key())
|
||||
|
||||
def get_cache_key (self):
|
||||
# note: the host is already lowercase
|
||||
if self.urlparts:
|
||||
if self.config["anchorcaching"]:
|
||||
if self.consumer.config["anchorcaching"]:
|
||||
# do not ignore anchor
|
||||
return urlparse.urlunsplit(self.urlparts)
|
||||
else:
|
||||
|
|
@ -200,16 +200,6 @@ class UrlBase (object):
|
|||
return urlparse.urlunsplit(self.urlparts[:4]+[''])
|
||||
return None
|
||||
|
||||
def put_in_cache (self):
|
||||
"""put url data into cache"""
|
||||
if self.is_cached():
|
||||
# another thread was faster and cached this url already
|
||||
return
|
||||
data = self.get_cache_data()
|
||||
for key in self.get_cache_keys():
|
||||
self.config.url_cache_set(key, data)
|
||||
self.config.url_seen_set(key)
|
||||
|
||||
def build_url (self):
|
||||
# make url absolute
|
||||
if self.base_ref:
|
||||
|
|
@ -236,14 +226,6 @@ class UrlBase (object):
|
|||
# save anchor for later checking
|
||||
self.anchor = self.urlparts[4]
|
||||
|
||||
def log_me (self):
|
||||
"""announce the url data as checked to the configured loggers"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "logging url")
|
||||
self.config.increment_linknumber()
|
||||
if self.config["verbose"] or not self.valid or \
|
||||
(self.warning and self.config["warnings"]):
|
||||
self.config.logger_new_url(self)
|
||||
|
||||
def check (self):
|
||||
try:
|
||||
self.local_check()
|
||||
|
|
@ -260,28 +242,30 @@ class UrlBase (object):
|
|||
|
||||
def local_check (self):
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
|
||||
if self.recursion_level and self.config['wait']:
|
||||
if self.recursion_level and self.consumer.config['wait']:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"sleeping for %d seconds", self.config['wait'])
|
||||
time.sleep(self.config['wait'])
|
||||
"sleeping for %d seconds", self.consumer.config['wait'])
|
||||
time.sleep(self.consumer.config['wait'])
|
||||
t = time.time()
|
||||
if not self.check_cache():
|
||||
if self.consumer.cache.check_cache(self):
|
||||
# was cached from previous queue member
|
||||
self.consumer.logger_new_url(self)
|
||||
return
|
||||
# apply filter
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "extern=%s", self.extern)
|
||||
if self.extern[0] and (self.config["strict"] or self.extern[1]):
|
||||
if self.extern[0] and (self.consumer.config["strict"] or self.extern[1]):
|
||||
self.add_warning(
|
||||
_("outside of domain filter, checked only syntax"))
|
||||
self.log_me()
|
||||
self.consumer.logger_new_url(self)
|
||||
return
|
||||
|
||||
# check connection
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking connection")
|
||||
try:
|
||||
self.check_connection()
|
||||
if self.cached:
|
||||
if self.is_cached():
|
||||
return
|
||||
if self.config["anchors"]:
|
||||
if self.consumer.config["anchors"]:
|
||||
self.check_anchors()
|
||||
except tuple(linkcheck.checker.ExcList):
|
||||
etype, evalue, etb = sys.exc_info()
|
||||
|
|
@ -296,7 +280,7 @@ class UrlBase (object):
|
|||
self.set_result(str(evalue), valid=False)
|
||||
|
||||
# check content
|
||||
warningregex = self.config["warningregex"]
|
||||
warningregex = self.consumer.config["warningregex"]
|
||||
if warningregex and self.valid:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content")
|
||||
try:
|
||||
|
|
@ -323,40 +307,37 @@ class UrlBase (object):
|
|||
valid=False)
|
||||
# close
|
||||
self.close_connection()
|
||||
self.log_me()
|
||||
self.consumer.logger_new_url(self)
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "caching")
|
||||
self.put_in_cache()
|
||||
self.consumer.cache.url_data_cache_add(self)
|
||||
|
||||
def check_syntax (self):
|
||||
"""Called before self.check(), this function inspects the
|
||||
url syntax. Success enables further checking, failure
|
||||
immediately logs this url. This syntax check must not
|
||||
use any network resources.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
|
||||
if not self.base_url:
|
||||
self.set_result(_("URL is empty"), valid=False)
|
||||
self.log_me()
|
||||
self.consumer.logger_new_url(self)
|
||||
return False
|
||||
if ws_at_start_or_end(self.base_url):
|
||||
# leading or trailing whitespace is common, so make a
|
||||
# separate error message for this
|
||||
self.set_result(_("URL has whitespace at beginning or end"),
|
||||
valid=False)
|
||||
self.log_me()
|
||||
self.consumer.logger_new_url(self)
|
||||
return False
|
||||
try:
|
||||
self.build_url()
|
||||
self.extern = self._get_extern()
|
||||
except linkcheck.LinkCheckerError, msg:
|
||||
self.set_result(str(msg), valid=False)
|
||||
self.log_me()
|
||||
self.consumer.logger_new_url(self)
|
||||
return False
|
||||
return True
|
||||
|
||||
def check_cache (self):
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking cache")
|
||||
for key in self.get_cache_keys():
|
||||
if self.config.url_cache_has_key(key):
|
||||
self.copy_from_cache(self.config.url_cache_get(key))
|
||||
self.cached = True
|
||||
self.log_me()
|
||||
return False
|
||||
return True
|
||||
|
||||
def close_connection (self):
|
||||
"""close an opened url connection"""
|
||||
# brute force closing
|
||||
|
|
@ -379,8 +360,8 @@ class UrlBase (object):
|
|||
self.is_parseable() and \
|
||||
self.can_get_content() and \
|
||||
not self.is_cached() and \
|
||||
(self.config["recursionlevel"] < 0 or
|
||||
self.recursion_level < self.config["recursionlevel"]) and \
|
||||
(self.consumer.config["recursionlevel"] < 0 or
|
||||
self.recursion_level < self.consumer.config["recursionlevel"]) and \
|
||||
not self.extern[0] and self.content_allows_robots()
|
||||
|
||||
def content_allows_robots (self):
|
||||
|
|
@ -418,19 +399,19 @@ class UrlBase (object):
|
|||
self.add_warning(_("anchor #%s not found") % self.anchor)
|
||||
|
||||
def _get_extern (self):
|
||||
if not (self.config["externlinks"] or self.config["internlinks"]):
|
||||
if not (self.consumer.config["externlinks"] or self.consumer.config["internlinks"]):
|
||||
return (0, 0)
|
||||
# deny and allow external checking
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Url %r", self.url)
|
||||
if self.config["denyallow"]:
|
||||
for entry in self.config["externlinks"]:
|
||||
if self.consumer.config["denyallow"]:
|
||||
for entry in self.consumer.config["externlinks"]:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
|
||||
entry)
|
||||
match = entry['pattern'].search(self.url)
|
||||
if (entry['negate'] and not match) or \
|
||||
(match and not entry['negate']):
|
||||
return (1, entry['strict'])
|
||||
for entry in self.config["internlinks"]:
|
||||
for entry in self.consumer.config["internlinks"]:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
|
||||
entry)
|
||||
match = entry['pattern'].search(self.url)
|
||||
|
|
@ -439,14 +420,14 @@ class UrlBase (object):
|
|||
return (0, 0)
|
||||
return (0, 0)
|
||||
else:
|
||||
for entry in self.config["internlinks"]:
|
||||
for entry in self.consumer.config["internlinks"]:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
|
||||
entry)
|
||||
match = entry['pattern'].search(self.url)
|
||||
if (entry['negate'] and not match) or \
|
||||
(match and not entry['negate']):
|
||||
return (0, 0)
|
||||
for entry in self.config["externlinks"]:
|
||||
for entry in self.consumer.config["externlinks"]:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
|
||||
entry)
|
||||
match = entry['pattern'].search(self.url)
|
||||
|
|
@ -482,7 +463,7 @@ class UrlBase (object):
|
|||
def check_size (self):
|
||||
"""if a maximum size was given, call this function to check it
|
||||
against the content size of this url"""
|
||||
maxbytes = self.config["warnsizebytes"]
|
||||
maxbytes = self.consumer.config["warnsizebytes"]
|
||||
if maxbytes is not None and self.dlsize >= maxbytes:
|
||||
self.add_warning(_("Content size %s is larger than %s") % \
|
||||
(linkcheck.strformat.strsize(self.dlsize),
|
||||
|
|
@ -497,7 +478,7 @@ class UrlBase (object):
|
|||
self.parse_html()
|
||||
|
||||
def get_user_password (self):
|
||||
for auth in self.config["authentication"]:
|
||||
for auth in self.consumer.config["authentication"]:
|
||||
if auth['pattern'].match(self.url):
|
||||
return auth['user'], auth['password']
|
||||
return None, None
|
||||
|
|
@ -535,10 +516,10 @@ class UrlBase (object):
|
|||
base = base_ref
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Put url %r in queue",
|
||||
url)
|
||||
self.config.append_url(linkcheck.checker.get_url_from(url,
|
||||
self.recursion_level+1, self.config,
|
||||
parent_url=self.url, base_ref=base,
|
||||
line=line, column=column, name=name))
|
||||
self.consumer.append_url(linkcheck.checker.get_url_from(url,
|
||||
self.recursion_level+1, self.consumer,
|
||||
parent_url=self.url, base_ref=base,
|
||||
line=line, column=column, name=name))
|
||||
|
||||
def parse_opera (self):
|
||||
"""parse an opera bookmark file"""
|
||||
|
|
@ -553,8 +534,9 @@ class UrlBase (object):
|
|||
elif line.startswith("URL="):
|
||||
url = line[4:]
|
||||
if url:
|
||||
self.config.append_url(linkcheck.checker.get_url_from(url,
|
||||
self.recursion_level+1, self.config, self.url, None, lineno, name))
|
||||
self.consumer.append_url(linkcheck.checker.get_url_from(url,
|
||||
self.recursion_level+1, self.consumer,
|
||||
self.url, None, lineno, name))
|
||||
name = ""
|
||||
|
||||
def parse_text (self):
|
||||
|
|
@ -567,9 +549,9 @@ class UrlBase (object):
|
|||
lineno += 1
|
||||
line = line.strip()
|
||||
if not line or line.startswith('#'): continue
|
||||
self.config.append_url(
|
||||
self.consumer.append_url(
|
||||
linkcheck.checker.get_url_from(line, self.recursion_level+1,
|
||||
self.config, parent_url=self.url, line=lineno))
|
||||
self.consumer, parent_url=self.url, line=lineno))
|
||||
|
||||
def parse_css (self):
|
||||
"""parse a CSS file for url() patterns"""
|
||||
|
|
@ -578,9 +560,9 @@ class UrlBase (object):
|
|||
lineno += 1
|
||||
for mo in linkcheck.linkparse.css_url_re.finditer(line):
|
||||
column = mo.start("url")
|
||||
self.config.append_url(
|
||||
self.consumer.append_url(
|
||||
linkcheck.checker.get_url_from(mo.group("url"),
|
||||
self.recursion_level+1, self.config,
|
||||
self.recursion_level+1, self.consumer,
|
||||
parent_url=self.url, line=lineno, column=column))
|
||||
|
||||
def __str__ (self):
|
||||
|
|
@ -590,7 +572,6 @@ class UrlBase (object):
|
|||
"base_url=%s" % self.base_url,
|
||||
"parent_url=%s" % self.parent_url,
|
||||
"base_ref=%s" % self.base_ref,
|
||||
"cached=%s" % self.cached,
|
||||
"recursion_level=%s" % self.recursion_level,
|
||||
"url_connection=%s" % self.url_connection,
|
||||
"line=%s" % self.line,
|
||||
|
|
|
|||
|
|
@ -27,9 +27,9 @@ from linkcheck.i18n import _
|
|||
class UrlConnect (urlbase.UrlBase):
|
||||
"""Url link for which we have to connect to a specific host"""
|
||||
|
||||
def __init__ (self, base_url, recursion_level, config, parent_url=None,
|
||||
base_ref=None, line=0, column=0, name=""):
|
||||
super(UrlConnect, self).__init__(base_url, recursion_level, config,
|
||||
def __init__ (self, base_url, recursion_level, consumer,
|
||||
parent_url=None, base_ref=None, line=0, column=0, name=""):
|
||||
super(UrlConnect, self).__init__(base_url, recursion_level, consumer,
|
||||
parent_url=parent_url, base_ref=base_ref,
|
||||
line=line, column=column, name=name)
|
||||
self.host = None
|
||||
|
|
|
|||
Loading…
Reference in a new issue