Replace the old threading algorithm with a new one based on Queue.Queue and consumer threads
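
The new design is the classic producer/consumer pattern. A minimal,
self-contained sketch of the idea (illustrative only: the real code uses
the UrlQueue class added in this commit, and the check function and URL
list below are placeholders):

    import threading
    import Queue

    def check (url):
        print "checking", url            # placeholder for the real URL check

    def worker (urlqueue):
        while True:
            url = urlqueue.get()
            if url is None:              # sentinel: no more work
                break
            check(url)

    urlqueue = Queue.Queue()
    workers = [threading.Thread(target=worker, args=(urlqueue,))
               for _ in xrange(10)]      # constant number of consumer threads
    for w in workers:
        w.start()
    for url in ["http://example.com/", "http://example.org/"]:
        urlqueue.put(url)
    for w in workers:                    # one sentinel per worker, then wait
        urlqueue.put(None)
    for w in workers:
        w.join()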

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3146 e7d03fd6-7b0d-0410-9947-9c21f3af8025
calvin 2006-05-13 13:44:52 +00:00
parent d05c68ef74
commit f002c5f965
28 changed files with 684 additions and 326 deletions


@@ -14,6 +14,12 @@
Changed: linkcheck/HtmlParser/htmllex.[lc],
linkcheck/tests/test_parser.py
* Revamp the threading algorithm by using a URL queue, with a
constant number of consumer threads called 'workers'.
This fixes the remaining "deque mutated during iteration" errors.
Type: feature
Changed: *.py
3.4 "The Chumscrubbers" (released 4.2.2006)
* Ignore decoding errors when retrieving the robots.txt URL.


@@ -37,6 +37,7 @@ include doc/de/*.1
include doc/fr/*.1
include doc/Makefile doc/rest2htmlnav
recursive-include linkcheck/checker/tests/data *.txt *.html *.result *.asc *.css *.ico
recursive-include linkcheck/configuration/tests/data *.ini
include linkcheck/tests/*.py
include linkcheck/checker/tests/*.py
include linkcheck/dns/tests/*.py

TODO

@@ -1,7 +1,11 @@
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/483752
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/475160
- Improved print_status
- use format_time from quodlibet for times
- Ctrl-C is not really working.
- To limit the used memory, put a maximum size on the URL queue
(e.g. 20000 URLs). If reached, a worker calling queue.put() will
wait for another worker to call queue.get() before continuing.
Problem: deadlock when all workers are blocked in queue.put().
A possible workaround is sketched after this list.
- [FEATURE] postmortem debugging with pdb.pm()
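
A possible workaround for the deadlock noted above (a sketch, not
something LinkChecker does here): a worker that would block on a full
bounded queue can process the item itself instead of enqueueing it.

    import Queue

    def put_or_check (urlqueue, url_data, check):
        # Never block on a bounded queue: if it is full, process the
        # item in the calling worker instead of waiting for a get().
        try:
            urlqueue.put_nowait(url_data)
        except Queue.Full:
            check(url_data)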


@@ -58,6 +58,17 @@ class LinkCheckerError (Exception):
pass
def add_intern_pattern (url_data, config):
"""
Add intern URL regex to config.
"""
pat = url_data.get_intern_pattern()
if pat:
assert linkcheck.log.debug(LOG_CHECK,
"Add intern pattern %r from command line", pat)
config['internlinks'].append(get_link_pat(pat))
def get_link_pat (arg, strict=False):
"""
Get a link pattern matcher for intern/extern links.

linkcheck/cache/__init__.py

@@ -0,0 +1,19 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and provide cached data during checking in a thread-safe manner.
"""

linkcheck/cache/connection.py

@@ -0,0 +1,111 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and retrieve open connections.
"""
import time
import threading
from linkcheck.decorators import synchronized
# lock for robots.txt caching
_lock = threading.Lock()
class ConnectionPool (object):
"""
Thread-safe cache, storing a set of connections for URL retrieval.
"""
def __init__ (self):
"""
Initialize an empty connection dictionary which will have entries
of the form::
key -> [connection, status, expiration time]
Connection can be any open connection object (HTTP, FTP, ...).
Status is either 'available' or 'busy'.
Expiration time is the point of time in seconds when this
connection will be timed out.
The identifier key is usually a tuple (type, host, user, pass),
but it can be any immutable Python object.
"""
# open connections
# {(type, host, user, pass) -> [connection, status, expiration time]}
self.connections = {}
@synchronized(_lock)
def add (self, key, conn, timeout):
"""
Add connection to the pool with given identifier key and timeout
in seconds.
"""
self.connections[key] = [conn, 'available', time.time() + timeout]
@synchronized(_lock)
def get (self, key):
"""
Get an open connection if one is available, waiting at most
30 seconds for a busy connection to be released.
@return: Open connection object, or None if none is available.
@rtype: None or FTPConnection or HTTP(S)Connection
"""
if key not in self.connections:
# not found
return None
conn_data = self.connections[key]
t = time.time()
if t > conn_data[2]:
# timed out
try:
    # close the stale connection object (index 0), not its status
    conn_data[0].close()
except:
    # ignore close errors
    pass
del self.connections[key]
return None
# wait at most 300*0.1=30 seconds for connection to become available
for dummy in xrange(300):
if conn_data[1] != 'busy':
conn_data[1] = 'busy'
conn_data[2] = t
return conn_data[0]
time.sleep(0.1)
# connection is in use
return None
@synchronized(_lock)
def release (self, key):
"""
Mark an open and reusable connection as available.
"""
if key in self.connections:
self.connections[key][1] = 'available'
@synchronized(_lock)
def expire_connections (self):
"""
Remove expired connections from this pool.
"""
t = time.time()
to_delete = []
for key, conn_data in self.connections.iteritems():
if conn_data[1] == 'available' and t > conn_data[2]:
to_delete.append(key)
for key in to_delete:
del self.connections[key]
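
A usage sketch for the pool as defined above (the host, credentials and
60-second timeout are illustrative):

    import ftplib

    pool = ConnectionPool()
    key = ("ftp", "ftp.example.com", "anonymous", "guest")
    conn = pool.get(key)                 # None on first use or after expiry
    if conn is None:
        conn = ftplib.FTP("ftp.example.com")
        pool.add(key, conn, 60.0)        # reusable for 60 seconds
    try:
        conn.voidcmd("NOOP")             # ... use the connection ...
    finally:
        pool.release(key)                # mark it available again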

linkcheck/cache/cookie.py

@@ -0,0 +1,73 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and retrieve cookies.
"""
import threading
from linkcheck.decorators import synchronized
import linkcheck
import linkcheck.log
import linkcheck.cookies
# lock for caching
_lock = threading.Lock()
class CookieJar (object):
"""
Cookie storage, implementing the default cookie handling policy for
LinkChecker.
"""
def __init__ (self):
self.cache = {}
@synchronized(_lock)
def add (self, headers, scheme, host, path):
"""
Parse cookie values, add to cache.
"""
jar = set()
for h in headers.getallmatchingheaders("Set-Cookie"):
# RFC 2109 (Netscape) cookie type
try:
c = linkcheck.cookies.NetscapeCookie(h, scheme, host, path)
jar.add(c)
except linkcheck.cookies.CookieError:
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Invalid cookie header for %s:%s%s: %r", scheme, host, path, h)
for h in headers.getallmatchingheaders("Set-Cookie2"):
# RFC 2965 cookie type
try:
c = linkcheck.cookies.Rfc2965Cookie(h, scheme, host, path)
jar.add(c)
except linkcheck.cookies.CookieError:
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h)
self.cache[host] = jar
return jar
@synchronized(_lock)
def get (self, scheme, host, port, path):
"""
Cookie cache getter function.
"""
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Get cookies for host %r path %r", host, path)
jar = self.cache.setdefault(host, set())
return [x for x in jar if x.check_expired() and \
x.is_valid_for(scheme, host, port, path)]
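
A usage sketch (host illustrative): the msg attribute of a httplib
response is a mimetools.Message and therefore provides the
getallmatchingheaders() method that add() relies on.

    import httplib

    jar = CookieJar()
    conn = httplib.HTTPConnection("www.example.com")
    conn.request("GET", "/")
    response = conn.getresponse()
    jar.add(response.msg, "http", "www.example.com", "/")
    # later, when sending another request to the same host:
    cookies = jar.get("http", "www.example.com", 80, "/")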


@@ -15,23 +15,44 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
GeoIP wrapper.
Store and retrieve country names for IPs.
"""
import os
import threading
from linkcheck.decorators import synchronized
def get_country (gi, host):
# I don't know if the geoip library is already thread-safe, but
# we take no risks here.
_lock = threading.Lock()
# initialize GeoIP database
geoip = None
try:
import GeoIP
geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
if os.name == 'posix' and os.path.exists(geoip_dat):
geoip = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
del geoip_dat
except ImportError:
pass
@synchronized(_lock)
def get_country (host):
"""
Get translated country name.
@return: country string or None
"""
c = gi.country_code_by_name(host)
if geoip is None:
return None
c = geoip.country_code_by_name(host)
if c and c in countries:
return "%s, %s" % (c, countries[c])
return None
# GeoIP country map with {short name -> translated full name} entries
countries = {
"AP": "Asia/Pacific Region",
"EU": "Europe",

linkcheck/cache/robots_txt.py

@@ -0,0 +1,52 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Cache robots.txt contents.
"""
import threading
from linkcheck.decorators import synchronized
import linkcheck.robotparser2
import linkcheck.configuration
# lock for caching
_lock = threading.Lock()
class RobotsTxt (object):
"""
Thread-safe cache of downloaded robots.txt files.
format: {cache key (string) -> robots.txt content (RobotFileParser)}
"""
def __init__ (self):
self.cache = {}
@synchronized(_lock)
def allows_url (self, roboturl, url, user, password):
"""
Ask robots.txt allowance.
"""
if roboturl not in self.cache:
rp = linkcheck.robotparser2.RobotFileParser(
user=user, password=password)
rp.set_url(roboturl)
rp.read()
self.cache[roboturl] = rp
else:
rp = self.cache[roboturl]
return rp.can_fetch(linkcheck.configuration.UserAgent, url)
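
Usage sketch (URLs illustrative): the first call downloads and caches
the robots.txt file; later calls for the same robots URL are answered
from the cache.

    robots = RobotsTxt()
    allowed = robots.allows_url("http://www.example.com/robots.txt",
                                "http://www.example.com/private/page.html",
                                None, None)    # no HTTP authentication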

linkcheck/cache/urlqueue.py

@@ -0,0 +1,210 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Handle a queue of URLs to check.
"""
import threading
import Queue
import time
import linkcheck
import linkcheck.log
class UrlQueue (Queue.Queue):
"""
A queue supporting several consumer tasks. The task_done() idea is
from the Python 2.5 Subversion repository.
"""
def __init__ (self, maxsize=0):
"""
Initialize the queue state and task counters.
"""
Queue.Queue.__init__(self, maxsize=maxsize)
self.all_tasks_done = threading.Condition(self.mutex)
self.unfinished_tasks = 0
self.finished_tasks = 0
self.in_progress = {}
self.checked = {}
self.shutdown = False
def get (self):
"""
Get the first URL from the queue that is not already being
checked, and return it. If the dequeued URL is currently in
progress, re-queue it and return None. The returned URL
might already be cached.
"""
self.not_empty.acquire()
try:
while self._empty():
self.not_empty.wait()
url_data = self._get()
key = url_data.cache_url_key
if url_data.has_result:
# Already checked and copied from cache.
pass
elif key in self.checked:
# Already checked; copy the result, deliberately ignoring
# the case where the URL also happens to be in progress.
url_data.copy_from_cache(self.checked[key])
elif key in self.in_progress:
# It's being checked currently; put it back in the queue.
Queue.Queue._put(self, url_data)
url_data = None
else:
self.in_progress[key] = url_data
self.not_full.notify()
return url_data
finally:
self.not_empty.release()
def _put (self, url_data):
"""
Put URL in queue, increase number of unfinished tasks.
"""
if self.shutdown:
# don't accept more URLs
return
key = url_data.cache_url_key
if key in self.checked:
# Put at beginning of queue to get consumed quickly.
url_data.copy_from_cache(self.checked[key])
self.queue.appendleft(url_data)
else:
self.queue.append(url_data)
self.unfinished_tasks += 1
def task_done (self, url_data):
"""
Indicate that a formerly enqueued task is complete.
Used by Queue consumer threads. For each get() used to fetch a task,
a subsequent call to task_done() tells the queue that the processing
on the task is complete.
If a join() is currently blocking, it will resume when all items
have been processed (meaning that a task_done() call was received
for every item that had been put() into the queue).
Raises a ValueError if called more times than there were items
placed in the queue.
"""
self.all_tasks_done.acquire()
try:
if url_data is not None:
key = url_data.cache_url_key
if key is not None and key not in self.checked:
self._cache_url(key, url_data)
self.finished_tasks += 1
unfinished = self.unfinished_tasks - 1
if unfinished <= 0:
if unfinished < 0:
raise ValueError('task_done() called too many times')
self.all_tasks_done.notifyAll()
self.unfinished_tasks = unfinished
finally:
self.all_tasks_done.release()
def _cache_url (self, key, url_data):
"""
Put URL result data into cache.
"""
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Caching %r", key)
assert key in self.in_progress, \
"%r not in %s" % (key, self.in_progress)
del self.in_progress[key]
data = url_data.get_cache_data()
self.checked[key] = data
# check for aliases (eg. through HTTP redirections)
if hasattr(url_data, "aliases"):
data = url_data.get_alias_cache_data()
for key in url_data.aliases:
if key in self.checked or key in self.in_progress:
continue
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Caching alias %r", key)
self.checked[key] = data
def join (self, timeout=None):
"""Blocks until all items in the Queue have been gotten and processed.
The count of unfinished tasks goes up whenever an item is added to the
queue. The count goes down whenever a consumer thread calls task_done()
to indicate the item was retrieved and all work on it is complete.
When the count of unfinished tasks drops to zero, join() unblocks.
"""
self.all_tasks_done.acquire()
try:
if timeout is None:
while self.unfinished_tasks:
self.all_tasks_done.wait()
else:
if timeout < 0:
raise ValueError("'timeout' must be a positive number")
endtime = time.time() + timeout
while self.unfinished_tasks:
remaining = endtime - time.time()
if remaining <= 0.0:
return
self.all_tasks_done.wait(remaining)
finally:
self.all_tasks_done.release()
def do_shutdown (self):
"""
Shutdown the queue by not accepting any more URLs.
"""
self.mutex.acquire()
try:
unfinished = self.unfinished_tasks - len(self.queue)
self.queue.clear()
if unfinished <= 0:
if unfinished < 0:
raise ValueError('shutdown is in error')
self.all_tasks_done.notifyAll()
self.unfinished_tasks = unfinished
self.shutdown = True
finally:
self.mutex.release()
def status (self):
"""
Get tuple (finished tasks, unfinished tasks, queue size).
"""
self.mutex.acquire()
try:
return (self.finished_tasks, self.unfinished_tasks, len(self.queue))
finally:
self.mutex.release()
def checked_redirect (self, redirect, url_data):
"""
Check if redirect is already in cache. Used for URL redirections
to avoid double checking of already cached URLs.
If the redirect URL is found in the cache, the result data is
already copied.
"""
self.mutex.acquire()
try:
if redirect in self.checked:
url_data.copy_from_cache(self.checked[redirect])
return True
return False
finally:
self.mutex.release()
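
A sketch of the consumer protocol described by the docstrings above.
The actual worker threads live in the new linkcheck.director package,
which this commit does not show in full, so the loop below is an
assumption based purely on the UrlQueue API (loop termination elided):

    def worker (urlqueue):
        while True:
            url_data = urlqueue.get()    # blocks while the queue is empty
            if url_data is None:
                # the dequeued URL is being checked by another worker
                # and was re-queued; nothing to do this round
                continue
            try:
                if not url_data.has_result:
                    url_data.check()     # UrlBase.check(), see urlbase.py
            finally:
                # always account for the task so that join() and the
                # shutdown bookkeeping stay consistent
                urlqueue.task_done(url_data)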


@@ -18,19 +18,14 @@
Main functions for link checking.
"""
import time
import sys
import os
import cgi
import socket
import codecs
import traceback
import select
import re
import urllib
import nntplib
import ftplib
import linkcheck.httplib2
import linkcheck.strformat
import linkcheck.dns.exception
@@ -153,110 +148,6 @@ acap # application configuration access protocol
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
_encoding = linkcheck.i18n.default_encoding
stderr = codecs.getwriter(_encoding)(sys.stderr, errors="ignore")
def internal_error ():
"""
Print internal error message to stderr.
"""
print >> stderr, os.linesep
print >> stderr, _("""********** Oops, I did it again. *************
You have found an internal error in LinkChecker. Please write a bug report
at http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913
or send mail to %s and include the following information:
- the URL or file you are testing
- your commandline arguments and/or configuration.
- the output of a debug run with option "-Dall" of the executed command
- the system information below.
Not disclosing some of the information above due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""") % linkcheck.configuration.Email
etype, value = sys.exc_info()[:2]
print >> stderr, etype, value
traceback.print_exc()
print_app_info()
print >> stderr, os.linesep, \
_("******** LinkChecker internal error, over and out ********")
sys.exit(1)
def print_app_info ():
"""
Print system and application info to stderr.
"""
print >> stderr, _("System info:")
print >> stderr, linkcheck.configuration.App
print >> stderr, _("Python %s on %s") % (sys.version, sys.platform)
for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"):
value = os.getenv(key)
if value is not None:
print >> stderr, key, "=", repr(value)
def check_urls (consumer):
"""
Main check function; checks all configured URLs until interrupted
with Ctrl-C. If you call this function more than once, you can specify
different configurations with the consumer parameter.
@param consumer: an object where all runtime-dependent options are
stored
@type consumer: linkcheck.consumer.Consumer
@return: None
"""
try:
_check_urls(consumer)
except (KeyboardInterrupt, SystemExit):
consumer.abort()
except:
consumer.abort()
internal_error()
def _check_urls (consumer):
"""
Checks all configured URLs. Prints status information, calls logger
methods.
@param consumer: an object where all runtime-dependent options are
stored
@type consumer: linkcheck.consumer.Consumer
@return: None
"""
start_time = time.time()
status_time = start_time
while not consumer.finished():
url_data = consumer.incoming_get_url()
if url_data is None:
# wait for incoming queue to fill
time.sleep(0.1)
elif url_data.cached:
# was cached -> can be logged
consumer.log_url(url_data)
else:
# go check this url
if url_data.parent_url and not \
linkcheck.url.url_is_absolute(url_data.base_url):
name = url_data.parent_url
else:
name = u""
if url_data.base_url:
name += url_data.base_url
if not name:
name = None
consumer.check_url(url_data, name)
if consumer.config('status'):
curtime = time.time()
if (curtime - status_time) > 5:
consumer.print_status(curtime, start_time)
status_time = curtime
consumer.end_log_output()
# file extensions we can parse recursively
extensions = {
"html": re.compile(r'(?i)\.s?html?$'),
@@ -298,9 +189,9 @@ def absolute_url (base_url, base_ref, parent_url):
return u""
def get_url_from (base_url, recursion_level, consumer,
def get_url_from (base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=0, column=0,
name=u"", cmdline=False):
name=u"", assume_local=False):
"""
Get url data from given base data.
@@ -308,8 +199,8 @@ def get_url_from (base_url, recursion_level, consumer,
@type base_url: string or None
@param recursion_level: current recursion level
@type recursion_level: number
@param consumer: consumer object
@type consumer: linkcheck.checker.consumer.Consumer
@param aggregate: aggregate object
@type aggregate: linkcheck.checker.aggregate.Consumer
@param parent_url: parent url
@type parent_url: string or None
@param base_ref: base url from <base> tag
@@ -329,7 +220,14 @@ def get_url_from (base_url, recursion_level, consumer,
base_ref = linkcheck.strformat.unicode_safe(base_ref)
name = linkcheck.strformat.unicode_safe(name)
url = absolute_url(base_url, base_ref, parent_url).lower()
# test scheme
klass = get_urlclass_from(url, assume_local)
return klass(base_url, recursion_level, aggregate,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
def get_urlclass_from (url, assume_local):
"""Return checker class for given URL."""
if url.startswith("http:"):
klass = linkcheck.checker.httpurl.HttpUrl
elif url.startswith("ftp:"):
@@ -351,24 +249,13 @@ def get_url_from (base_url, recursion_level, consumer,
elif ignored_schemes_re.search(url):
# ignored url
klass = linkcheck.checker.ignoredurl.IgnoredUrl
elif cmdline:
# assume local file on command line
elif assume_local:
# assume local file
klass = linkcheck.checker.fileurl.FileUrl
else:
# error url, no further checking, just log this
klass = linkcheck.checker.errorurl.ErrorUrl
url_data = klass(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
if cmdline:
# add intern URL regex to config for every URL that was given
# on the command line
pat = url_data.get_intern_pattern()
assert linkcheck.log.debug(linkcheck.LOG_CMDLINE,
"Add intern pattern %r from command line", pat)
if pat:
consumer.config_append('internlinks', linkcheck.get_link_pat(pat))
return url_data
return klass
def get_index_html (urls):


@@ -23,11 +23,13 @@ import os
import time
import urlparse
import urllib
import urllib2
import urlbase
import linkcheck
import linkcheck.log
import linkcheck.checker
import linkcheck.fileutil
# if file extension lookup was unsuccessful, look at the content
contents = {
@@ -83,7 +85,7 @@ class FileUrl (urlbase.UrlBase):
"""
def init (self, base_ref, base_url, parent_url, recursion_level,
consumer, line, column, name):
aggregate, line, column, name):
"""
Besides the usual initialization the URL is normed according
to the platform:
@@ -91,7 +93,7 @@ class FileUrl (urlbase.UrlBase):
- under Windows platform the drive specifier is normed
"""
super(FileUrl, self).init(base_ref, base_url, parent_url,
recursion_level, consumer, line, column, name)
recursion_level, aggregate, line, column, name)
if self.base_url is None:
return
base_url = self.base_url
@@ -129,7 +131,8 @@ class FileUrl (urlbase.UrlBase):
if self.is_directory():
self.set_result(_("directory"))
else:
super(FileUrl, self).check_connection()
url = linkcheck.fileutil.pathencode(self.url)
self.url_connection = urllib2.urlopen(url)
self.check_case_sensitivity()
def check_case_sensitivity (self):
@@ -147,7 +150,6 @@ class FileUrl (urlbase.UrlBase):
"system path %r. You should always use "
"the system path in URLs.") % (path, realpath),
tag="file-system-path")
pass
def get_content (self):
"""
@@ -208,7 +210,7 @@ class FileUrl (urlbase.UrlBase):
path = self.urlparts[2]
if os.name == 'nt':
path = prepare_urlpath_for_nt(path)
return urllib.url2pathname(path)
return linkcheck.fileutil.pathencode(urllib.url2pathname(path))
def is_directory (self):
"""


@@ -53,12 +53,12 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
order: login, changing directory, list the file.
"""
# proxy support (we support only http)
self.set_proxy(self.consumer.config("proxy").get(self.scheme))
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
if self.proxy:
# using a (HTTP) proxy
http = httpurl.HttpUrl(self.base_url,
self.recursion_level,
self.consumer,
self.aggregate,
parent_url=self.parent_url,
base_ref=self.base_ref,
line=self.line,
@@ -87,7 +87,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# ready to connect
_user, _password = self.get_user_password()
key = ("ftp", self.urlparts[1], _user, _password)
conn = self.consumer.get_connection(key)
conn = self.aggregate.connections.get(key)
if conn is not None and conn.sock is not None:
# reuse cached FTP connection
self.url_connection = conn
@@ -248,6 +248,6 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# add to cached connections
_user, _password = self.get_user_password()
key = ("ftp", self.urlparts[1], _user, _password)
cache_add = self.consumer.add_connection
cache_add = self.aggregate.connections.add
cache_add(key, self.url_connection, DEFAULT_TIMEOUT_SECS)
self.url_connection = None


@@ -129,8 +129,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
roboturl = self.get_robots_txt_url()
user, password = self.get_user_password()
return self.consumer.robots_txt_allows_url(roboturl, url,
user, password)
return self.aggregate.robots_txt.allows_url(roboturl, url,
user, password)
def check_connection (self):
"""
@@ -150,7 +150,7 @@
valid request
"""
# set the proxy, so a 407 status after this is an error
self.set_proxy(self.consumer.config("proxy").get(self.scheme))
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
# initialize check data
self.headers = None
self.auth = None
@@ -360,19 +360,19 @@
tag="http-moved-permanent")
self.has301status = True
# check cache again on the changed URL
if self.consumer.checked_redirect(redirected, self):
if self.aggregate.urlqueue.checked_redirect(redirected, self):
return -1, response
# in case of changed scheme make new URL object
if self.urlparts[0] != self.scheme:
newobj = linkcheck.checker.get_url_from(
redirected, self.recursion_level, self.consumer,
redirected, self.recursion_level, self.aggregate,
parent_url=self.parent_url, base_ref=self.base_ref,
line=self.line, column=self.column, name=self.name,
cmdline=False)
assume_local=False)
newobj.warnings = self.warnings
newobj.info = self.info
# append new object to queue
self.consumer.append_url(newobj)
self.aggregate.append_url(newobj)
# pretend to be finished and logged
return -1, response
# new response data
@@ -406,14 +406,14 @@
linkcheck.strformat.unicode_safe(response.reason),
tag="http-empty-content")
# store cookies for valid links
if self.consumer.config('cookies'):
if self.aggregate.config['cookies']:
for c in self.cookies:
self.add_info(_("Store cookie: %s.") % c)
try:
out = self.consumer.store_cookies(self.headers,
self.urlparts[0],
self.urlparts[1],
self.urlparts[2])
out = self.aggregate.cookies.add(self.headers,
self.urlparts[0],
self.urlparts[1],
self.urlparts[2])
for h in out:
self.add_info(linkcheck.strformat.unicode_safe(h))
except Cookie.CookieError, msg:
@@ -471,13 +471,13 @@
linkcheck.configuration.UserAgent)
self.url_connection.putheader("Accept-Encoding",
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
if self.consumer.config('cookies'):
if self.aggregate.config['cookies']:
scheme = self.urlparts[0]
host = self.urlparts[1]
port = linkcheck.url.default_ports.get(scheme, 80)
host, port = urllib.splitnport(host, port)
path = self.urlparts[2]
self.cookies = self.consumer.get_cookies(scheme, host, port, path)
self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
for c in self.cookies:
name = c.client_header_name()
value = c.client_header_value()
@@ -505,7 +505,7 @@
"""
_user, _password = self.get_user_password()
key = (scheme, self.urlparts[1], _user, _password)
conn = self.consumer.get_connection(key)
conn = self.aggregate.connections.get(key)
if conn is not None:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"reuse cached HTTP(S) connection %s", conn)
@@ -634,7 +634,7 @@
# add to cached connections
_user, _password = self.get_user_password()
key = ("http", self.urlparts[1], _user, _password)
cache_add = self.consumer.add_connection
cache_add = self.aggregate.connections.add
# note: only cache the connection when it is persistent
# and all pending content has been received
if not self.persistent or not self.has_content or \


@@ -40,7 +40,7 @@ class NntpUrl (urlbase.UrlBase):
Connect to NNTP server and try to request the URL article
resource (if specified).
"""
nntpserver = self.host or self.consumer.config("nntpserver")
nntpserver = self.host or self.aggregate.config["nntpserver"]
if not nntpserver:
self.add_warning(
_("No NNTP server was specified, skipping this URL."),


@@ -63,7 +63,7 @@ class ProxySupport (object):
"""
Check if self.host is in the no-proxy-for ignore list.
"""
for ro in self.consumer.config("noproxyfor"):
for ro in self.aggregate.config["noproxyfor"]:
if ro.search(self.host):
return True
return False


@@ -59,7 +59,7 @@ class TelnetUrl (urlbase.UrlBase):
label is "login: ", expected password label is "Password: ".
"""
self.url_connection = telnetlib.Telnet()
if self.consumer.config("debug"):
if self.aggregate.config["debug"]:
self.url_connection.set_debuglevel(1)
self.url_connection.open(self.host, self.port)
if self.user:


@@ -25,8 +25,6 @@ import unittest
import linkcheck
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.configuration
import linkcheck.logger
@@ -93,7 +91,17 @@ class TestLogger (linkcheck.logger.Logger):
self.diff.append(line)
def get_test_consumer (confargs, logargs):
def get_file (filename=None):
"""
Get file name located within 'data' directory.
"""
directory = os.path.join("linkcheck", "checker", "tests", "data")
if filename:
return unicode(os.path.join(directory, filename))
return unicode(directory)
def get_test_aggregate (confargs, logargs):
"""
Initialize a test configuration object.
"""
@@ -101,14 +109,15 @@ def get_test_consumer (confargs, logargs):
config.logger_add('test', TestLogger)
config['recursionlevel'] = 1
config['logger'] = config.logger_new('test', **logargs)
# uncomment for debugging
#config.init_logging(debug=["all"])
config["anchors"] = True
config["verbose"] = True
config['threads'] = 0
config['status'] = False
config['cookies'] = True
config['geoip'] = None
config.update(confargs)
cache = linkcheck.checker.cache.Cache()
return linkcheck.checker.consumer.Consumer(config, cache)
return linkcheck.director.get_aggregate(config)
class LinkCheckTest (unittest.TestCase):
@@ -122,21 +131,14 @@ class LinkCheckTest (unittest.TestCase):
"""
return linkcheck.url.url_norm(url)[0]
def get_file (self, filename):
"""
Get file name located within 'data' directory.
"""
return unicode(os.path.join("linkcheck", "checker", "tests",
"data", filename))
def get_resultlines (self, filename):
"""
Return contents of file, as list of lines without line endings,
ignoring empty lines and lines starting with a hash sign (#).
"""
resultfile = self.get_file(filename+".result")
resultfile = get_file(filename+".result")
d = {'curdir': os.getcwd(),
'datadir': 'linkcheck/checker/tests/data',
'datadir': get_file(),
}
f = codecs.open(resultfile, "r", "iso-8859-15")
resultlines = [line.rstrip('\r\n') % d for line in f \
@@ -144,27 +146,30 @@
f.close()
return resultlines
def file_test (self, filename, confargs=None, cmdline=True):
def file_test (self, filename, confargs=None, assume_local=True):
"""
Check <filename> with expected result in <filename>.result.
"""
url = self.get_file(filename)
url = get_file(filename)
if confargs is None:
confargs = {}
logargs = {'expected': self.get_resultlines(filename)}
consumer = get_test_consumer(confargs, logargs)
aggregate = get_test_aggregate(confargs, logargs)
url_data = linkcheck.checker.get_url_from(
url, 0, consumer, cmdline=cmdline)
consumer.append_url(url_data)
linkcheck.checker.check_urls(consumer)
if consumer.config('logger').diff:
url, 0, aggregate, assume_local=assume_local)
if assume_local:
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
diff = aggregate.config['logger'].diff
if diff:
sep = unicode(os.linesep)
l = [url] + consumer.config('logger').diff
l = [url] + diff
l = sep.join(l)
self.fail(l.encode("iso8859-1", "ignore"))
def direct (self, url, resultlines, fields=None, recursionlevel=0,
confargs=None, cmdline=False):
confargs=None, assume_local=False):
"""
Check url with expected result.
"""
@@ -176,14 +181,17 @@
logargs = {'expected': resultlines}
if fields is not None:
logargs['fields'] = fields
consumer = get_test_consumer(confargs, logargs)
aggregate = get_test_aggregate(confargs, logargs)
url_data = linkcheck.checker.get_url_from(
url, 0, consumer, cmdline=cmdline)
consumer.append_url(url_data)
linkcheck.checker.check_urls(consumer)
if consumer.config('logger').diff:
url, 0, aggregate, assume_local=assume_local)
if assume_local:
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
diff = aggregate.config['logger'].diff
if diff:
sep = unicode(os.linesep)
l = [u"Differences found testing %s" % url]
l.extend(x.rstrip() for x in consumer.config('logger').diff[2:])
l.extend(x.rstrip() for x in diff[2:])
self.fail(sep.join(l).encode("iso8859-1", "ignore"))


@@ -1,7 +1,3 @@
url
cache key None
real url None
error
url file://%(curdir)s/%(datadir)s/misc.html
cache key file://%(curdir)s/%(datadir)s/misc.html
real url file://%(curdir)s/%(datadir)s/misc.html
@@ -21,3 +17,8 @@ url favicon.ico (cached)
cache key file://%(curdir)s/%(datadir)s/favicon.ico
real url file://%(curdir)s/%(datadir)s/favicon.ico
valid
url
cache key None
real url None
error


@@ -36,8 +36,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
url = u"http://localhost:%d/linkcheck/checker/tests/data/" \
u"http.html" % self.port
resultlines = self.get_resultlines("http.html")
self.direct(url, resultlines, recursionlevel=1, cmdline=True)
self.redirect_http_test()
self.direct(url, resultlines, recursionlevel=1, assume_local=True)
self.redirect1_http_test()
self.redirect2_http_test()
self.noproxyfor_test()
finally:
self.stop_server()
@@ -64,9 +65,9 @@
u"original URL was u'http://localhost:%d/redirect1'." % self.port,
u"valid",
]
self.direct(url, resultlines, recursionlevel=0, cmdline=True)
self.direct(url, resultlines, recursionlevel=0, assume_local=True)
def redirect_http_test (self):
def redirect1_http_test (self):
url = u"http://localhost:%d/redirect1" % self.port
nurl = url
rurl = url.replace("redirect", "newurl")
@@ -77,7 +78,9 @@
u"info Redirected to %s." % rurl,
u"error",
]
self.direct(url, resultlines, recursionlevel=0, cmdline=True)
self.direct(url, resultlines, recursionlevel=0, assume_local=True)
def redirect2_http_test (self):
url = u"http://localhost:%d/linkcheck/checker/tests/data/redirect.html" % \
self.port
nurl = url
@@ -94,7 +97,7 @@
u"name Recursive Redirect",
u"valid",
]
self.direct(url, resultlines, recursionlevel=99, cmdline=True)
self.direct(url, resultlines, recursionlevel=99, assume_local=True)
def noproxyfor_test (self):
"""
@@ -113,7 +116,7 @@
u"valid",
]
self.direct(url, resultlines, recursionlevel=0,
confargs=confargs, cmdline=True)
confargs=confargs, assume_local=True)
del os.environ["http_proxy"]


@@ -32,6 +32,7 @@ import traceback
import linkcheck
import linkcheck.linkparse
import linkcheck.checker
import linkcheck.director
import linkcheck.strformat
import linkcheck.containers
import linkcheck.log
@@ -55,7 +56,7 @@ class UrlBase (object):
An URL with additional information like validity etc.
"""
def __init__ (self, base_url, recursion_level, consumer,
def __init__ (self, base_url, recursion_level, aggregate,
parent_url = None, base_ref = None,
line = -1, column = -1, name = u""):
"""
@@ -63,7 +64,7 @@
@param base_url: unquoted and possibly unnormed url
@param recursion_level: on what check level lies the base url
@param consumer: consumer instance
@param aggregate: aggregate instance
@param parent_url: quoted and normed url of parent or None
@param base_ref: quoted and normed url of <base href=""> or None
@param line: line number of url in parent content
@@ -71,13 +72,13 @@
@param name: name of url or empty
"""
self.init(base_ref, base_url, parent_url, recursion_level,
consumer, line, column, name)
aggregate, line, column, name)
self.reset()
self.check_syntax()
def init (self, base_ref, base_url, parent_url, recursion_level,
consumer, line, column, name):
aggregate, line, column, name):
"""
Initialize internal data.
"""
@@ -86,7 +87,7 @@
self.base_url = base_url
self.parent_url = parent_url
self.recursion_level = recursion_level
self.consumer = consumer
self.aggregate = aggregate
self.line = line
self.column = column
self.name = name
@@ -203,6 +204,7 @@
Fill attributes from cache data.
"""
self.result = cache_data["result"]
self.has_result = True
self.warnings.extend(cache_data["warnings"])
self.info.extend(cache_data["info"])
self.valid = cache_data["valid"]
@@ -240,8 +242,8 @@
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Content cache key %r", self.cache_content_key)
# construct cache key
if self.consumer.config("anchorcaching") and \
self.consumer.config("anchors"):
if self.aggregate.config["anchorcaching"] and \
self.aggregate.config["anchors"]:
# do not ignore anchor
parts = self.urlparts[:]
parts[4] = self.anchor
@@ -343,32 +345,28 @@
"""
Main check function for checking this URL.
"""
if self.consumer.config("trace"):
if self.aggregate.config["trace"]:
linkcheck.trace.trace_on()
try:
self.local_check()
self.consumer.checked(self)
except (socket.error, select.error):
self.consumer.interrupted(self)
# on Unix, ctrl-c can raise
# error: (4, 'Interrupted system call')
etype, value = sys.exc_info()[:2]
if etype == 4:
if etype == errno.EINTR:
raise KeyboardInterrupt(value)
else:
raise
except KeyboardInterrupt:
self.consumer.interrupted(self)
raise
except:
self.consumer.interrupted(self)
linkcheck.checker.internal_error()
linkcheck.director.internal_error()
def add_country_info (self):
"""
Try to ask GeoIP database for country info.
"""
country = self.consumer.get_country_name(self.host)
country = linkcheck.cache.geoip.get_country(self.host)
if country is not None:
self.add_info(_("URL is located in %s.") % _(country))
@@ -377,10 +375,11 @@
Local check function can be overridden in subclasses.
"""
assert linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
if self.recursion_level and self.consumer.config('wait'):
wait = self.aggregate.config['wait']
if self.recursion_level and wait:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"sleeping for %d seconds", self.consumer.config('wait'))
time.sleep(self.consumer.config('wait'))
"sleeping for %d seconds", wait)
time.sleep(wait)
t = time.time()
self.set_extern(self.url)
if self.extern[0] and self.extern[1]:
@@ -392,7 +391,7 @@
try:
self.check_connection()
self.add_country_info()
if self.consumer.config("anchors"):
if self.aggregate.config["anchors"]:
self.check_anchors()
except tuple(linkcheck.checker.ExcList):
value = self.handle_exception()
@@ -406,7 +405,7 @@
valid=False)
# check content
warningregex = self.consumer.config("warningregex")
warningregex = self.aggregate.config["warningregex"]
if warningregex and self.valid:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"checking content")
@@ -486,8 +485,8 @@
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"... no, cannot get content.")
return False
if self.consumer.config("recursionlevel") >= 0 and \
self.recursion_level >= self.consumer.config("recursionlevel"):
rec_level = self.aggregate.config["recursionlevel"]
if rec_level >= 0 and self.recursion_level >= rec_level:
assert linkcheck.log.debug(linkcheck.LOG_CHECK,
"... no, maximum recursion level reached.")
return False
@@ -551,7 +550,7 @@
@return: None
"""
for entry in self.consumer.config("externlinks"):
for entry in self.aggregate.config["externlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
@@ -559,7 +558,7 @@
"Extern URL %r", url)
self.extern = (1, entry['strict'])
return
for entry in self.consumer.config("internlinks"):
for entry in self.aggregate.config["internlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
@@ -607,7 +606,7 @@
If a maximum size was given, call this function to check it
against the content size of this url.
"""
maxbytes = self.consumer.config("warnsizebytes")
maxbytes = self.aggregate.config["warnsizebytes"]
if maxbytes is not None and self.dlsize >= maxbytes:
self.add_warning(_("Content size %s is larger than %s.") %
(linkcheck.strformat.strsize(self.dlsize),
@@ -626,7 +625,7 @@
Get tuple (user, password) from configured authentication.
Both user and password can be None if not specified.
"""
for auth in self.consumer.config("authentication"):
for auth in self.aggregate.config["authentication"]:
if auth['pattern'].match(self.url):
return auth['user'], auth['password']
return None, None
@@ -651,10 +650,10 @@
else:
base_ref = h.base_ref
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.consumer, parent_url=self.url,
self.recursion_level+1, self.aggregate, parent_url=self.url,
base_ref=base_ref, line=line, column=column, name=name,
cmdline=False)
self.consumer.append_url(url_data)
assume_local=False)
self.aggregate.urlqueue.put(url_data)
def parse_opera (self):
"""
@@ -674,10 +673,10 @@
url = line[4:]
if url:
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.consumer,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, name=name,
cmdline=False)
self.consumer.append_url(url_data)
assume_local=False)
self.aggregate.urlqueue.put(url_data)
name = ""
def parse_text (self):
@@ -694,10 +693,10 @@
if not line or line.startswith('#'):
continue
url_data = linkcheck.checker.get_url_from(line,
self.recursion_level+1, self.consumer,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno,
cmdline=False)
self.consumer.append_url(url_data)
assume_local=False)
self.aggregate.urlqueue.put(url_data)
def parse_css (self):
"""
@@ -712,10 +711,10 @@
column = mo.start("url")
url = linkcheck.strformat.unquote(mo.group("url").strip())
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.consumer,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, column=column,
cmdline=False)
self.consumer.append_url(url_data)
assume_local=False)
self.aggregate.urlqueue.put(url_data)
def serialized (self):
"""
@@ -758,7 +757,7 @@
@rtype: string
"""
s = self.serialized()
return self.consumer.config('logger').encode(s)
return self.aggregate.config['logger'].encode(s)
def __repr__ (self):
"""


@@ -28,11 +28,6 @@ import linkcheck
import linkcheck.log
import linkcheck.containers
import confparse
try:
import GeoIP
_has_geoip = True
except ImportError:
_has_geoip = False
Version = _linkchecker_configdata.version
AppName = u"LinkChecker"
@@ -83,6 +78,7 @@ class Configuration (dict):
self["internlinks"] = []
self["noproxyfor"] = []
self["interactive"] = False
self["maxqueuesize"] = 0
# on ftp, password is set by Pythons ftplib
self["authentication"] = []
self["proxy"] = urllib.getproxies()
@@ -149,18 +145,6 @@
self["warnsizebytes"] = None
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
self["threads"] = 10
self.init_geoip()
def init_geoip (self):
"""
If GeoIP.dat file is found, initialize a standard geoip DB and
store it in self["geoip"]; else this value will be None.
"""
geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
if _has_geoip and os.path.exists(geoip_dat):
self["geoip"] = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
else:
self["geoip"] = None
def init_logging (self, debug=None):
"""


@@ -23,12 +23,14 @@ import os
import linkcheck.configuration
def get_file (filename):
def get_file (filename=None):
"""
Get file name located within 'data' directory.
"""
return unicode(os.path.join("linkcheck", "configuration", "tests",
"data", filename))
directory = os.path.join("linkcheck", "configuration", "tests", "data")
if filename:
return unicode(os.path.join(directory, filename))
return unicode(directory)
class TestConfig (unittest.TestCase):


@@ -297,40 +297,3 @@ class Rfc2965Cookie (HttpCookie):
# XXX more methods (equality test)
class CookieJar (set):
"""
Cookie storage, implementing the default cookie handling policy for
LinkChecker.
"""
def add_cookies (self, headers, scheme, host, path):
"""
Parse cookie values, add to jar.
"""
to_add = set()
for h in headers.getallmatchingheaders("Set-Cookie"):
# RFC 2109 (Netscape) cookie type
try:
to_add.add(NetscapeCookie(h, scheme, host, path))
except CookieError:
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Invalid cookie header for %s:%s%s: %r", scheme, host, path, h)
for h in headers.getallmatchingheaders("Set-Cookie2"):
# RFC 2965 cookie type
try:
to_add.add(Rfc2965Cookie(h, scheme, host, path))
except CookieError:
assert linkcheck.log.debug(linkcheck.LOG_CACHE,
"Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h)
for x in to_add:
self.add(x)
return to_add
def remove_expired (self):
"""
Remove expired cookies from jar.
"""
to_remove = [x for x in self if not x.check_expired()]
return self.difference_update(to_remove)


@@ -31,8 +31,7 @@ import linkcheck.url
import linkcheck.i18n
import linkcheck.strformat
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.director
_logfile = None
_supported_langs = ('de', 'fr', 'nl', 'C')
@@ -99,13 +98,16 @@ def checklink (out=sys.stdout, form=None, env=os.environ):
config["externlinks"].append(
linkcheck.get_link_pat("^%s$" % linkcheck.url.safe_url_pattern))
config["externlinks"].append(linkcheck.get_link_pat(".*", strict=True))
# start checking
aggregate = linkcheck.director.get_aggregate(config)
cache = linkcheck.checker.cache.Cache()
consumer = linkcheck.checker.consumer.Consumer(config, cache)
# start checking
url = form["url"].value
url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=False)
consumer.append_url(url_data)
linkcheck.checker.check_urls(consumer)
url_data = linkcheck.checker.get_url_from(url, 0, aggregate,
assume_local=False)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
def get_host_name (form):


@@ -20,18 +20,16 @@ Test url build method from url data objects.
import unittest
import linkcheck.configuration
import linkcheck.director
import linkcheck.checker.httpurl
import linkcheck.checker.cache
import linkcheck.checker.consumer
def get_test_consumer ():
def get_test_aggregate ():
"""
Initialize a test configuration object.
"""
config = linkcheck.configuration.Configuration()
config['logger'] = config.logger_new('none')
cache = linkcheck.checker.cache.Cache()
return linkcheck.checker.consumer.Consumer(config, cache)
return linkcheck.director.get_aggregate(config)
class TestUrlBuild (unittest.TestCase):
@@ -43,9 +41,9 @@ class TestUrlBuild (unittest.TestCase):
parent_url = "http://localhost:8001/linkcheck/checker/tests/data/http.html"
base_url = "http://"
recursion_level = 0
consumer = get_test_consumer()
aggregate = get_test_aggregate()
o = linkcheck.checker.httpurl.HttpUrl(base_url, recursion_level,
consumer, parent_url=parent_url)
aggregate, parent_url=parent_url)
o.build_url()
self.assertEquals(o.url, 'http://')


@@ -38,8 +38,7 @@ optparse._ = _
import linkcheck.log
import linkcheck.i18n
import linkcheck.checker
import linkcheck.checker.cache
import linkcheck.checker.consumer
import linkcheck.director
import linkcheck.configuration
import linkcheck.fileutil
import linkcheck.strformat
@@ -654,14 +653,15 @@ if len(args) <= 0:
else:
linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or URLs given"))
# initialize the cache and the consumer model
cache = linkcheck.checker.cache.Cache()
consumer = linkcheck.checker.consumer.Consumer(config, cache)
# prepare checking queue
aggregate = linkcheck.director.get_aggregate(config)
if options.trace:
config["trace"] = True
import linkcheck.trace
linkcheck.trace.trace_filter([r"^linkcheck"])
linkcheck.trace.trace_on()
# add urls to queue
get_url_from = linkcheck.checker.get_url_from
for url in args:
if url.lower().startswith("www."):
# syntactic sugar
@@ -669,14 +669,14 @@ for url in args:
elif url.lower().startswith("ftp."):
# syntactic sugar
url = "ftp://%s" % url
url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True)
consumer.append_url(url_data)
############################# check the URLs ################################
url_data = get_url_from(url, 0, aggregate, assume_local=True)
linkcheck.add_intern_pattern(url_data, config)
aggregate.urlqueue.put(url_data)
# set up profiling/psyco
if do_profile and not has_profile:
linkcheck.log.warn(linkcheck.LOG_CMDLINE,
_("The `profile' Python module is not installed,"
" therefore the --profile option is disabled."))
if do_profile and has_profile:
run = True
if os.path.exists(_profile):
@@ -690,7 +690,7 @@ if do_profile and has_profile:
run = False
if run:
import profile
profile.run("linkcheck.checker.check_urls(consumer)", _profile)
profile.run("manager.check_urls()", _profile)
elif options.psyco:
try:
import psyco
@@ -705,8 +705,8 @@ elif options.psyco:
except ImportError:
# no psyco available, just ignore
pass
linkcheck.checker.check_urls(consumer)
#############################################################################
# start checking
linkcheck.director.check_urls(aggregate)
# interactive input end
if config['interactive']:


@@ -537,7 +537,8 @@ o a (Fast)CGI web interface (requires HTTP server)
'clean': MyClean,
},
packages = ['linkcheck', 'linkcheck.logger', 'linkcheck.checker',
'linkcheck.configuration',
'linkcheck.director', 'linkcheck.configuration',
'linkcheck.cache',
'linkcheck.dns', 'linkcheck.dns.rdtypes',
'linkcheck.dns.rdtypes.ANY', 'linkcheck.dns.rdtypes.IN',
'linkcheck.HtmlParser', 'linkcheck.ftpparse', ],