Mirror of https://github.com/Hopiu/linkchecker.git, synced 2026-05-15 18:13:09 +00:00

Replace the old threading algorithm with a new one based on Queue.Queue and consumer threads

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3146 e7d03fd6-7b0d-0410-9947-9c21f3af8025

parent d05c68ef74
commit f002c5f965

28 changed files with 684 additions and 326 deletions
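The heart of the change in one minimal, hedged sketch (illustrative only; check_url and the sentinel shutdown are assumptions, not LinkChecker's API; task_done()/join() are native from Python 2.5 on, and this commit backports them in linkcheck/cache/urlqueue.py below):

import threading
import Queue  # renamed to `queue` in Python 3

def check_url(url):
    print "checking", url  # hypothetical stand-in for the real check

def worker(urlqueue):
    while True:
        url = urlqueue.get()
        if url is None:  # sentinel tells the worker to exit
            break
        try:
            check_url(url)
        finally:
            urlqueue.task_done()

urlqueue = Queue.Queue()
workers = [threading.Thread(target=worker, args=(urlqueue,))
           for _ in xrange(10)]  # a constant number of consumer threads
for t in workers:
    t.start()
urlqueue.put("http://example.com/")
urlqueue.join()            # block until every queued URL is processed
for t in workers:
    urlqueue.put(None)     # one sentinel per worker
for t in workers:
    t.join()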
@@ -14,6 +14,12 @@
 Changed: linkcheck/HtmlParser/htmllex.[lc],
   linkcheck/tests/test_parser.py
 
+* Revamp the threading algorithm by using a URL queue, with a
+  constant number of consumer threads called 'workers'.
+  This fixes the remaining "deque mutated during iteration" errors.
+  Type: feature
+  Changed: *.py
+
 3.4 "The Chumscrubbers" (released 4.2.2006)
 
 * Ignore decoding errors when retrieving the robots.txt URL.
@@ -37,6 +37,7 @@ include doc/de/*.1
include doc/fr/*.1
include doc/Makefile doc/rest2htmlnav
recursive-include linkcheck/checker/tests/data *.txt *.html *.result *.asc *.css *.ico
recursive-include linkcheck/configuration/tests/data *.ini
include linkcheck/tests/*.py
include linkcheck/checker/tests/*.py
include linkcheck/dns/tests/*.py
TODO | 10

@@ -1,7 +1,11 @@
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/483752
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/475160
- Improved print_status

- use format_time from quodlibet for times
- Ctrl-C is not really working.

- To limit the used memory, put a maximum size on the URL queue
  (e.g. 20000 URLs). If reached, the worker calling queue.put() will
  wait for another worker to call queue.get() before continuing.
  Problem: deadlock when all workers called queue.put().

- [FEATURE] postmortem debugging with pdb.pm()
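The deadlock the queue-size TODO item describes is concrete: with a bounded queue, a worker that discovers new links blocks in put(), and if every worker blocks there, nobody is left to call get(). A hedged sketch of one conventional escape hatch, a put() timeout; this is an illustration, not something this commit implements:

import Queue  # Python 2 stdlib

urlqueue = Queue.Queue(maxsize=20000)  # the cap suggested in the TODO

def enqueue(url):
    # Workers must not block forever on a full queue, or all of them
    # can end up stuck in put() with nobody left to call get().
    try:
        urlqueue.put(url, timeout=30)
    except Queue.Full:
        pass  # e.g. drop or log the URL instead of deadlocking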
@@ -58,6 +58,17 @@ class LinkCheckerError (Exception):
    pass


+def add_intern_pattern (url_data, config):
+    """
+    Add intern URL regex to config.
+    """
+    pat = url_data.get_intern_pattern()
+    if pat:
+        assert linkcheck.log.debug(LOG_CHECK,
+            "Add intern pattern %r from command line", pat)
+        config['internlinks'].append(get_link_pat(pat))
+
+
def get_link_pat (arg, strict=False):
    """
    Get a link pattern matcher for intern/extern links.
linkcheck/cache/__init__.py | 19 (new file)

@@ -0,0 +1,19 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and provide cached data during checking in a thread-safe manner.
"""
linkcheck/cache/connection.py | 111 (new file)

@@ -0,0 +1,111 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2005-2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and retrieve open connections.
"""

import time
import threading
from linkcheck.decorators import synchronized

# lock for connection caching
_lock = threading.Lock()


class ConnectionPool (object):
    """
    Thread-safe cache, storing a set of connections for URL retrieval.
    """

    def __init__ (self):
        """
        Initialize an empty connection dictionary which will have entries
        of the form::
            key -> [connection, status, expiration time]

        Connection can be any open connection object (HTTP, FTP, ...).
        Status is either 'available' or 'busy'.
        Expiration time is the point of time in seconds when this
        connection will be timed out.

        The identifier key is usually a tuple (type, host, user, pass),
        but it can be any immutable Python object.
        """
        # open connections
        # {(type, host, user, pass) -> [connection, status, expiration time]}
        self.connections = {}

    @synchronized(_lock)
    def add (self, key, conn, timeout):
        """
        Add connection to the pool with given identifier key and timeout
        in seconds.
        """
        self.connections[key] = [conn, 'available', time.time() + timeout]

    @synchronized(_lock)
    def get (self, key):
        """
        Get an open connection if available, waiting at most 30 seconds.

        @return: open connection object, or None if none is available
        @rtype: None or FTPConnection or HTTP(S)Connection
        """
        if key not in self.connections:
            # not found
            return None
        conn_data = self.connections[key]
        t = time.time()
        if t > conn_data[2]:
            # timed out
            try:
                conn_data[1].close()  # XXX looks like a slip: conn_data[0]
                                      # holds the connection object, [1] is
                                      # the status string
            except:
                # ignore close errors
                pass
            del self.connections[key]
            return None
        # wait at most 300*0.1=30 seconds for connection to become available
        for dummy in xrange(300):
            if conn_data[1] != 'busy':
                conn_data[1] = 'busy'
                conn_data[2] = t
                return conn_data[0]
            time.sleep(0.1)
        # connection is in use
        return None

    @synchronized(_lock)
    def release (self, key):
        """
        Mark an open and reusable connection as available.
        """
        if key in self.connections:
            self.connections[key][1] = 'available'

    @synchronized(_lock)
    def expire_connections (self):
        """
        Remove expired connections from this pool.
        """
        t = time.time()
        to_delete = []
        for key, conn_data in self.connections.iteritems():
            if conn_data[1] == 'available' and t > conn_data[2]:
                to_delete.append(key)
        for key in to_delete:
            del self.connections[key]
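For orientation, a hedged sketch of how a worker thread would use this pool, mirroring the get/add calls in ftpurl.py and httpurl.py further down; the HTTP connection itself is just an example:

import httplib  # Python 2; `http.client` in Python 3

pool = ConnectionPool()
key = ("http", "example.com", None, None)   # (type, host, user, pass)
conn = pool.get(key)
if conn is None:
    conn = httplib.HTTPConnection("example.com")
# ... issue requests on conn ...
# afterwards, cache the (persistent) connection for other workers:
pool.add(key, conn, 60)  # expire after 60 seconds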
linkcheck/cache/cookie.py | 73 (new file)

@@ -0,0 +1,73 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Store and retrieve cookies.
"""
import threading
from linkcheck.decorators import synchronized
import linkcheck
import linkcheck.log
import linkcheck.cookies

# lock for caching
_lock = threading.Lock()


class CookieJar (object):
    """
    Cookie storage, implementing the default cookie handling policy for
    LinkChecker.
    """

    def __init__ (self):
        self.cache = {}

    @synchronized(_lock)
    def add (self, headers, scheme, host, path):
        """
        Parse cookie values, add to cache.
        """
        jar = set()
        for h in headers.getallmatchingheaders("Set-Cookie"):
            # RFC 2109 (Netscape) cookie type
            try:
                c = linkcheck.cookies.NetscapeCookie(h, scheme, host, path)
                jar.add(c)
            except linkcheck.cookies.CookieError:
                assert linkcheck.log.debug(linkcheck.LOG_CACHE,
                    "Invalid cookie header for %s:%s%s: %r", scheme, host, path, h)
        for h in headers.getallmatchingheaders("Set-Cookie2"):
            # RFC 2965 cookie type
            try:
                c = linkcheck.cookies.Rfc2965Cookie(h, scheme, host, path)
                jar.add(c)
            except linkcheck.cookies.CookieError:
                assert linkcheck.log.debug(linkcheck.LOG_CACHE,
                    "Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h)
        self.cache[host] = jar
        return jar

    @synchronized(_lock)
    def get (self, scheme, host, port, path):
        """
        Cookie cache getter function.
        """
        assert linkcheck.log.debug(linkcheck.LOG_CACHE,
            "Get cookies for host %r path %r", host, path)
        jar = self.cache.setdefault(host, set())
        return [x for x in jar if x.check_expired() and \
                x.is_valid_for(scheme, host, port, path)]
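A hedged sketch of the jar in use; rfc822.Message stands in for real response headers and the header value is made up:

import rfc822
import StringIO

from linkcheck.cache.cookie import CookieJar

raw = "Set-Cookie: SID=42; Path=/\r\n\r\n"
headers = rfc822.Message(StringIO.StringIO(raw))
jar = CookieJar()
jar.add(headers, "http", "example.com", "/")
# later, collect the cookies valid for a request to the same host:
for c in jar.get("http", "example.com", 80, "/index.html"):
    print c.client_header_name(), c.client_header_value()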
@@ -15,23 +15,44 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 """
 GeoIP wrapper.
 Store and retrieve country names for IPs.
 """
+import os
+import threading
+from linkcheck.decorators import synchronized
 
-def get_country (gi, host):
+# I don't know if the geoip library is already thread-safe, but
+# we take no risks here.
+_lock = threading.Lock()
+
+# initialize GeoIP database
+geoip = None
+try:
+    import GeoIP
+    geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
+    if os.name == 'posix' and os.path.exists(geoip_dat):
+        geoip = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
+    del geoip_dat
+except ImportError:
+    pass
+
+
+@synchronized(_lock)
+def get_country (host):
     """
     Get translated country name.
 
     @return: country string or None
     """
-    c = gi.country_code_by_name(host)
+    if geoip is None:
+        return None
+    c = geoip.country_code_by_name(host)
     if c and c in countries:
         return "%s, %s" % (c, countries[c])
     return None
 
 
 # GeoIP country map with {short name -> translated full name} entries
 
 countries = {
     "AP": "Asia/Pacific Region",
     "EU": "Europe",
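The call shape changes for callers: the old code needed a gi database handle, while the new module-level function takes only the host and degrades to None when GeoIP is unavailable. Roughly:

from linkcheck.cache.geoip import get_country  # module path per this commit

print get_country("www.example.de")  # e.g. "DE, Germany", or None when the
                                     # GeoIP bindings or data file are absent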
linkcheck/cache/robots_txt.py | 52 (new file)

@@ -0,0 +1,52 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Cache robots.txt contents.
"""
import threading
from linkcheck.decorators import synchronized
import linkcheck.robotparser2
import linkcheck.configuration


# lock for caching
_lock = threading.Lock()


class RobotsTxt (object):
    """
    Thread-safe cache of downloaded robots.txt files.
    format: {cache key (string) -> robots.txt content (RobotFileParser)}
    """

    def __init__ (self):
        self.cache = {}

    @synchronized(_lock)
    def allows_url (self, roboturl, url, user, password):
        """
        Ask robots.txt allowance.
        """
        if roboturl not in self.cache:
            rp = linkcheck.robotparser2.RobotFileParser(
                user=user, password=password)
            rp.set_url(roboturl)
            rp.read()
            self.cache[roboturl] = rp
        else:
            rp = self.cache[roboturl]
        return rp.can_fetch(linkcheck.configuration.UserAgent, url)
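A hedged usage sketch; the allows_url() call mirrors how httpurl.py consults this cache further down, and the URLs are illustrative:

from linkcheck.cache.robots_txt import RobotsTxt

robots = RobotsTxt()
# The first call downloads and parses robots.txt; later calls for the
# same roboturl hit the cache.
ok = robots.allows_url("http://example.com/robots.txt",
                       "http://example.com/private/page.html",
                       None, None)  # no HTTP auth
print ok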
linkcheck/cache/urlqueue.py | 210 (new file)

@@ -0,0 +1,210 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2006 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Handle a queue of URLs to check.
"""
import threading
import Queue
import time
import linkcheck
import linkcheck.log


class UrlQueue (Queue.Queue):
    """
    A queue supporting several consumer tasks. The task_done() idea is
    from the Python 2.5 Subversion repository.
    """

    def __init__ (self, maxsize=0):
        """
        Initialize the queue state and task counters.
        """
        Queue.Queue.__init__(self, maxsize=maxsize)
        self.all_tasks_done = threading.Condition(self.mutex)
        self.unfinished_tasks = 0
        self.finished_tasks = 0
        self.in_progress = {}
        self.checked = {}
        self.shutdown = False

    def get (self):
        """
        Get first not-in-progress url from the queue and
        return it. If no such url is available return None. The
        url might be already cached.
        """
        self.not_empty.acquire()
        try:
            while self._empty():
                self.not_empty.wait()
            url_data = self._get()
            key = url_data.cache_url_key
            if url_data.has_result:
                # Already checked and copied from cache.
                pass
            elif key in self.checked:
                # Already checked; copy result. And even ignore
                # the case where url happens to be in_progress.
                url_data.copy_from_cache(self.checked[key])
            elif key in self.in_progress:
                # It's being checked currently; put it back in the queue.
                Queue.Queue._put(self, url_data)
                url_data = None
            else:
                self.in_progress[key] = url_data
            self.not_full.notify()
            return url_data
        finally:
            self.not_empty.release()

    def _put (self, url_data):
        """
        Put URL in queue, increase number of unfinished tasks.
        """
        if self.shutdown:
            # don't accept more URLs
            return
        key = url_data.cache_url_key
        if key in self.checked:
            # Put at beginning of queue to get consumed quickly.
            url_data.copy_from_cache(self.checked[key])
            self.queue.appendleft(url_data)
        else:
            self.queue.append(url_data)
        self.unfinished_tasks += 1

    def task_done (self, url_data):
        """
        Indicate that a formerly enqueued task is complete.

        Used by Queue consumer threads. For each get() used to fetch a task,
        a subsequent call to task_done() tells the queue that the processing
        on the task is complete.

        If a join() is currently blocking, it will resume when all items
        have been processed (meaning that a task_done() call was received
        for every item that had been put() into the queue).

        Raises a ValueError if called more times than there were items
        placed in the queue.
        """
        self.all_tasks_done.acquire()
        try:
            if url_data is not None:
                key = url_data.cache_url_key
                if key is not None and key not in self.checked:
                    self._cache_url(key, url_data)
            self.finished_tasks += 1
            unfinished = self.unfinished_tasks - 1
            if unfinished <= 0:
                if unfinished < 0:
                    raise ValueError('task_done() called too many times')
                self.all_tasks_done.notifyAll()
            self.unfinished_tasks = unfinished
        finally:
            self.all_tasks_done.release()

    def _cache_url (self, key, url_data):
        """
        Put URL result data into cache.
        """
        assert linkcheck.log.debug(linkcheck.LOG_CACHE,
            "Caching %r", key)
        assert key in self.in_progress, \
            "%r not in %s" % (key, self.in_progress)
        del self.in_progress[key]
        data = url_data.get_cache_data()
        self.checked[key] = data
        # check for aliases (eg. through HTTP redirections)
        if hasattr(url_data, "aliases"):
            data = url_data.get_alias_cache_data()
            for key in url_data.aliases:
                if key in self.checked or key in self.in_progress:
                    continue
                assert linkcheck.log.debug(linkcheck.LOG_CACHE,
                    "Caching alias %r", key)
                self.checked[key] = data

    def join (self, timeout=None):
        """Blocks until all items in the Queue have been gotten and processed.

        The count of unfinished tasks goes up whenever an item is added to the
        queue. The count goes down whenever a consumer thread calls task_done()
        to indicate the item was retrieved and all work on it is complete.

        When the count of unfinished tasks drops to zero, join() unblocks.
        """
        self.all_tasks_done.acquire()
        try:
            if timeout is None:
                while self.unfinished_tasks:
                    self.all_tasks_done.wait()
            else:
                if timeout < 0:
                    raise ValueError("'timeout' must be a positive number")
                endtime = time.time() + timeout
                while self.unfinished_tasks:
                    remaining = endtime - time.time()
                    if remaining <= 0.0:
                        return
                    self.all_tasks_done.wait(remaining)
        finally:
            self.all_tasks_done.release()

    def do_shutdown (self):
        """
        Shutdown the queue by not accepting any more URLs.
        """
        self.mutex.acquire()
        try:
            unfinished = self.unfinished_tasks - len(self.queue)
            self.queue.clear()
            if unfinished <= 0:
                if unfinished < 0:
                    raise ValueError('shutdown is in error')
                self.all_tasks_done.notifyAll()
            self.unfinished_tasks = unfinished
            self.shutdown = True
        finally:
            self.mutex.release()

    def status (self):
        """
        Get tuple (finished tasks, unfinished tasks, queue size).
        """
        self.mutex.acquire()
        try:
            return (self.finished_tasks, self.unfinished_tasks, len(self.queue))
        finally:
            self.mutex.release()

    def checked_redirect (self, redirect, url_data):
        """
        Check if redirect is already in cache. Used for URL redirections
        to avoid double checking of already cached URLs.
        If the redirect URL is found in the cache, the result data is
        already copied.
        """
        self.mutex.acquire()
        try:
            if redirect in self.checked:
                url_data.copy_from_cache(self.checked[redirect])
                return True
            return False
        finally:
            self.mutex.release()
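A hedged sketch of the consumer loop this queue is built for (the director code that actually drives it is not part of this diff; check() is a hypothetical stand-in, and termination logic is omitted):

import time

def worker(urlqueue):
    # One of the constant number of 'worker' threads.
    while True:
        url_data = urlqueue.get()
        if url_data is None:
            # The only queued URL is in progress elsewhere and was
            # requeued; back off briefly and retry.
            time.sleep(0.1)
            continue
        try:
            if not url_data.has_result:
                check(url_data)  # hypothetical: perform the actual check
        finally:
            # Pair every successful get() with task_done() so that
            # join() can detect completion.
            urlqueue.task_done(url_data)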
@@ -18,19 +18,14 @@
 Main functions for link checking.
 """
 
-import time
-import sys
-import os
 import cgi
 import socket
-import codecs
-import traceback
 import select
 import re
 import urllib
 import nntplib
 import ftplib
 
 import linkcheck.httplib2
 import linkcheck.strformat
 import linkcheck.dns.exception

@@ -153,110 +148,6 @@ acap # application configuration access protocol
 
 ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
 
-_encoding = linkcheck.i18n.default_encoding
-stderr = codecs.getwriter(_encoding)(sys.stderr, errors="ignore")
-
-def internal_error ():
-    """
-    Print internal error message to stderr.
-    """
-    print >> stderr, os.linesep
-    print >> stderr, _("""********** Oops, I did it again. *************
-
-You have found an internal error in LinkChecker. Please write a bug report
-at http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913
-or send mail to %s and include the following information:
-- the URL or file you are testing
-- your commandline arguments and/or configuration.
-- the output of a debug run with option "-Dall" of the executed command
-- the system information below.
-
-Disclosing some of the information above due to privacy reasons is ok.
-I will try to help you nonetheless, but you have to give me something
-I can work with ;) .
-""") % linkcheck.configuration.Email
-    etype, value = sys.exc_info()[:2]
-    print >> stderr, etype, value
-    traceback.print_exc()
-    print_app_info()
-    print >> stderr, os.linesep, \
-        _("******** LinkChecker internal error, over and out ********")
-    sys.exit(1)
-
-
-def print_app_info ():
-    """
-    Print system and application info to stderr.
-    """
-    print >> stderr, _("System info:")
-    print >> stderr, linkcheck.configuration.App
-    print >> stderr, _("Python %s on %s") % (sys.version, sys.platform)
-    for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"):
-        value = os.getenv(key)
-        if value is not None:
-            print >> stderr, key, "=", repr(value)
-
-
-def check_urls (consumer):
-    """
-    Main check function; checks all configured URLs until interrupted
-    with Ctrl-C. If you call this function more than once, you can specify
-    different configurations with the consumer parameter.
-
-    @param consumer: an object where all runtime-dependent options are
-        stored
-    @type consumer: linkcheck.consumer.Consumer
-    @return: None
-    """
-    try:
-        _check_urls(consumer)
-    except (KeyboardInterrupt, SystemExit):
-        consumer.abort()
-    except:
-        consumer.abort()
-        internal_error()
-
-
-def _check_urls (consumer):
-    """
-    Checks all configured URLs. Prints status information, calls logger
-    methods.
-
-    @param consumer: an object where all runtime-dependent options are
-        stored
-    @type consumer: linkcheck.consumer.Consumer
-    @return: None
-    """
-    start_time = time.time()
-    status_time = start_time
-    while not consumer.finished():
-        url_data = consumer.incoming_get_url()
-        if url_data is None:
-            # wait for incoming queue to fill
-            time.sleep(0.1)
-        elif url_data.cached:
-            # was cached -> can be logged
-            consumer.log_url(url_data)
-        else:
-            # go check this url
-            if url_data.parent_url and not \
-               linkcheck.url.url_is_absolute(url_data.base_url):
-                name = url_data.parent_url
-            else:
-                name = u""
-            if url_data.base_url:
-                name += url_data.base_url
-            if not name:
-                name = None
-            consumer.check_url(url_data, name)
-        if consumer.config('status'):
-            curtime = time.time()
-            if (curtime - status_time) > 5:
-                consumer.print_status(curtime, start_time)
-                status_time = curtime
-    consumer.end_log_output()
 
 
 # file extensions we can parse recursively
 extensions = {
     "html": re.compile(r'(?i)\.s?html?$'),

@@ -298,9 +189,9 @@ def absolute_url (base_url, base_ref, parent_url):
     return u""
 
 
-def get_url_from (base_url, recursion_level, consumer,
+def get_url_from (base_url, recursion_level, aggregate,
                   parent_url=None, base_ref=None, line=0, column=0,
-                  name=u"", cmdline=False):
+                  name=u"", assume_local=False):
     """
     Get url data from given base data.
 

@@ -308,8 +199,8 @@ def get_url_from (base_url, recursion_level, consumer,
     @type base_url: string or None
     @param recursion_level: current recursion level
     @type recursion_level: number
-    @param consumer: consumer object
-    @type consumer: linkcheck.checker.consumer.Consumer
+    @param aggregate: aggregate object
+    @type aggregate: linkcheck.checker.aggregate.Consumer
     @param parent_url: parent url
     @type parent_url: string or None
     @param base_ref: base url from <base> tag

@@ -329,7 +220,14 @@ def get_url_from (base_url, recursion_level, consumer,
     base_ref = linkcheck.strformat.unicode_safe(base_ref)
     name = linkcheck.strformat.unicode_safe(name)
     url = absolute_url(base_url, base_ref, parent_url).lower()
     # test scheme
+    klass = get_urlclass_from(url, assume_local)
+    return klass(base_url, recursion_level, aggregate,
+                 parent_url=parent_url, base_ref=base_ref,
+                 line=line, column=column, name=name)
+
+
+def get_urlclass_from (url, assume_local):
+    """Return checker class for given URL."""
     if url.startswith("http:"):
         klass = linkcheck.checker.httpurl.HttpUrl
     elif url.startswith("ftp:"):

@@ -351,24 +249,13 @@ def get_url_from (base_url, recursion_level, consumer,
     elif ignored_schemes_re.search(url):
         # ignored url
         klass = linkcheck.checker.ignoredurl.IgnoredUrl
-    elif cmdline:
-        # assume local file on command line
+    elif assume_local:
+        # assume local file
         klass = linkcheck.checker.fileurl.FileUrl
     else:
         # error url, no further checking, just log this
         klass = linkcheck.checker.errorurl.ErrorUrl
-    url_data = klass(base_url, recursion_level, consumer,
-                     parent_url=parent_url, base_ref=base_ref,
-                     line=line, column=column, name=name)
-    if cmdline:
-        # add intern URL regex to config for every URL that was given
-        # on the command line
-        pat = url_data.get_intern_pattern()
-        assert linkcheck.log.debug(linkcheck.LOG_CMDLINE,
-            "Add intern pattern %r from command line", pat)
-        if pat:
-            consumer.config_append('internlinks', linkcheck.get_link_pat(pat))
-    return url_data
+    return klass
 
 
 def get_index_html (urls):
@@ -23,11 +23,13 @@ import os
 import time
 import urlparse
 import urllib
+import urllib2
 
 import urlbase
 import linkcheck
 import linkcheck.log
 import linkcheck.checker
+import linkcheck.fileutil
 
 # if file extension lookup was unsuccessful, look at the content
 contents = {

@@ -83,7 +85,7 @@ class FileUrl (urlbase.UrlBase):
     """
 
     def init (self, base_ref, base_url, parent_url, recursion_level,
-              consumer, line, column, name):
+              aggregate, line, column, name):
         """
         Besides the usual initialization the URL is normed according
         to the platform:

@@ -91,7 +93,7 @@ class FileUrl (urlbase.UrlBase):
         - under Windows platform the drive specifier is normed
         """
         super(FileUrl, self).init(base_ref, base_url, parent_url,
-                                  recursion_level, consumer, line, column, name)
+                                  recursion_level, aggregate, line, column, name)
         if self.base_url is None:
             return
         base_url = self.base_url

@@ -129,7 +131,8 @@ class FileUrl (urlbase.UrlBase):
         if self.is_directory():
             self.set_result(_("directory"))
         else:
-            super(FileUrl, self).check_connection()
+            url = linkcheck.fileutil.pathencode(self.url)
+            self.url_connection = urllib2.urlopen(url)
         self.check_case_sensitivity()
 
     def check_case_sensitivity (self):

@@ -147,7 +150,6 @@ class FileUrl (urlbase.UrlBase):
                 "system path %r. You should always use "
                 "the system path in URLs.") % (path, realpath),
                 tag="file-system-path")
-            pass
 
     def get_content (self):
         """

@@ -208,7 +210,7 @@ class FileUrl (urlbase.UrlBase):
         path = self.urlparts[2]
         if os.name == 'nt':
             path = prepare_urlpath_for_nt(path)
-        return urllib.url2pathname(path)
+        return linkcheck.fileutil.pathencode(urllib.url2pathname(path))
 
     def is_directory (self):
         """
@@ -53,12 +53,12 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         order: login, changing directory, list the file.
         """
         # proxy support (we support only http)
-        self.set_proxy(self.consumer.config("proxy").get(self.scheme))
+        self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
         if self.proxy:
             # using a (HTTP) proxy
             http = httpurl.HttpUrl(self.base_url,
                                    self.recursion_level,
-                                   self.consumer,
+                                   self.aggregate,
                                    parent_url=self.parent_url,
                                    base_ref=self.base_ref,
                                    line=self.line,

@@ -87,7 +87,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # ready to connect
         _user, _password = self.get_user_password()
         key = ("ftp", self.urlparts[1], _user, _password)
-        conn = self.consumer.get_connection(key)
+        conn = self.aggregate.connections.get(key)
         if conn is not None and conn.sock is not None:
             # reuse cached FTP connection
             self.url_connection = conn

@@ -248,6 +248,6 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # add to cached connections
         _user, _password = self.get_user_password()
         key = ("ftp", self.urlparts[1], _user, _password)
-        cache_add = self.consumer.add_connection
+        cache_add = self.aggregate.connections.add
         cache_add(key, self.url_connection, DEFAULT_TIMEOUT_SECS)
         self.url_connection = None
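The same mechanical substitution repeats through the remaining files: consumer method calls become plain attribute and dict access on the shared aggregate. A runnable toy illustration (these are not LinkChecker classes):

class OldConsumer(object):
    def __init__(self, cfg):
        self._cfg = cfg
    def config(self, key):
        return self._cfg[key]

class NewAggregate(object):
    def __init__(self, cfg):
        self.config = cfg  # plain dict, no wrapper method

cfg = {"proxy": {"http": "http://proxy.example.com:3128/"}}
print OldConsumer(cfg).config("proxy").get("http")   # old style
print NewAggregate(cfg).config["proxy"].get("http")  # new style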
@@ -129,8 +129,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         roboturl = self.get_robots_txt_url()
         user, password = self.get_user_password()
-        return self.consumer.robots_txt_allows_url(roboturl, url,
-                                                   user, password)
+        return self.aggregate.robots_txt.allows_url(roboturl, url,
+                                                    user, password)
 
     def check_connection (self):
         """

@@ -150,7 +150,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         valid request
         """
         # set the proxy, so a 407 status after this is an error
-        self.set_proxy(self.consumer.config("proxy").get(self.scheme))
+        self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
         # initialize check data
         self.headers = None
         self.auth = None

@@ -360,19 +360,19 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
                 tag="http-moved-permanent")
             self.has301status = True
         # check cache again on the changed URL
-        if self.consumer.checked_redirect(redirected, self):
+        if self.aggregate.urlqueue.checked_redirect(redirected, self):
             return -1, response
         # in case of changed scheme make new URL object
         if self.urlparts[0] != self.scheme:
             newobj = linkcheck.checker.get_url_from(
-                redirected, self.recursion_level, self.consumer,
+                redirected, self.recursion_level, self.aggregate,
                 parent_url=self.parent_url, base_ref=self.base_ref,
                 line=self.line, column=self.column, name=self.name,
-                cmdline=False)
+                assume_local=False)
             newobj.warnings = self.warnings
             newobj.info = self.info
             # append new object to queue
-            self.consumer.append_url(newobj)
+            self.aggregate.append_url(newobj)
             # pretend to be finished and logged
             return -1, response
         # new response data

@@ -406,14 +406,14 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
                 linkcheck.strformat.unicode_safe(response.reason),
                 tag="http-empty-content")
         # store cookies for valid links
-        if self.consumer.config('cookies'):
+        if self.aggregate.config['cookies']:
             for c in self.cookies:
                 self.add_info(_("Store cookie: %s.") % c)
             try:
-                out = self.consumer.store_cookies(self.headers,
-                                                  self.urlparts[0],
-                                                  self.urlparts[1],
-                                                  self.urlparts[2])
+                out = self.aggregate.cookies.add(self.headers,
+                                                 self.urlparts[0],
+                                                 self.urlparts[1],
+                                                 self.urlparts[2])
                 for h in out:
                     self.add_info(linkcheck.strformat.unicode_safe(h))
             except Cookie.CookieError, msg:

@@ -471,13 +471,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
             linkcheck.configuration.UserAgent)
         self.url_connection.putheader("Accept-Encoding",
                            "gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
-        if self.consumer.config('cookies'):
+        if self.aggregate.config['cookies']:
             scheme = self.urlparts[0]
             host = self.urlparts[1]
             port = linkcheck.url.default_ports.get(scheme, 80)
             host, port = urllib.splitnport(host, port)
             path = self.urlparts[2]
-            self.cookies = self.consumer.get_cookies(scheme, host, port, path)
+            self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
             for c in self.cookies:
                 name = c.client_header_name()
                 value = c.client_header_value()

@@ -505,7 +505,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         _user, _password = self.get_user_password()
         key = (scheme, self.urlparts[1], _user, _password)
-        conn = self.consumer.get_connection(key)
+        conn = self.aggregate.connections.get(key)
         if conn is not None:
             assert linkcheck.log.debug(linkcheck.LOG_CHECK,
                 "reuse cached HTTP(S) connection %s", conn)

@@ -634,7 +634,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         # add to cached connections
         _user, _password = self.get_user_password()
         key = ("http", self.urlparts[1], _user, _password)
-        cache_add = self.consumer.add_connection
+        cache_add = self.aggregate.connections.add
         # note: only cache the connection when it is persistent
         # and all pending content has been received
         if not self.persistent or not self.has_content or \
@@ -40,7 +40,7 @@ class NntpUrl (urlbase.UrlBase):
         Connect to NNTP server and try to request the URL article
         resource (if specified).
         """
-        nntpserver = self.host or self.consumer.config("nntpserver")
+        nntpserver = self.host or self.aggregate.config["nntpserver"]
         if not nntpserver:
             self.add_warning(
                 _("No NNTP server was specified, skipping this URL."),
@@ -63,7 +63,7 @@ class ProxySupport (object):
         """
         Check if self.host is in the no-proxy-for ignore list.
         """
-        for ro in self.consumer.config("noproxyfor"):
+        for ro in self.aggregate.config["noproxyfor"]:
             if ro.search(self.host):
                 return True
         return False
@@ -59,7 +59,7 @@ class TelnetUrl (urlbase.UrlBase):
         label is "login: ", expected password label is "Password: ".
         """
         self.url_connection = telnetlib.Telnet()
-        if self.consumer.config("debug"):
+        if self.aggregate.config["debug"]:
             self.url_connection.set_debuglevel(1)
         self.url_connection.open(self.host, self.port)
         if self.user:
@@ -25,8 +25,6 @@ import unittest
 
 import linkcheck
 import linkcheck.checker
-import linkcheck.checker.cache
-import linkcheck.checker.consumer
 import linkcheck.configuration
 import linkcheck.logger

@@ -93,7 +91,17 @@ class TestLogger (linkcheck.logger.Logger):
         self.diff.append(line)
 
 
-def get_test_consumer (confargs, logargs):
+def get_file (filename=None):
+    """
+    Get file name located within 'data' directory.
+    """
+    directory = os.path.join("linkcheck", "checker", "tests", "data")
+    if filename:
+        return unicode(os.path.join(directory, filename))
+    return unicode(directory)
+
+
+def get_test_aggregate (confargs, logargs):
     """
     Initialize a test configuration object.
     """

@@ -101,14 +109,15 @@ def get_test_consumer (confargs, logargs):
     config.logger_add('test', TestLogger)
     config['recursionlevel'] = 1
     config['logger'] = config.logger_new('test', **logargs)
     # uncomment for debugging
     #config.init_logging(debug=["all"])
     config["anchors"] = True
     config["verbose"] = True
     config['threads'] = 0
     config['status'] = False
     config['cookies'] = True
     config['geoip'] = None
     config.update(confargs)
-    cache = linkcheck.checker.cache.Cache()
-    return linkcheck.checker.consumer.Consumer(config, cache)
+    return linkcheck.director.get_aggregate(config)

@@ -122,21 +131,14 @@ class LinkCheckTest (unittest.TestCase):
         """
         return linkcheck.url.url_norm(url)[0]
 
-    def get_file (self, filename):
-        """
-        Get file name located within 'data' directory.
-        """
-        return unicode(os.path.join("linkcheck", "checker", "tests",
-                                    "data", filename))
-
     def get_resultlines (self, filename):
         """
         Return contents of file, as list of lines without line endings,
         ignoring empty lines and lines starting with a hash sign (#).
         """
-        resultfile = self.get_file(filename+".result")
+        resultfile = get_file(filename+".result")
         d = {'curdir': os.getcwd(),
-             'datadir': 'linkcheck/checker/tests/data',
+             'datadir': get_file(),
             }
         f = codecs.open(resultfile, "r", "iso-8859-15")
         resultlines = [line.rstrip('\r\n') % d for line in f \

@@ -144,27 +146,30 @@ class LinkCheckTest (unittest.TestCase):
         f.close()
         return resultlines
 
-    def file_test (self, filename, confargs=None, cmdline=True):
+    def file_test (self, filename, confargs=None, assume_local=True):
         """
         Check <filename> with expected result in <filename>.result.
         """
-        url = self.get_file(filename)
+        url = get_file(filename)
         if confargs is None:
             confargs = {}
         logargs = {'expected': self.get_resultlines(filename)}
-        consumer = get_test_consumer(confargs, logargs)
+        aggregate = get_test_aggregate(confargs, logargs)
         url_data = linkcheck.checker.get_url_from(
-            url, 0, consumer, cmdline=cmdline)
-        consumer.append_url(url_data)
-        linkcheck.checker.check_urls(consumer)
-        if consumer.config('logger').diff:
+            url, 0, aggregate, assume_local=assume_local)
+        if assume_local:
+            linkcheck.add_intern_pattern(url_data, aggregate.config)
+        aggregate.urlqueue.put(url_data)
+        linkcheck.director.check_urls(aggregate)
+        diff = aggregate.config['logger'].diff
+        if diff:
             sep = unicode(os.linesep)
-            l = [url] + consumer.config('logger').diff
+            l = [url] + diff
             l = sep.join(l)
             self.fail(l.encode("iso8859-1", "ignore"))
 
     def direct (self, url, resultlines, fields=None, recursionlevel=0,
-                confargs=None, cmdline=False):
+                confargs=None, assume_local=False):
         """
         Check url with expected result.
         """

@@ -176,14 +181,17 @@ class LinkCheckTest (unittest.TestCase):
         logargs = {'expected': resultlines}
         if fields is not None:
             logargs['fields'] = fields
-        consumer = get_test_consumer(confargs, logargs)
+        aggregate = get_test_aggregate(confargs, logargs)
         url_data = linkcheck.checker.get_url_from(
-            url, 0, consumer, cmdline=cmdline)
-        consumer.append_url(url_data)
-        linkcheck.checker.check_urls(consumer)
-        if consumer.config('logger').diff:
+            url, 0, aggregate, assume_local=assume_local)
+        if assume_local:
+            linkcheck.add_intern_pattern(url_data, aggregate.config)
+        aggregate.urlqueue.put(url_data)
+        linkcheck.director.check_urls(aggregate)
+        diff = aggregate.config['logger'].diff
+        if diff:
             sep = unicode(os.linesep)
             l = [u"Differences found testing %s" % url]
-            l.extend(x.rstrip() for x in consumer.config('logger').diff[2:])
+            l.extend(x.rstrip() for x in diff[2:])
             self.fail(sep.join(l).encode("iso8859-1", "ignore"))
@@ -1,7 +1,3 @@
-url
-cache key None
-real url None
-error
 url file://%(curdir)s/%(datadir)s/misc.html
 cache key file://%(curdir)s/%(datadir)s/misc.html
 real url file://%(curdir)s/%(datadir)s/misc.html

@@ -21,3 +17,8 @@ url favicon.ico (cached)
 cache key file://%(curdir)s/%(datadir)s/favicon.ico
 real url file://%(curdir)s/%(datadir)s/favicon.ico
 valid
+
+url
+cache key None
+real url None
+error
@@ -36,8 +36,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             url = u"http://localhost:%d/linkcheck/checker/tests/data/" \
                   u"http.html" % self.port
             resultlines = self.get_resultlines("http.html")
-            self.direct(url, resultlines, recursionlevel=1, cmdline=True)
-            self.redirect_http_test()
+            self.direct(url, resultlines, recursionlevel=1, assume_local=True)
+            self.redirect1_http_test()
+            self.redirect2_http_test()
             self.noproxyfor_test()
         finally:
             self.stop_server()

@@ -64,9 +65,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             u"original URL was u'http://localhost:%d/redirect1'." % self.port,
             u"valid",
         ]
-        self.direct(url, resultlines, recursionlevel=0, cmdline=True)
+        self.direct(url, resultlines, recursionlevel=0, assume_local=True)
 
-    def redirect_http_test (self):
+    def redirect1_http_test (self):
         url = u"http://localhost:%d/redirect1" % self.port
         nurl = url
         rurl = url.replace("redirect", "newurl")

@@ -77,7 +78,9 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             u"info Redirected to %s." % rurl,
             u"error",
         ]
-        self.direct(url, resultlines, recursionlevel=0, cmdline=True)
+        self.direct(url, resultlines, recursionlevel=0, assume_local=True)
+
+    def redirect2_http_test (self):
         url = u"http://localhost:%d/linkcheck/checker/tests/data/redirect.html" % \
             self.port
         nurl = url

@@ -94,7 +97,7 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             u"name Recursive Redirect",
             u"valid",
         ]
-        self.direct(url, resultlines, recursionlevel=99, cmdline=True)
+        self.direct(url, resultlines, recursionlevel=99, assume_local=True)
 
     def noproxyfor_test (self):
         """

@@ -113,7 +116,7 @@ class TestHttp (linkcheck.checker.tests.httptest.HttpServerTest):
             u"valid",
         ]
         self.direct(url, resultlines, recursionlevel=0,
-                    confargs=confargs, cmdline=True)
+                    confargs=confargs, assume_local=True)
         del os.environ["http_proxy"]
@@ -32,6 +32,7 @@ import traceback
 import linkcheck
 import linkcheck.linkparse
 import linkcheck.checker
+import linkcheck.director
 import linkcheck.strformat
 import linkcheck.containers
 import linkcheck.log

@@ -55,7 +56,7 @@ class UrlBase (object):
     An URL with additional information like validity etc.
     """
 
-    def __init__ (self, base_url, recursion_level, consumer,
+    def __init__ (self, base_url, recursion_level, aggregate,
                   parent_url = None, base_ref = None,
                   line = -1, column = -1, name = u""):
         """

@@ -63,7 +64,7 @@ class UrlBase (object):
 
         @param base_url: unquoted and possibly unnormed url
         @param recursion_level: on what check level lies the base url
-        @param consumer: consumer instance
+        @param aggregate: aggregate instance
         @param parent_url: quoted and normed url of parent or None
         @param base_ref: quoted and normed url of <base href=""> or None
         @param line: line number of url in parent content

@@ -71,13 +72,13 @@ class UrlBase (object):
         @param name: name of url or empty
         """
         self.init(base_ref, base_url, parent_url, recursion_level,
-                  consumer, line, column, name)
+                  aggregate, line, column, name)
         self.reset()
         self.check_syntax()
 
     def init (self, base_ref, base_url, parent_url, recursion_level,
-              consumer, line, column, name):
+              aggregate, line, column, name):
         """
         Initialize internal data.
         """

@@ -86,7 +87,7 @@ class UrlBase (object):
         self.base_url = base_url
         self.parent_url = parent_url
         self.recursion_level = recursion_level
-        self.consumer = consumer
+        self.aggregate = aggregate
         self.line = line
         self.column = column
         self.name = name

@@ -203,6 +204,7 @@ class UrlBase (object):
         Fill attributes from cache data.
         """
         self.result = cache_data["result"]
+        self.has_result = True
         self.warnings.extend(cache_data["warnings"])
         self.info.extend(cache_data["info"])
         self.valid = cache_data["valid"]

@@ -240,8 +242,8 @@ class UrlBase (object):
         assert linkcheck.log.debug(linkcheck.LOG_CACHE,
             "Content cache key %r", self.cache_content_key)
         # construct cache key
-        if self.consumer.config("anchorcaching") and \
-           self.consumer.config("anchors"):
+        if self.aggregate.config["anchorcaching"] and \
+           self.aggregate.config["anchors"]:
             # do not ignore anchor
             parts = self.urlparts[:]
             parts[4] = self.anchor

@@ -343,32 +345,28 @@ class UrlBase (object):
         """
         Main check function for checking this URL.
         """
-        if self.consumer.config("trace"):
+        if self.aggregate.config["trace"]:
             linkcheck.trace.trace_on()
         try:
             self.local_check()
-            self.consumer.checked(self)
         except (socket.error, select.error):
-            self.consumer.interrupted(self)
             # on Unix, ctrl-c can raise
             # error: (4, 'Interrupted system call')
             etype, value = sys.exc_info()[:2]
-            if etype == 4:
+            if etype == errno.EINTR:
                 raise KeyboardInterrupt(value)
             else:
                 raise
         except KeyboardInterrupt:
-            self.consumer.interrupted(self)
             raise
         except:
-            self.consumer.interrupted(self)
-            linkcheck.checker.internal_error()
+            linkcheck.director.internal_error()
 
     def add_country_info (self):
         """
         Try to ask GeoIP database for country info.
         """
-        country = self.consumer.get_country_name(self.host)
+        country = linkcheck.cache.geoip.get_country(self.host)
         if country is not None:
             self.add_info(_("URL is located in %s.") % _(country))

@@ -377,10 +375,11 @@ class UrlBase (object):
         Local check function can be overridden in subclasses.
         """
         assert linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
-        if self.recursion_level and self.consumer.config('wait'):
+        wait = self.aggregate.config['wait']
+        if self.recursion_level and wait:
             assert linkcheck.log.debug(linkcheck.LOG_CHECK,
-                "sleeping for %d seconds", self.consumer.config('wait'))
-            time.sleep(self.consumer.config('wait'))
+                "sleeping for %d seconds", wait)
+            time.sleep(wait)
         t = time.time()
         self.set_extern(self.url)
         if self.extern[0] and self.extern[1]:

@@ -392,7 +391,7 @@ class UrlBase (object):
         try:
             self.check_connection()
             self.add_country_info()
-            if self.consumer.config("anchors"):
+            if self.aggregate.config["anchors"]:
                 self.check_anchors()
         except tuple(linkcheck.checker.ExcList):
             value = self.handle_exception()

@@ -406,7 +405,7 @@ class UrlBase (object):
                 valid=False)
 
         # check content
-        warningregex = self.consumer.config("warningregex")
+        warningregex = self.aggregate.config["warningregex"]
         if warningregex and self.valid:
             assert linkcheck.log.debug(linkcheck.LOG_CHECK,
                 "checking content")

@@ -486,8 +485,8 @@ class UrlBase (object):
             assert linkcheck.log.debug(linkcheck.LOG_CHECK,
                 "... no, cannot get content.")
             return False
-        if self.consumer.config("recursionlevel") >= 0 and \
-           self.recursion_level >= self.consumer.config("recursionlevel"):
+        rec_level = self.aggregate.config["recursionlevel"]
+        if rec_level >= 0 and self.recursion_level >= rec_level:
             assert linkcheck.log.debug(linkcheck.LOG_CHECK,
                 "... no, maximum recursion level reached.")
             return False

@@ -551,7 +550,7 @@ class UrlBase (object):
 
         @return: None
         """
-        for entry in self.consumer.config("externlinks"):
+        for entry in self.aggregate.config["externlinks"]:
             match = entry['pattern'].search(url)
             if (entry['negate'] and not match) or \
                (match and not entry['negate']):

@@ -559,7 +558,7 @@ class UrlBase (object):
                     "Extern URL %r", url)
                 self.extern = (1, entry['strict'])
                 return
-        for entry in self.consumer.config("internlinks"):
+        for entry in self.aggregate.config["internlinks"]:
             match = entry['pattern'].search(url)
             if (entry['negate'] and not match) or \
                (match and not entry['negate']):

@@ -607,7 +606,7 @@ class UrlBase (object):
         If a maximum size was given, call this function to check it
         against the content size of this url.
         """
-        maxbytes = self.consumer.config("warnsizebytes")
+        maxbytes = self.aggregate.config["warnsizebytes"]
         if maxbytes is not None and self.dlsize >= maxbytes:
             self.add_warning(_("Content size %s is larger than %s.") %
                 (linkcheck.strformat.strsize(self.dlsize),

@@ -626,7 +625,7 @@ class UrlBase (object):
         Get tuple (user, password) from configured authentication.
         Both user and password can be None if not specified.
         """
-        for auth in self.consumer.config("authentication"):
+        for auth in self.aggregate.config["authentication"]:
             if auth['pattern'].match(self.url):
                 return auth['user'], auth['password']
         return None, None

@@ -651,10 +650,10 @@ class UrlBase (object):
         else:
             base_ref = h.base_ref
         url_data = linkcheck.checker.get_url_from(url,
-            self.recursion_level+1, self.consumer, parent_url=self.url,
+            self.recursion_level+1, self.aggregate, parent_url=self.url,
             base_ref=base_ref, line=line, column=column, name=name,
-            cmdline=False)
-        self.consumer.append_url(url_data)
+            assume_local=False)
+        self.aggregate.urlqueue.put(url_data)
 
     def parse_opera (self):
         """

@@ -674,10 +673,10 @@ class UrlBase (object):
                 url = line[4:]
             if url:
                 url_data = linkcheck.checker.get_url_from(url,
-                    self.recursion_level+1, self.consumer,
+                    self.recursion_level+1, self.aggregate,
                     parent_url=self.url, line=lineno, name=name,
-                    cmdline=False)
-                self.consumer.append_url(url_data)
+                    assume_local=False)
+                self.aggregate.urlqueue.put(url_data)
             name = ""
 
     def parse_text (self):

@@ -694,10 +693,10 @@ class UrlBase (object):
             if not line or line.startswith('#'):
                 continue
             url_data = linkcheck.checker.get_url_from(line,
-                self.recursion_level+1, self.consumer,
+                self.recursion_level+1, self.aggregate,
                 parent_url=self.url, line=lineno,
-                cmdline=False)
-            self.consumer.append_url(url_data)
+                assume_local=False)
+            self.aggregate.urlqueue.put(url_data)
 
     def parse_css (self):
         """

@@ -712,10 +711,10 @@ class UrlBase (object):
             column = mo.start("url")
             url = linkcheck.strformat.unquote(mo.group("url").strip())
             url_data = linkcheck.checker.get_url_from(url,
-                self.recursion_level+1, self.consumer,
+                self.recursion_level+1, self.aggregate,
                 parent_url=self.url, line=lineno, column=column,
-                cmdline=False)
-            self.consumer.append_url(url_data)
+                assume_local=False)
+            self.aggregate.urlqueue.put(url_data)
 
     def serialized (self):
         """

@@ -758,7 +757,7 @@ class UrlBase (object):
         @rtype: string
         """
         s = self.serialized()
-        return self.consumer.config('logger').encode(s)
+        return self.aggregate.config['logger'].encode(s)
 
     def __repr__ (self):
         """
@@ -28,11 +28,6 @@ import linkcheck
 import linkcheck.log
 import linkcheck.containers
 import confparse
-try:
-    import GeoIP
-    _has_geoip = True
-except ImportError:
-    _has_geoip = False
 
 Version = _linkchecker_configdata.version
 AppName = u"LinkChecker"

@@ -83,6 +78,7 @@ class Configuration (dict):
         self["internlinks"] = []
         self["noproxyfor"] = []
         self["interactive"] = False
+        self["maxqueuesize"] = 0
         # on ftp, password is set by Pythons ftplib
         self["authentication"] = []
         self["proxy"] = urllib.getproxies()

@@ -149,18 +145,6 @@ class Configuration (dict):
         self["warnsizebytes"] = None
         self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
         self["threads"] = 10
-        self.init_geoip()
-
-    def init_geoip (self):
-        """
-        If GeoIP.dat file is found, initialize a standard geoip DB and
-        store it in self["geoip"]; else this value will be None.
-        """
-        geoip_dat = "/usr/share/GeoIP/GeoIP.dat"
-        if _has_geoip and os.path.exists(geoip_dat):
-            self["geoip"] = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
-        else:
-            self["geoip"] = None
 
     def init_logging (self, debug=None):
         """
@@ -23,12 +23,14 @@ import os
 import linkcheck.configuration
 
 
-def get_file (filename):
+def get_file (filename=None):
     """
     Get file name located within 'data' directory.
     """
-    return unicode(os.path.join("linkcheck", "configuration", "tests",
-                   "data", filename))
+    directory = os.path.join("linkcheck", "configuration", "tests", "data")
+    if filename:
+        return unicode(os.path.join(directory, filename))
+    return unicode(directory)
 
 
 class TestConfig (unittest.TestCase):
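
With the default argument, the helper now doubles as a way to get the data
directory itself. A quick illustration, assuming a POSIX path separator
(the .ini file name is hypothetical):

    get_file("config0.ini")  # u'linkcheck/configuration/tests/data/config0.ini'
    get_file()               # u'linkcheck/configuration/tests/data'
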
@@ -297,40 +297,3 @@ class Rfc2965Cookie (HttpCookie):
 
     # XXX more methods (equality test)
-
-
-class CookieJar (set):
-    """
-    Cookie storage, implementing the default cookie handling policy for
-    LinkChecker.
-    """
-
-    def add_cookies (self, headers, scheme, host, path):
-        """
-        Parse cookie values, add to jar.
-        """
-        to_add = set()
-        for h in headers.getallmatchingheaders("Set-Cookie"):
-            # RFC 2109 (Netscape) cookie type
-            try:
-                to_add.add(NetscapeCookie(h, scheme, host, path))
-            except CookieError:
-                assert linkcheck.log.debug(linkcheck.LOG_CACHE,
-                    "Invalid cookie header for %s:%s%s: %r", scheme, host, path, h)
-        for h in headers.getallmatchingheaders("Set-Cookie2"):
-            # RFC 2965 cookie type
-            try:
-                to_add.add(Rfc2965Cookie(h, scheme, host, path))
-            except CookieError:
-                assert linkcheck.log.debug(linkcheck.LOG_CACHE,
-                    "Invalid cookie2 header for %s:%s%s: %r", scheme, host, path, h)
-        for x in to_add:
-            self.add(x)
-        return to_add
-
-    def remove_expired (self):
-        """
-        Remove expired cookies from jar.
-        """
-        to_remove = [x for x in self if not x.check_expired()]
-        return self.difference_update(to_remove)
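
For reference, the removed set-based jar is driven in two calls; `headers`
stands for any rfc822.Message-like object offering getallmatchingheaders().
A hypothetical usage sketch, not code from this commit:

    jar = CookieJar()
    # Parse all Set-Cookie/Set-Cookie2 headers of one response into the jar.
    new_cookies = jar.add_cookies(headers, "http", "example.org", "/")
    # Prune cookies according to the expiry check implemented above.
    jar.remove_expired()
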
@@ -31,8 +31,7 @@ import linkcheck.url
 import linkcheck.i18n
 import linkcheck.strformat
 import linkcheck.checker
-import linkcheck.checker.cache
-import linkcheck.checker.consumer
+import linkcheck.director
 
 _logfile = None
 _supported_langs = ('de', 'fr', 'nl', 'C')
@@ -99,13 +98,16 @@ def checklink (out=sys.stdout, form=None, env=os.environ):
     config["externlinks"].append(
         linkcheck.get_link_pat("^%s$" % linkcheck.url.safe_url_pattern))
     config["externlinks"].append(linkcheck.get_link_pat(".*", strict=True))
-    # start checking
-    cache = linkcheck.checker.cache.Cache()
-    consumer = linkcheck.checker.consumer.Consumer(config, cache)
+    aggregate = linkcheck.director.get_aggregate(config)
+
+    # start checking
     url = form["url"].value
-    url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=False)
-    consumer.append_url(url_data)
-    linkcheck.checker.check_urls(consumer)
+    url_data = linkcheck.checker.get_url_from(url, 0, aggregate,
+        assume_local=False)
+    aggregate.urlqueue.put(url_data)
+    linkcheck.director.check_urls(aggregate)
 
 
 def get_host_name (form):
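
linkcheck.director.check_urls(aggregate) replaces the old consumer loop and
blocks until the URL queue is drained. The director internals are not part
of this diff, but the worker-pool idea can be sketched with the standard
Queue and threading modules (url_data.check() and all other names here are
illustrative; Queue.join()/task_done() require Python 2.5):

    import Queue
    import threading

    def worker(urlqueue):
        while True:
            url_data = urlqueue.get()
            if url_data is None:        # sentinel: shut this worker down
                break
            url_data.check()            # hypothetical per-URL check
            urlqueue.task_done()

    def check_urls(urlqueue, num_threads=10):
        # config["threads"] defaults to 10, hence the default pool size.
        threads = [threading.Thread(target=worker, args=(urlqueue,))
                   for i in range(num_threads)]
        for t in threads:
            t.start()
        urlqueue.join()                 # workers may enqueue new URLs while
                                        # checking; join() waits for those too
        for t in threads:
            urlqueue.put(None)          # one sentinel per worker
        for t in threads:
            t.join()
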
@@ -20,18 +20,16 @@ Test url build method from url data objects.
 
 import unittest
 import linkcheck.configuration
+import linkcheck.director
 import linkcheck.checker.httpurl
-import linkcheck.checker.cache
-import linkcheck.checker.consumer
 
-def get_test_consumer ():
+def get_test_aggregate ():
     """
     Initialize a test configuration object.
     """
     config = linkcheck.configuration.Configuration()
     config['logger'] = config.logger_new('none')
-    cache = linkcheck.checker.cache.Cache()
-    return linkcheck.checker.consumer.Consumer(config, cache)
+    return linkcheck.director.get_aggregate(config)
 
 
 class TestUrlBuild (unittest.TestCase):
@@ -43,9 +41,9 @@ class TestUrlBuild (unittest.TestCase):
         parent_url = "http://localhost:8001/linkcheck/checker/tests/data/http.html"
         base_url = "http://"
         recursion_level = 0
-        consumer = get_test_consumer()
+        aggregate = get_test_aggregate()
         o = linkcheck.checker.httpurl.HttpUrl(base_url, recursion_level,
-                                              consumer, parent_url=parent_url)
+                                              aggregate, parent_url=parent_url)
         o.build_url()
         self.assertEquals(o.url, 'http://')
 
linkchecker (24 changed lines)

@@ -38,8 +38,7 @@ optparse._ = _
 import linkcheck.log
 import linkcheck.i18n
 import linkcheck.checker
-import linkcheck.checker.cache
-import linkcheck.checker.consumer
+import linkcheck.director
 import linkcheck.configuration
 import linkcheck.fileutil
 import linkcheck.strformat
@@ -654,14 +653,15 @@ if len(args) <= 0:
 else:
     linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or URLs given"))
 
-# initialize the cache and the consumer model
-cache = linkcheck.checker.cache.Cache()
-consumer = linkcheck.checker.consumer.Consumer(config, cache)
+# prepare checking queue
+aggregate = linkcheck.director.get_aggregate(config)
 if options.trace:
     config["trace"] = True
     import linkcheck.trace
     linkcheck.trace.trace_filter([r"^linkcheck"])
     linkcheck.trace.trace_on()
 # add urls to queue
+get_url_from = linkcheck.checker.get_url_from
 for url in args:
     if url.lower().startswith("www."):
         # syntactic sugar
@@ -669,14 +669,14 @@ for url in args:
     elif url.lower().startswith("ftp."):
         # syntactic sugar
         url = "ftp://%s" % url
-    url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True)
-    consumer.append_url(url_data)
-############################# check the URLs ################################
+    url_data = get_url_from(url, 0, aggregate, assume_local=True)
+    linkcheck.add_intern_pattern(url_data, config)
+    aggregate.urlqueue.put(url_data)
 
 # set up profiling/psyco
 if do_profile and not has_profile:
     linkcheck.log.warn(linkcheck.LOG_CMDLINE,
         _("The `profile' Python module is not installed,"
           " therefore the --profile option is disabled."))
 
 if do_profile and has_profile:
     run = True
     if os.path.exists(_profile):
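
Each command-line argument thus follows a three-step pattern: build the URL
data with assume_local=True (presumably because command-line arguments may
name local files), register the argument's site via
linkcheck.add_intern_pattern() so recursion treats it as internal, then seed
the queue. In outline (the example host is illustrative):

    url_data = get_url_from("http://www.example.org/", 0, aggregate,
                            assume_local=True)
    linkcheck.add_intern_pattern(url_data, config)  # mark this site intern
    aggregate.urlqueue.put(url_data)                # seed the worker queue
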
@@ -690,7 +690,7 @@ if do_profile and has_profile:
         run = False
 if run:
     import profile
-    profile.run("linkcheck.checker.check_urls(consumer)", _profile)
+    profile.run("manager.check_urls()", _profile)
 elif options.psyco:
     try:
         import psyco
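
Under --profile the check runs inside the profiler, which writes its timing
data to the stats file; that file can be inspected afterwards with the
standard pstats module, e.g.:

    import pstats

    # _profile is the stats file name passed to profile.run() above.
    stats = pstats.Stats(_profile)
    stats.sort_stats('cumulative').print_stats(20)
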
@@ -705,8 +705,8 @@ elif options.psyco:
     except ImportError:
         # no psyco available, just ignore
         pass
-linkcheck.checker.check_urls(consumer)
-#############################################################################
+# start checking
+linkcheck.director.check_urls(aggregate)
 
 # interactive input end
 if config['interactive']:
setup.py (3 changed lines)

@@ -537,7 +537,8 @@ o a (Fast)CGI web interface (requires HTTP server)
       'clean': MyClean,
       },
       packages = ['linkcheck', 'linkcheck.logger', 'linkcheck.checker',
-                  'linkcheck.configuration',
+                  'linkcheck.director', 'linkcheck.configuration',
+                  'linkcheck.cache',
                   'linkcheck.dns', 'linkcheck.dns.rdtypes',
                   'linkcheck.dns.rdtypes.ANY', 'linkcheck.dns.rdtypes.IN',
                   'linkcheck.HtmlParser', 'linkcheck.ftpparse', ],