mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-23 23:54:44 +00:00
use a decorator to synchronize all the checker threads
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2614 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
3f6d2fa753
commit
55d7d5f7df
13 changed files with 246 additions and 294 deletions
|
|
@ -134,13 +134,14 @@ def _check_urls (consumer):
|
|||
start_time = time.time()
|
||||
status_time = start_time
|
||||
while not consumer.finished():
|
||||
consumer.check_url()
|
||||
if consumer.config['status']:
|
||||
if not consumer.check_url():
|
||||
time.sleep(0.1)
|
||||
if consumer.config('status'):
|
||||
curtime = time.time()
|
||||
if (curtime - status_time) > 5:
|
||||
consumer.print_status(curtime, start_time)
|
||||
status_time = curtime
|
||||
consumer.logger_end_output()
|
||||
consumer.end_log_output()
|
||||
|
||||
|
||||
# file extensions we can parse recursively
|
||||
|
|
@ -252,7 +253,7 @@ def get_url_from (base_url, recursion_level, consumer,
|
|||
pat = url_data.get_intern_pattern()
|
||||
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "Pattern %r", pat)
|
||||
if pat:
|
||||
consumer.config['internlinks'].append(linkcheck.get_link_pat(pat))
|
||||
consumer.config_append('internlinks', linkcheck.get_link_pat(pat))
|
||||
return url_data
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -48,13 +48,11 @@ def _check_morsel (m, host, path):
|
|||
return m.output(header='').strip()
|
||||
|
||||
|
||||
class Cache (linkcheck.lock.AssertLock):
|
||||
class Cache (object):
|
||||
"""
|
||||
Store and provide routines for cached data. Currently there are
|
||||
caches for cookies, checked URLs, FTP connections and robots.txt
|
||||
contents.
|
||||
|
||||
All public operations (except __init__()) are thread-safe.
|
||||
"""
|
||||
|
||||
def __init__ (self):
|
||||
|
|
@ -63,10 +61,13 @@ class Cache (linkcheck.lock.AssertLock):
|
|||
"""
|
||||
super(Cache, self).__init__()
|
||||
# already checked URLs
|
||||
# {cache key (string) -> cache data (dict)}
|
||||
self.checked = {}
|
||||
# URLs that are being checked
|
||||
# {cache key (string) -> urldata (UrlData)}
|
||||
self.in_progress = {}
|
||||
# to-be-checked URLs
|
||||
# [urldata (UrlData)]
|
||||
self.incoming = collections.deque()
|
||||
# downloaded robots.txt files
|
||||
self.robots_txt = {}
|
||||
|
|
@ -79,11 +80,7 @@ class Cache (linkcheck.lock.AssertLock):
|
|||
"""
|
||||
Check if incoming queue is empty.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
return len(self.incoming) <= 0
|
||||
finally:
|
||||
self.release()
|
||||
return len(self.incoming) <= 0
|
||||
|
||||
def incoming_get_url (self):
|
||||
"""
|
||||
|
|
@ -91,57 +88,44 @@ class Cache (linkcheck.lock.AssertLock):
|
|||
return it. If no such url is available return None. The
|
||||
url might be already cached.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
for i, url_data in enumerate(self.incoming):
|
||||
key = url_data.cache_url_key
|
||||
if key in self.checked:
|
||||
del self.incoming[i]
|
||||
# url is cached and can be logged
|
||||
url_data.copy_from_cache(self.checked[key])
|
||||
return url_data
|
||||
elif key not in self.in_progress:
|
||||
del self.incoming[i]
|
||||
self.in_progress[key] = url_data
|
||||
return url_data
|
||||
return None
|
||||
finally:
|
||||
self.release()
|
||||
for i, url_data in enumerate(self.incoming):
|
||||
key = url_data.cache_url_key
|
||||
if key in self.checked:
|
||||
del self.incoming[i]
|
||||
# url is cached and can be logged
|
||||
url_data.copy_from_cache(self.checked[key])
|
||||
return url_data
|
||||
elif key not in self.in_progress:
|
||||
del self.incoming[i]
|
||||
self.in_progress[key] = url_data
|
||||
return url_data
|
||||
return None
|
||||
|
||||
def incoming_len (self):
|
||||
"""
|
||||
Return number of entries in incoming queue.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
return len(self.incoming)
|
||||
finally:
|
||||
self.release()
|
||||
return len(self.incoming)
|
||||
|
||||
def incoming_add (self, url_data):
|
||||
"""
|
||||
Add a new URL to list of URLs to check.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE,
|
||||
"Add url %s...", repr(url_data))
|
||||
# check syntax
|
||||
if not url_data.check_syntax():
|
||||
# wrong syntax, do not check any further
|
||||
return False
|
||||
# check the cache
|
||||
key = url_data.cache_url_key
|
||||
if key in self.checked:
|
||||
# url is cached and can be logged
|
||||
url_data.copy_from_cache(self.checked[key])
|
||||
return False
|
||||
# url is not cached, so add to incoming queue
|
||||
self.incoming.append(url_data)
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE, "...added.")
|
||||
return True
|
||||
finally:
|
||||
self.release()
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE,
|
||||
"Add url %s...", repr(url_data))
|
||||
if url_data.has_result:
|
||||
# do not check any further
|
||||
return False
|
||||
# check the cache
|
||||
key = url_data.cache_url_key
|
||||
if key in self.checked:
|
||||
# url is cached and can be logged
|
||||
url_data.copy_from_cache(self.checked[key])
|
||||
return False
|
||||
# url is not cached, so add to incoming queue
|
||||
self.incoming.append(url_data)
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE, "...added.")
|
||||
return True
|
||||
|
||||
def has_incoming (self, key):
|
||||
"""
|
||||
|
|
@ -150,11 +134,7 @@ class Cache (linkcheck.lock.AssertLock):
|
|||
@param key: Usually obtained from url_data.cache_url_key
|
||||
@type key: String
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
return key in self.incoming
|
||||
finally:
|
||||
self.release()
|
||||
return key in self.incoming
|
||||
|
||||
def has_in_progress (self, key):
|
||||
"""
|
||||
|
|
@ -163,44 +143,32 @@ class Cache (linkcheck.lock.AssertLock):
|
|||
@param key: Usually obtained from url_data.cache_url_key
|
||||
@type key: String
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
return key in self.in_progress
|
||||
finally:
|
||||
self.release()
|
||||
return key in self.in_progress
|
||||
|
||||
def in_progress_remove (self, url_data, ignore_missing=False):
|
||||
"""
|
||||
Remove url from in-progress cache. If url is not cached and
|
||||
ignore_missing evaluates False, raise AssertionError.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
key = url_data.cache_url_key
|
||||
if key in self.in_progress:
|
||||
del self.in_progress[key]
|
||||
else:
|
||||
assert ignore_missing, repr(key)
|
||||
finally:
|
||||
self.release()
|
||||
key = url_data.cache_url_key
|
||||
if key in self.in_progress:
|
||||
del self.in_progress[key]
|
||||
else:
|
||||
assert ignore_missing, repr(key)
|
||||
|
||||
def checked_add (self, url_data):
|
||||
"""
|
||||
Cache checked url data.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
data = url_data.get_cache_data()
|
||||
key = url_data.cache_url_key
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE, "Cache key %r...", key)
|
||||
assert key not in self.checked, \
|
||||
key + u", " + unicode(self.checked[key])
|
||||
assert key in self.in_progress, key
|
||||
# move entry from self.in_progress to self.checked
|
||||
del self.in_progress[key]
|
||||
self.checked[key] = data
|
||||
finally:
|
||||
self.release()
|
||||
data = url_data.get_cache_data()
|
||||
key = url_data.cache_url_key
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE, "Cache key %r...", key)
|
||||
assert key not in self.checked, \
|
||||
key + u", " + unicode(self.checked[key])
|
||||
assert key in self.in_progress, key
|
||||
# move entry from self.in_progress to self.checked
|
||||
del self.in_progress[key]
|
||||
self.checked[key] = data
|
||||
|
||||
def checked_redirect (self, redirect, url_data):
|
||||
"""
|
||||
|
|
@ -209,96 +177,69 @@ class Cache (linkcheck.lock.AssertLock):
|
|||
If the redirect URL is found in the cache, the result data is
|
||||
already copied.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
if redirect in self.checked:
|
||||
url_data.copy_from_cache(self.checked[redirect])
|
||||
return True
|
||||
return False
|
||||
finally:
|
||||
self.release()
|
||||
if redirect in self.checked:
|
||||
url_data.copy_from_cache(self.checked[redirect])
|
||||
return True
|
||||
return False
|
||||
|
||||
def robots_txt_allows_url (self, roboturl, url, user, password):
|
||||
"""
|
||||
Ask robots.txt allowance.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
if roboturl not in self.robots_txt:
|
||||
rp = linkcheck.robotparser2.RobotFileParser(
|
||||
user=user, password=password)
|
||||
rp.set_url(roboturl)
|
||||
rp.read()
|
||||
self.robots_txt[roboturl] = rp
|
||||
else:
|
||||
rp = self.robots_txt[roboturl]
|
||||
return rp.can_fetch(linkcheck.configuration.UserAgent, url)
|
||||
finally:
|
||||
self.release()
|
||||
if roboturl not in self.robots_txt:
|
||||
rp = linkcheck.robotparser2.RobotFileParser(
|
||||
user=user, password=password)
|
||||
rp.set_url(roboturl)
|
||||
rp.read()
|
||||
self.robots_txt[roboturl] = rp
|
||||
else:
|
||||
rp = self.robots_txt[roboturl]
|
||||
return rp.can_fetch(linkcheck.configuration.UserAgent, url)
|
||||
|
||||
def get_connection (self, key):
|
||||
"""
|
||||
Get open connection to given host. Return None if no such
|
||||
connection is available (or the old one timed out).
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
return self.pool.get_connection(key)
|
||||
finally:
|
||||
self.release()
|
||||
return self.pool.get_connection(key)
|
||||
|
||||
def add_connection (self, key, connection, timeout):
|
||||
"""
|
||||
Store open connection into pool for reuse.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
self.pool.add_connection(key, connection, timeout)
|
||||
finally:
|
||||
self.release()
|
||||
self.pool.add_connection(key, connection, timeout)
|
||||
|
||||
def release_connection (self, key):
|
||||
"""
|
||||
Remove connection from pool.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
self.pool.release_connection(key)
|
||||
finally:
|
||||
self.release()
|
||||
self.pool.release_connection(key)
|
||||
|
||||
def store_cookies (self, headers, host):
|
||||
"""
|
||||
Thread-safe cookie cache setter function. Can raise the
|
||||
exception Cookie.CookieError.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
output = []
|
||||
for h in headers.getallmatchingheaders("Set-Cookie"):
|
||||
output.append(h)
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE, "Store cookie %s", h)
|
||||
c = self.cookies.setdefault(host, Cookie.SimpleCookie())
|
||||
c.load(h)
|
||||
return output
|
||||
finally:
|
||||
self.release()
|
||||
output = []
|
||||
for h in headers.getallmatchingheaders("Set-Cookie"):
|
||||
output.append(h)
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE, "Store cookie %s", h)
|
||||
c = self.cookies.setdefault(host, Cookie.SimpleCookie())
|
||||
c.load(h)
|
||||
return output
|
||||
|
||||
def get_cookies (self, host, path):
|
||||
"""
|
||||
Thread-safe cookie cache getter function.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE,
|
||||
"Get cookies for host %r path %r", host, path)
|
||||
if not self.cookies.has_key(host):
|
||||
return []
|
||||
cookievals = []
|
||||
for m in self.cookies[host].values():
|
||||
val = _check_morsel(m, host, path)
|
||||
if val:
|
||||
cookievals.append(val)
|
||||
return cookievals
|
||||
finally:
|
||||
self.release()
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE,
|
||||
"Get cookies for host %r path %r", host, path)
|
||||
if not self.cookies.has_key(host):
|
||||
return []
|
||||
cookievals = []
|
||||
for m in self.cookies[host].values():
|
||||
val = _check_morsel(m, host, path)
|
||||
if val:
|
||||
cookievals.append(val)
|
||||
return cookievals
|
||||
|
||||
|
|
|
|||
|
|
@ -20,14 +20,22 @@ Url consumer class.
|
|||
|
||||
import sys
|
||||
import time
|
||||
try:
|
||||
import thread
|
||||
except ImportError:
|
||||
import dummy_thread as thread
|
||||
|
||||
import linkcheck.threader
|
||||
import linkcheck.log
|
||||
import linkcheck.lock
|
||||
import linkcheck.strformat
|
||||
import linkcheck.checker.geoip
|
||||
from linkcheck.decorators import synchronized
|
||||
from urlbase import stderr
|
||||
|
||||
# global lock for synchronizing all the checker threads
|
||||
_lock = thread.allocate_lock()
|
||||
|
||||
|
||||
def print_tocheck (tocheck):
|
||||
msg = _n("%5d URL queued,", "%5d URLs queued,", tocheck) % tocheck
|
||||
|
|
@ -49,7 +57,7 @@ def print_duration (duration):
|
|||
print >> stderr, msg,
|
||||
|
||||
|
||||
class Consumer (linkcheck.lock.AssertLock):
|
||||
class Consumer (object):
|
||||
"""
|
||||
Consume URLs from the URL queue in a thread-safe manner.
|
||||
"""
|
||||
|
|
@ -59,33 +67,46 @@ class Consumer (linkcheck.lock.AssertLock):
|
|||
Initialize consumer data and threads.
|
||||
"""
|
||||
super(Consumer, self).__init__()
|
||||
self.config = config
|
||||
self.cache = cache
|
||||
self.threader = linkcheck.threader.Threader(num=config['threads'])
|
||||
self.logger = config['logger']
|
||||
self.fileoutput = config['fileoutput']
|
||||
self.logger_start_output()
|
||||
self._config = config
|
||||
self._cache = cache
|
||||
self._threader = linkcheck.threader.Threader(num=config['threads'])
|
||||
self.start_log_output()
|
||||
|
||||
@synchronized(_lock)
|
||||
def config (self, key):
|
||||
return self._config[key]
|
||||
|
||||
@synchronized(_lock)
|
||||
def config_append (self, key, val):
|
||||
self._config[key].append(val)
|
||||
|
||||
@synchronized(_lock)
|
||||
def __getattr__ (self, name):
|
||||
if hasattr(self._cache, name):
|
||||
return getattr(self._cache, name)
|
||||
raise AttributeError(name)
|
||||
|
||||
@synchronized(_lock)
|
||||
def append_url (self, url_data):
|
||||
"""
|
||||
Append url to incoming check list.
|
||||
"""
|
||||
if not self.cache.incoming_add(url_data):
|
||||
if not self._cache.incoming_add(url_data):
|
||||
# can be logged
|
||||
self.logger_log_url(url_data)
|
||||
self._log_url(url_data)
|
||||
|
||||
@synchronized(_lock)
|
||||
def check_url (self):
|
||||
"""
|
||||
Start a new thread checking the next url from the incoming queue, if any.
|
||||
"""
|
||||
url_data = self.cache.incoming_get_url()
|
||||
url_data = self._cache.incoming_get_url()
|
||||
if url_data is None:
|
||||
# active connections are downloading/parsing, so
|
||||
# wait a little
|
||||
time.sleep(0.1)
|
||||
# active connections are downloading/parsing
|
||||
pass
|
||||
elif url_data.cached:
|
||||
# was cached -> can be logged
|
||||
self.logger_log_url(url_data)
|
||||
self._log_url(url_data)
|
||||
else:
|
||||
# go check this url
|
||||
# this calls either self.checked() or self.interrupted()
|
||||
|
|
@ -95,47 +116,48 @@ class Consumer (linkcheck.lock.AssertLock):
|
|||
else:
|
||||
name = u""
|
||||
name += url_data.base_url
|
||||
self.threader.start_thread(url_data.check, (), name=name)
|
||||
self._threader.start_thread(url_data.check, (), name=name)
|
||||
return url_data and not url_data.cached
|
||||
|
||||
@synchronized(_lock)
|
||||
def checked (self, url_data):
|
||||
"""
|
||||
Put checked url in cache and log it.
|
||||
"""
|
||||
# log before putting it in the cache (otherwise we would see
|
||||
# a "(cached)" after every url)
|
||||
self.logger_log_url(url_data)
|
||||
self._log_url(url_data)
|
||||
if not url_data.cached:
|
||||
self.cache.checked_add(url_data)
|
||||
self._cache.checked_add(url_data)
|
||||
else:
|
||||
self.cache.in_progress_remove(url_data)
|
||||
self._cache.in_progress_remove(url_data)
|
||||
|
||||
@synchronized(_lock)
|
||||
def interrupted (self, url_data):
|
||||
"""
|
||||
Remove url from active list.
|
||||
"""
|
||||
self.cache.in_progress_remove(url_data, ignore_missing=True)
|
||||
self._cache.in_progress_remove(url_data, ignore_missing=True)
|
||||
|
||||
@synchronized(_lock)
|
||||
def finished (self):
|
||||
"""
|
||||
Return True if checking is finished.
|
||||
"""
|
||||
# avoid deadlock by requesting cache data before locking
|
||||
tocheck = self.cache.incoming_len()
|
||||
self.acquire()
|
||||
try:
|
||||
return self.threader.finished() and tocheck == 0
|
||||
finally:
|
||||
self.release()
|
||||
return self._threader.finished() and \
|
||||
self._cache.incoming_len() == 0
|
||||
|
||||
@synchronized(_lock)
|
||||
def finish (self):
|
||||
self._threader.finish()
|
||||
|
||||
@synchronized(_lock)
|
||||
def no_more_threads (self):
|
||||
"""
|
||||
Return True if no more active threads are running.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
return self.threader.finished()
|
||||
finally:
|
||||
self.release()
|
||||
return self._threader.finished()
|
||||
|
||||
def abort (self):
|
||||
"""
|
||||
|
|
@ -148,7 +170,7 @@ class Consumer (linkcheck.lock.AssertLock):
|
|||
if num_waited > wait_max:
|
||||
linkcheck.log.error(linkcheck.LOG_CHECK,
|
||||
"Thread wait timeout")
|
||||
self.logger_end_output()
|
||||
self.end_log_output()
|
||||
sys.exit(1)
|
||||
num = self.active_threads()
|
||||
msg = \
|
||||
|
|
@ -156,94 +178,69 @@ class Consumer (linkcheck.lock.AssertLock):
|
|||
"keyboard interrupt; waiting for %d active threads to finish",
|
||||
num)
|
||||
linkcheck.log.warn(linkcheck.LOG_CHECK, msg, num)
|
||||
self.acquire()
|
||||
try:
|
||||
self.threader.finish()
|
||||
finally:
|
||||
self.release()
|
||||
self.finish()
|
||||
num_waited += 1
|
||||
time.sleep(2)
|
||||
self.logger_end_output()
|
||||
self.end_log_output()
|
||||
|
||||
@synchronized(_lock)
|
||||
def print_status (self, curtime, start_time):
|
||||
"""
|
||||
Print check status looking at url queues.
|
||||
"""
|
||||
# avoid deadlock by requesting cache data before locking
|
||||
tocheck = self.cache.incoming_len()
|
||||
active = self.active_threads()
|
||||
self.acquire()
|
||||
try:
|
||||
print >> stderr, _("Status:"),
|
||||
print_active(active)
|
||||
print_links(self.logger.number)
|
||||
print_tocheck(tocheck)
|
||||
print_duration(curtime - start_time)
|
||||
print >> stderr
|
||||
finally:
|
||||
self.release()
|
||||
print >> stderr, _("Status:"),
|
||||
print_active(self._threader.active_threads())
|
||||
print_links(self._config['logger'].number)
|
||||
print_tocheck(self._cache.incoming_len())
|
||||
print_duration(curtime - start_time)
|
||||
print >> stderr
|
||||
|
||||
def logger_start_output (self):
|
||||
@synchronized(_lock)
|
||||
def start_log_output (self):
|
||||
"""
|
||||
Start output of all configured loggers.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
self.logger.start_output()
|
||||
for logger in self.fileoutput:
|
||||
logger.start_output()
|
||||
finally:
|
||||
self.release()
|
||||
self._config['logger'].start_output()
|
||||
for logger in self._config['fileoutput']:
|
||||
logger.start_output()
|
||||
|
||||
def logger_log_url (self, url_data):
|
||||
def _log_url (self, url_data):
|
||||
"""
|
||||
Send new url to all configured loggers.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
do_print = self.config["verbose"] or not url_data.valid or \
|
||||
(url_data.warning and self.config["warnings"])
|
||||
self.logger.log_filter_url(url_data, do_print)
|
||||
for log in self.fileoutput:
|
||||
log.log_filter_url(url_data, do_print)
|
||||
finally:
|
||||
self.release()
|
||||
do_print = self._config["verbose"] or not url_data.valid or \
|
||||
(url_data.warning and self._config["warnings"])
|
||||
self._config['logger'].log_filter_url(url_data, do_print)
|
||||
for log in self._config['fileoutput']:
|
||||
log.log_filter_url(url_data, do_print)
|
||||
# do_filter = (self.linknumber % 1000) == 0
|
||||
# XXX deadlock!
|
||||
#if do_filter:
|
||||
# self.filter_queue(self)
|
||||
|
||||
def logger_end_output (self):
|
||||
@synchronized(_lock)
|
||||
def end_log_output (self):
|
||||
"""
|
||||
End output of all configured loggers.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
self.logger.end_output()
|
||||
for logger in self.fileoutput:
|
||||
logger.end_output()
|
||||
finally:
|
||||
self.release()
|
||||
self._config['logger'].end_output()
|
||||
for logger in self._config['fileoutput']:
|
||||
logger.end_output()
|
||||
|
||||
@synchronized(_lock)
|
||||
def active_threads (self):
|
||||
"""
|
||||
Return number of active threads.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
return self.threader.active_threads()
|
||||
finally:
|
||||
self.release()
|
||||
return self._threader.active_threads()
|
||||
|
||||
@synchronized(_lock)
|
||||
def get_country_name (self, host):
|
||||
"""
|
||||
Return country code for host if found, else None.
|
||||
"""
|
||||
self.acquire()
|
||||
try:
|
||||
gi = self.config["geoip"]
|
||||
if gi:
|
||||
return linkcheck.checker.geoip.get_country(gi, host)
|
||||
return None
|
||||
finally:
|
||||
self.release()
|
||||
gi = self._config["geoip"]
|
||||
if gi:
|
||||
return linkcheck.checker.geoip.get_country(gi, host)
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -58,12 +58,12 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
order: login, changing directory, list the file.
|
||||
"""
|
||||
# proxy support (we support only http)
|
||||
self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
|
||||
self.set_proxy(self.consumer.config("proxy").get(self.scheme))
|
||||
if self.proxy:
|
||||
# using a (HTTP) proxy
|
||||
http = httpurl.HttpUrl(self.base_url,
|
||||
self.recursion_level,
|
||||
self.consumer.config,
|
||||
self.consumer,
|
||||
parent_url=self.parent_url,
|
||||
base_ref=self.base_ref,
|
||||
line=self.line,
|
||||
|
|
@ -92,7 +92,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
# ready to connect
|
||||
_user, _password = self.get_user_password()
|
||||
key = ("ftp", self.urlparts[1], _user, _password)
|
||||
conn = self.consumer.cache.get_connection(key)
|
||||
conn = self.consumer.get_connection(key)
|
||||
if conn is not None and conn.sock is not None:
|
||||
# reuse cached FTP connection
|
||||
self.url_connection = conn
|
||||
|
|
@ -250,6 +250,6 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
# add to cached connections
|
||||
_user, _password = self.get_user_password()
|
||||
key = ("ftp", self.urlparts[1], _user, _password)
|
||||
cache_add = self.consumer.cache.add_connection
|
||||
cache_add = self.consumer.add_connection
|
||||
cache_add(key, self.url_connection, DEFAULT_TIMEOUT_SECS)
|
||||
self.url_connection = None
|
||||
|
|
|
|||
|
|
@ -78,8 +78,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""
|
||||
roboturl = self.get_robots_txt_url()
|
||||
user, password = self.get_user_password()
|
||||
return self.consumer.cache.robots_txt_allows_url(roboturl, url,
|
||||
user, password)
|
||||
return self.consumer.robots_txt_allows_url(roboturl, url,
|
||||
user, password)
|
||||
|
||||
def check_connection (self):
|
||||
"""
|
||||
|
|
@ -124,15 +124,17 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
| extension-code
|
||||
"""
|
||||
# set the proxy, so a 407 status after this is an error
|
||||
self.set_proxy(self.consumer.config["proxy"].get(self.scheme))
|
||||
self.set_proxy(self.consumer.config("proxy").get(self.scheme))
|
||||
# initialize check data
|
||||
self.headers = None
|
||||
self.auth = None
|
||||
self.cookies = []
|
||||
# check robots.txt
|
||||
if not self.allows_robots(self.url):
|
||||
self.add_info(
|
||||
# remove all previously stored results
|
||||
self.add_warning(
|
||||
_("Access denied by robots.txt, checked only syntax."))
|
||||
self.set_result(u"syntax OK")
|
||||
return
|
||||
# check for amazon server quirk
|
||||
if _is_amazon(self.urlparts[1]):
|
||||
|
|
@ -144,12 +146,23 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
self.method = "HEAD"
|
||||
# check the http connection
|
||||
response, fallback_GET = self.check_http_connection()
|
||||
if self.headers and self.headers.has_key("Server"):
|
||||
server = self.headers['Server']
|
||||
else:
|
||||
server = _("unknown")
|
||||
if fallback_GET:
|
||||
self.add_info(_("Server %r did not support HEAD request; "\
|
||||
"a GET request was used instead.") % server)
|
||||
if self.no_anchor:
|
||||
self.add_warning(_("Server %r had no anchor support, removed"\
|
||||
" anchor from request.") % server)
|
||||
# redirections might have changed the URL
|
||||
newurl = urlparse.urlunsplit(self.urlparts)
|
||||
if self.url != newurl:
|
||||
self.url = newurl
|
||||
# check response
|
||||
self.check_response(response, fallback_GET)
|
||||
if response:
|
||||
self.check_response(response)
|
||||
|
||||
def check_http_connection (self):
|
||||
"""
|
||||
|
|
@ -205,7 +218,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
raise
|
||||
if tries == -1:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "already handled")
|
||||
return response, fallback_GET
|
||||
return None, fallback_GET
|
||||
if tries >= self.max_redirects:
|
||||
if self.method == "HEAD":
|
||||
# Microsoft servers tend to recurse HEAD requests
|
||||
|
|
@ -276,11 +289,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
if self.is_extern():
|
||||
self.add_info(
|
||||
_("Outside of domain filter, checked only syntax."))
|
||||
self.set_result(u"filtered")
|
||||
return -1, response
|
||||
# check robots.txt allowance again
|
||||
if not self.allows_robots(redirected):
|
||||
self.add_warning(
|
||||
_("Access denied by robots.txt, checked only syntax."))
|
||||
self.set_result(u"syntax OK")
|
||||
return -1, response
|
||||
# see about recursive redirect
|
||||
all_seen = [self.cache_url_key] + self.aliases
|
||||
|
|
@ -330,7 +345,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
tries += 1
|
||||
return tries, response
|
||||
|
||||
def check_response (self, response, fallback_GET):
|
||||
def check_response (self, response):
|
||||
"""
|
||||
Check final result and log it.
|
||||
"""
|
||||
|
|
@ -338,27 +353,17 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
self.set_result(u"%r %s" % (response.status, response.reason),
|
||||
valid=False)
|
||||
else:
|
||||
if self.headers and self.headers.has_key("Server"):
|
||||
server = self.headers['Server']
|
||||
else:
|
||||
server = _("unknown")
|
||||
if fallback_GET:
|
||||
self.add_info(_("Server %r did not support HEAD request; "\
|
||||
"a GET request was used instead.") % server)
|
||||
if self.no_anchor:
|
||||
self.add_warning(_("Server %r had no anchor support, removed"\
|
||||
" anchor from request.") % server)
|
||||
if response.status == 204:
|
||||
# no content
|
||||
self.add_warning(
|
||||
linkcheck.strformat.unicode_safe(response.reason))
|
||||
# store cookies for valid links
|
||||
if self.consumer.config['cookies']:
|
||||
if self.consumer.config('cookies'):
|
||||
for c in self.cookies:
|
||||
self.add_info(_("Store cookie: %s.") % c)
|
||||
try:
|
||||
out = self.consumer.cache.store_cookies(self.headers,
|
||||
self.urlparts[1])
|
||||
out = self.consumer.store_cookies(self.headers,
|
||||
self.urlparts[1])
|
||||
for h in out:
|
||||
self.add_info(linkcheck.strformat.unicode_safe(h))
|
||||
except Cookie.CookieError, msg:
|
||||
|
|
@ -414,9 +419,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
linkcheck.configuration.UserAgent)
|
||||
self.url_connection.putheader("Accept-Encoding",
|
||||
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
|
||||
if self.consumer.config['cookies']:
|
||||
self.cookies = self.consumer.cache.get_cookies(self.urlparts[1],
|
||||
self.urlparts[2])
|
||||
if self.consumer.config('cookies'):
|
||||
self.cookies = self.consumer.get_cookies(self.urlparts[1],
|
||||
self.urlparts[2])
|
||||
for c in self.cookies:
|
||||
self.url_connection.putheader("Cookie", c)
|
||||
self.url_connection.endheaders()
|
||||
|
|
@ -439,7 +444,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""
|
||||
_user, _password = self.get_user_password()
|
||||
key = (scheme, self.urlparts[1], _user, _password)
|
||||
conn = self.consumer.cache.get_connection(key)
|
||||
conn = self.consumer.get_connection(key)
|
||||
if conn is not None:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"reuse cached HTTP(S) connection %s", conn)
|
||||
|
|
@ -566,7 +571,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
# add to cached connections
|
||||
_user, _password = self.get_user_password()
|
||||
key = ("http", self.urlparts[1], _user, _password)
|
||||
cache_add = self.consumer.cache.add_connection
|
||||
cache_add = self.consumer.add_connection
|
||||
# note: only cache the connection when it is persistent
|
||||
# and all pending content has been received
|
||||
if not self.persistent or not self.has_content or \
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ class NntpUrl (urlbase.UrlBase):
|
|||
Connect to NNTP server and try to request the URL article
|
||||
resource (if specified).
|
||||
"""
|
||||
nntpserver = self.host or self.consumer.config["nntpserver"]
|
||||
nntpserver = self.host or self.consumer.config("nntpserver")
|
||||
if not nntpserver:
|
||||
self.add_warning(
|
||||
_("No NNTP server was specified, skipping this URL."))
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ class ProxySupport (object):
|
|||
"""
|
||||
Check if self.host is in the no-proxy-for ignore list.
|
||||
"""
|
||||
for ro in self.consumer.config["noproxyfor"]:
|
||||
for ro in self.consumer.config("noproxyfor"):
|
||||
if ro.search(self.host):
|
||||
return True
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ class TelnetUrl (urlbase.UrlBase):
|
|||
label is "login: ", expected password label is "Password: ".
|
||||
"""
|
||||
self.url_connection = telnetlib.Telnet()
|
||||
if self.consumer.config.get("debug"):
|
||||
if self.consumer.config("debug"):
|
||||
self.url_connection.set_debuglevel(1)
|
||||
self.url_connection.open(self.host, self.port)
|
||||
if self.user:
|
||||
|
|
|
|||
|
|
@ -140,8 +140,9 @@ class UrlBase (object):
|
|||
self.urlparts = None
|
||||
# the anchor part of url
|
||||
self.anchor = None
|
||||
# the result message string
|
||||
# the result message string and flag
|
||||
self.result = u""
|
||||
self.has_result = False
|
||||
# cached or not
|
||||
self.cached = False
|
||||
# valid or not
|
||||
|
|
@ -166,11 +167,17 @@ class UrlBase (object):
|
|||
# cache keys, are set by build_url() calling set_cache_keys()
|
||||
self.cache_url_key = None
|
||||
self.cache_content_key = None
|
||||
self.check_syntax()
|
||||
|
||||
def set_result (self, msg, valid=True):
|
||||
"""
|
||||
Set result string and validity.
|
||||
"""
|
||||
if self.has_result:
|
||||
linkcheck.log.warn(linkcheck.LOG_CHECK,
|
||||
"Double result %r (previous %r)", msg, self.result)
|
||||
else:
|
||||
self.has_result = True
|
||||
self.result = msg
|
||||
self.valid = valid
|
||||
|
||||
|
|
@ -245,8 +252,8 @@ class UrlBase (object):
|
|||
linkcheck.log.debug(linkcheck.LOG_CACHE, "Content cache key %r",
|
||||
self.cache_content_key)
|
||||
# construct cache key
|
||||
if self.consumer.config["anchorcaching"] and \
|
||||
self.consumer.config["anchors"]:
|
||||
if self.consumer.config("anchorcaching") and \
|
||||
self.consumer.config("anchors"):
|
||||
# do not ignore anchor
|
||||
parts = self.urlparts[:]
|
||||
parts[4] = self.anchor
|
||||
|
|
@ -271,7 +278,7 @@ class UrlBase (object):
|
|||
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
|
||||
if not self.base_url:
|
||||
self.set_result(_("URL is empty"), valid=False)
|
||||
return False
|
||||
return
|
||||
try:
|
||||
self.build_url()
|
||||
# check url warnings
|
||||
|
|
@ -282,10 +289,9 @@ class UrlBase (object):
|
|||
except linkcheck.LinkCheckerError, msg:
|
||||
self.set_result(linkcheck.strformat.unicode_safe(msg),
|
||||
valid=False)
|
||||
return False
|
||||
return
|
||||
self.set_cache_keys()
|
||||
self.extern = self._get_extern(self.url)
|
||||
return True
|
||||
|
||||
def build_url (self):
|
||||
"""
|
||||
|
|
@ -338,7 +344,7 @@ class UrlBase (object):
|
|||
"""
|
||||
Main check function for checking this URL.
|
||||
"""
|
||||
if self.consumer.config["trace"]:
|
||||
if self.consumer.config("trace"):
|
||||
linkcheck.log.trace()
|
||||
try:
|
||||
self.local_check()
|
||||
|
|
@ -372,11 +378,11 @@ class UrlBase (object):
|
|||
Local check function can be overridden in subclasses.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
|
||||
if self.recursion_level and self.consumer.config['wait']:
|
||||
if self.recursion_level and self.consumer.config('wait'):
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"sleeping for %d seconds",
|
||||
self.consumer.config['wait'])
|
||||
time.sleep(self.consumer.config['wait'])
|
||||
self.consumer.config('wait'))
|
||||
time.sleep(self.consumer.config('wait'))
|
||||
t = time.time()
|
||||
if self.is_extern():
|
||||
self.add_info(_("Outside of domain filter, checked only syntax."))
|
||||
|
|
@ -387,7 +393,7 @@ class UrlBase (object):
|
|||
try:
|
||||
self.check_connection()
|
||||
self.add_country_info()
|
||||
if self.consumer.config["anchors"]:
|
||||
if self.consumer.config("anchors"):
|
||||
self.check_anchors()
|
||||
except tuple(linkcheck.checker.ExcList):
|
||||
etype, evalue, etb = sys.exc_info()
|
||||
|
|
@ -403,7 +409,7 @@ class UrlBase (object):
|
|||
valid=False)
|
||||
|
||||
# check content
|
||||
warningregex = self.consumer.config["warningregex"]
|
||||
warningregex = self.consumer.config("warningregex")
|
||||
if warningregex and self.valid:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content")
|
||||
try:
|
||||
|
|
@ -469,8 +475,8 @@ class UrlBase (object):
|
|||
return self.valid and \
|
||||
self.is_parseable() and \
|
||||
self.can_get_content() and \
|
||||
(self.consumer.config["recursionlevel"] < 0 or
|
||||
self.recursion_level < self.consumer.config["recursionlevel"]) and \
|
||||
(self.consumer.config("recursionlevel") < 0 or
|
||||
self.recursion_level < self.consumer.config("recursionlevel")) and \
|
||||
not self.extern[0] and self.content_allows_robots()
|
||||
|
||||
def content_allows_robots (self):
|
||||
|
|
@ -533,13 +539,13 @@ class UrlBase (object):
|
|||
@return: a tuple (is_extern, is_strict)
|
||||
@rtype: tuple (bool, bool)
|
||||
"""
|
||||
for entry in self.consumer.config["externlinks"]:
|
||||
for entry in self.consumer.config("externlinks"):
|
||||
match = entry['pattern'].search(url)
|
||||
if (entry['negate'] and not match) or \
|
||||
(match and not entry['negate']):
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern URL %r", url)
|
||||
return (1, entry['strict'])
|
||||
for entry in self.consumer.config["internlinks"]:
|
||||
for entry in self.consumer.config("internlinks"):
|
||||
match = entry['pattern'].search(url)
|
||||
if (entry['negate'] and not match) or \
|
||||
(match and not entry['negate']):
|
||||
|
|
@ -582,7 +588,7 @@ class UrlBase (object):
|
|||
If a maximum size was given, call this function to check it
|
||||
against the content size of this url.
|
||||
"""
|
||||
maxbytes = self.consumer.config["warnsizebytes"]
|
||||
maxbytes = self.consumer.config("warnsizebytes")
|
||||
if maxbytes is not None and self.dlsize >= maxbytes:
|
||||
self.add_warning(_("Content size %s is larger than %s.") % \
|
||||
(linkcheck.strformat.strsize(self.dlsize),
|
||||
|
|
@ -602,7 +608,7 @@ class UrlBase (object):
|
|||
Get tuple (user, password) from configured authentication.
|
||||
Both user and password can be None if not specified.
|
||||
"""
|
||||
for auth in self.consumer.config["authentication"]:
|
||||
for auth in self.consumer.config("authentication"):
|
||||
if auth['pattern'].match(self.url):
|
||||
return auth['user'], auth['password']
|
||||
return None, None
|
||||
|
|
@ -728,7 +734,7 @@ class UrlBase (object):
|
|||
@rtype: string
|
||||
"""
|
||||
s = self.serialized()
|
||||
return self.consumer.config['logger'].encode(s)
|
||||
return self.consumer.config('logger').encode(s)
|
||||
|
||||
def __repr__ (self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ Simple decorators (usable in Python >= 2.4).
|
|||
import warnings
|
||||
import signal
|
||||
import os
|
||||
import thread
|
||||
|
||||
def deprecated (func):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -163,9 +163,9 @@ class StandardTest (unittest.TestCase):
|
|||
url, 0, consumer, cmdline=cmdline)
|
||||
consumer.append_url(url_data)
|
||||
linkcheck.checker.check_urls(consumer)
|
||||
if consumer.config['logger'].diff:
|
||||
if consumer.config('logger').diff:
|
||||
sep = unicode(os.linesep)
|
||||
l = [url] + consumer.config['logger'].diff
|
||||
l = [url] + consumer.config('logger').diff
|
||||
l = sep.join(l)
|
||||
self.fail(l.encode("iso8859-1", "ignore"))
|
||||
|
||||
|
|
@ -187,8 +187,8 @@ class StandardTest (unittest.TestCase):
|
|||
url, 0, consumer, cmdline=cmdline)
|
||||
consumer.append_url(url_data)
|
||||
linkcheck.checker.check_urls(consumer)
|
||||
if consumer.config['logger'].diff:
|
||||
if consumer.config('logger').diff:
|
||||
sep = unicode(os.linesep)
|
||||
l = [url] + consumer.config['logger'].diff
|
||||
l = [url] + consumer.config('logger').diff
|
||||
l = sep.join(l)
|
||||
self.fail(l.encode("iso8859-1", "ignore"))
|
||||
|
|
|
|||
|
|
@ -145,9 +145,9 @@ def get_locale ():
|
|||
loc = None
|
||||
try:
|
||||
loc = locale.getdefaultlocale()[0]
|
||||
except ValueError:
|
||||
# workaround (XXX delete this when python2.5 is fixed)
|
||||
pass
|
||||
except ValueError, msg:
|
||||
# workaround for XXX
|
||||
print >>sys.stderr, "WARNING", msg
|
||||
if loc is None:
|
||||
return 'C'
|
||||
loc = locale.normalize(loc)
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import cStringIO as StringIO
|
|||
import linecache
|
||||
import sys
|
||||
import re
|
||||
import time
|
||||
try:
|
||||
import thread as _thread
|
||||
except ImportError:
|
||||
|
|
@ -71,8 +72,8 @@ def _traceit (frame, event, arg):
|
|||
if filename.endswith(".pyc") or filename.endswith(".pyo"):
|
||||
filename = filename[:-1]
|
||||
line = linecache.getline(filename, lineno)
|
||||
print "THREAD(%d) %s:%d: %s" % \
|
||||
(_thread.get_ident(), name, lineno, line.rstrip())
|
||||
print "THREAD(%d) %.2f %s:%d: %s" % \
|
||||
(_thread.get_ident(), time.time(), name, lineno, line.rstrip())
|
||||
return _traceit
|
||||
|
||||
def trace ():
|
||||
|
|
|
|||
Loading…
Reference in a new issue