mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-24 16:14:45 +00:00
documentation
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2160 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
46de8b6ca0
commit
b008747f39
14 changed files with 326 additions and 139 deletions
|
|
@ -32,7 +32,9 @@ import linkcheck.threader
|
|||
|
||||
|
||||
def _check_morsel (m, host, path):
|
||||
"""check given cookie morsel against the desired host and path"""
|
||||
"""
|
||||
Check given cookie morsel against the desired host and path.
|
||||
"""
|
||||
# check domain (if it's stored)
|
||||
if m["domain"] and not host.endswith(m["domain"]):
|
||||
return None
|
||||
|
|
@ -48,15 +50,18 @@ def _check_morsel (m, host, path):
|
|||
|
||||
|
||||
class Cache (object):
|
||||
"""Store and provide routines for cached data. Currently there are
|
||||
caches for cookies, checked urls, FTP connections and robots.txt
|
||||
contents.
|
||||
"""
|
||||
Store and provide routines for cached data. Currently there are
|
||||
caches for cookies, checked urls, FTP connections and robots.txt
|
||||
contents.
|
||||
|
||||
All public operations (except __init__()) are thread-safe.
|
||||
All public operations (except __init__()) are thread-safe.
|
||||
"""
|
||||
|
||||
def __init__ (self):
|
||||
"""Initialize the default options"""
|
||||
"""
|
||||
Initialize the default options.
|
||||
"""
|
||||
# one big lock for all caches and queues
|
||||
self.lock = threading.Lock()
|
||||
# already checked urls
|
||||
|
|
@ -81,9 +86,11 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def incoming_get_url (self):
|
||||
"""Get first not-in-progress url from the incoming queue and
|
||||
return it. If no such url is available return None. The
|
||||
url might be already cached."""
|
||||
"""
|
||||
Get first not-in-progress url from the incoming queue and
|
||||
return it. If no such url is available return None. The
|
||||
url might be already cached.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
for i, url_data in enumerate(self.incoming):
|
||||
|
|
@ -102,7 +109,9 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def incoming_len (self):
|
||||
"""return number of entries in incoming queue"""
|
||||
"""
|
||||
Return number of entries in incoming queue.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
return len(self.incoming)
|
||||
|
|
@ -110,7 +119,9 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def incoming_add (self, url_data):
|
||||
"""add a new URL to list of URLs to check"""
|
||||
"""
|
||||
Add a new URL to list of URLs to check.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE,
|
||||
|
|
@ -171,10 +182,12 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def checked_redirect (self, redirect, url_data):
|
||||
"""Check if redirect is already in cache. Used for URL redirections
|
||||
to avoid double checking of already cached URLs.
|
||||
If the redirect URL is found in the cache, the result data is
|
||||
already copied."""
|
||||
"""
|
||||
Check if redirect is already in cache. Used for URL redirections
|
||||
to avoid double checking of already cached URLs.
|
||||
If the redirect URL is found in the cache, the result data is
|
||||
already copied.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
if redirect in self.checked:
|
||||
|
|
@ -185,7 +198,9 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def robots_txt_allows_url (self, roboturl, url, user, password):
|
||||
"""ask robots.txt allowance"""
|
||||
"""
|
||||
Ask robots.txt allowance.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
if roboturl not in self.robots_txt:
|
||||
|
|
@ -201,8 +216,9 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def get_ftp_connection (self, host, username, password):
|
||||
"""Get open FTP connection to given host. Return None if no such
|
||||
connection is available.
|
||||
"""
|
||||
Get open FTP connection to given host. Return None if no such
|
||||
connection is available.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
|
|
@ -218,7 +234,9 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def add_ftp_connection (self, host, username, password, conn):
|
||||
"""Store open FTP connection into cache for reuse."""
|
||||
"""
|
||||
Store open FTP connection into cache for reuse.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
key = (host, username, password)
|
||||
|
|
@ -230,7 +248,9 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def release_ftp_connection (self, host, username, password):
|
||||
"""Store open FTP connection into cache for reuse."""
|
||||
"""
|
||||
Release the FTP connection for the given host, username and password.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
key = (host, username, password)
|
||||
|
|
@ -239,8 +259,9 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def store_cookies (self, headers, host):
|
||||
"""Thread-safe cookie cache setter function. Can raise the
|
||||
exception Cookie.CookieError.
|
||||
"""
|
||||
Thread-safe cookie cache setter function. Can raise the
|
||||
exception Cookie.CookieError.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
|
|
@ -255,7 +276,9 @@ class Cache (object):
|
|||
self.lock.release()
|
||||
|
||||
def get_cookies (self, host, path):
|
||||
"""Thread-safe cookie cache getter function."""
|
||||
"""
|
||||
Thread-safe cookie cache getter function.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
linkcheck.log.debug(linkcheck.LOG_CACHE,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""url consumer class"""
|
||||
"""
|
||||
Url consumer class.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -28,10 +30,14 @@ import linkcheck.log
|
|||
from urlbase import stderr
|
||||
|
||||
class Consumer (object):
|
||||
"""consume urls from the url queue in a threaded manner"""
|
||||
"""
|
||||
Consume urls from the url queue in a threaded manner.
|
||||
"""
|
||||
|
||||
def __init__ (self, config, cache):
|
||||
"""initialize consumer data and threads"""
|
||||
"""
|
||||
Initialize consumer data and threads.
|
||||
"""
|
||||
self.config = config
|
||||
self.cache = cache
|
||||
self.threader = linkcheck.threader.Threader()
|
||||
|
|
@ -47,7 +53,9 @@ class Consumer (object):
|
|||
self.warnings = False
|
||||
|
||||
def _set_threads (self, num):
|
||||
"""set number of checker threads to start"""
|
||||
"""
|
||||
Set number of checker threads to start.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"set threading with %d threads", num)
|
||||
self.threader.threads_max = num
|
||||
|
|
@ -57,7 +65,9 @@ class Consumer (object):
|
|||
sys.setcheckinterval(100)
|
||||
|
||||
def append_url (self, url_data):
|
||||
"""append url to incoming check list"""
|
||||
"""
|
||||
Append url to incoming check list.
|
||||
"""
|
||||
if not self.cache.incoming_add(url_data):
|
||||
# can be logged
|
||||
self.logger_new_url(url_data)
|
||||
|
|
@ -78,7 +88,9 @@ class Consumer (object):
|
|||
self.threader.start_thread(url_data.check, ())
|
||||
|
||||
def checked (self, url_data):
|
||||
"""put checked url in cache and log it"""
|
||||
"""
|
||||
Put checked url in cache and log it.
|
||||
"""
|
||||
# log before putting it in the cache (otherwise we would see
|
||||
# a "(cached)" after every url)
|
||||
self.logger_new_url(url_data)
|
||||
|
|
@ -88,11 +100,15 @@ class Consumer (object):
|
|||
self.cache.in_progress_remove(url_data)
|
||||
|
||||
def interrupted (self, url_data):
|
||||
"""remove url from active list"""
|
||||
"""
|
||||
Remove url from active list.
|
||||
"""
|
||||
self.cache.in_progress_remove(url_data)
|
||||
|
||||
def finished (self):
|
||||
"""return True if checking is finished"""
|
||||
"""
|
||||
Return True if checking is finished.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
return self.threader.finished() and \
|
||||
|
|
@ -101,7 +117,9 @@ class Consumer (object):
|
|||
self.lock.release()
|
||||
|
||||
def no_more_threads (self):
|
||||
"""return True if no more active threads are running"""
|
||||
"""
|
||||
Return True if no more active threads are running.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
return self.threader.finished()
|
||||
|
|
@ -109,7 +127,9 @@ class Consumer (object):
|
|||
self.lock.release()
|
||||
|
||||
def abort (self):
|
||||
"""abort checking and send of-of-output message to logger"""
|
||||
"""
|
||||
Abort checking and send end-of-output message to logger.
|
||||
"""
|
||||
while not self.no_more_threads():
|
||||
linkcheck.log.warn(linkcheck.LOG_CHECK,
|
||||
_("keyboard interrupt; waiting for %d active threads to finish"),
|
||||
|
|
@ -123,7 +143,9 @@ class Consumer (object):
|
|||
self.logger_end_output()
|
||||
|
||||
def print_status (self, curtime, start_time):
|
||||
"""print check status looking at url queues"""
|
||||
"""
|
||||
Print check status looking at url queues.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
active = self.threader.active_threads()
|
||||
|
|
@ -137,7 +159,9 @@ class Consumer (object):
|
|||
self.lock.release()
|
||||
|
||||
def logger_start_output (self):
|
||||
"""start output of all configured loggers"""
|
||||
"""
|
||||
Start output of all configured loggers.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
self.logger.start_output()
|
||||
|
|
@ -147,7 +171,9 @@ class Consumer (object):
|
|||
self.lock.release()
|
||||
|
||||
def logger_new_url (self, url_data):
|
||||
"""send new url to all configured loggers"""
|
||||
"""
|
||||
Send new url to all configured loggers.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
self.linknumber += 1
|
||||
|
|
@ -168,7 +194,9 @@ class Consumer (object):
|
|||
# self.filter_queue(self)
|
||||
|
||||
def logger_end_output (self):
|
||||
"""end output of all configured loggers"""
|
||||
"""
|
||||
End output of all configured loggers.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
self.logger.end_output(linknumber=self.linknumber)
|
||||
|
|
@ -178,7 +206,9 @@ class Consumer (object):
|
|||
self.lock.release()
|
||||
|
||||
def active_threads (self):
|
||||
"""return number of active threads"""
|
||||
"""
|
||||
Return number of active threads.
|
||||
"""
|
||||
self.lock.acquire()
|
||||
try:
|
||||
return self.threader.active_threads()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle for unknown links"""
|
||||
"""
|
||||
Handle for unknown links.
|
||||
"""
|
||||
# Copyright (C) 2001-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -21,7 +23,9 @@ import urlbase
|
|||
import linkcheck
|
||||
|
||||
class ErrorUrl (urlbase.UrlBase):
|
||||
"""Unknown URL links"""
|
||||
"""
|
||||
Unknown URL links.
|
||||
"""
|
||||
|
||||
def check_syntax (self):
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
|
||||
|
|
@ -31,6 +35,8 @@ class ErrorUrl (urlbase.UrlBase):
|
|||
return False
|
||||
|
||||
def set_cache_keys (self):
|
||||
"""cache key is forbidden"""
|
||||
"""
|
||||
Cache key is forbidden.
|
||||
"""
|
||||
raise NotImplementedError, "cache keys are forbidden"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle local file: links"""
|
||||
"""
|
||||
Handle local file: links.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -37,8 +39,9 @@ contents = {
|
|||
|
||||
|
||||
def get_files (dirname):
|
||||
"""Get lists of files in directory. Does only allow regular files
|
||||
and directories, no symlinks.
|
||||
"""
|
||||
Get lists of files in directory. Does only allow regular files
|
||||
and directories, no symlinks.
|
||||
"""
|
||||
files = []
|
||||
for entry in os.listdir(dirname):
|
||||
|
|
@ -51,7 +54,9 @@ def get_files (dirname):
|
|||
|
||||
|
||||
def get_nt_filename (path):
|
||||
"""return case sensitive filename for NT path"""
|
||||
"""
|
||||
Return case sensitive filename for NT path.
|
||||
"""
|
||||
head, tail = os.path.split(path)
|
||||
if not tail:
|
||||
return path
|
||||
|
|
@ -64,7 +69,9 @@ def get_nt_filename (path):
|
|||
|
||||
|
||||
class FileUrl (urlbase.UrlBase):
|
||||
"Url link with file scheme"
|
||||
"""
|
||||
Url link with file scheme.
|
||||
"""
|
||||
|
||||
def __init__ (self, base_url, recursion_level, consumer,
|
||||
parent_url = None,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle FTP links"""
|
||||
"""
|
||||
Handle FTP links.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -28,7 +30,9 @@ import httpurl
|
|||
|
||||
|
||||
class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
||||
"""Url link with ftp scheme."""
|
||||
"""
|
||||
Url link with ftp scheme.
|
||||
"""
|
||||
|
||||
def __init__ (self, base_url, recursion_level, consumer,
|
||||
parent_url = None,
|
||||
|
|
@ -75,7 +79,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
return super(FtpUrl, self).get_user_password()
|
||||
|
||||
def login (self):
|
||||
"""log into ftp server and check the welcome message"""
|
||||
"""
|
||||
Log into ftp server and check the welcome message.
|
||||
"""
|
||||
_user, _password = self.get_user_password()
|
||||
# ready to connect
|
||||
conn = self.consumer.cache.get_ftp_connection(
|
||||
|
|
@ -111,8 +117,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
self.urlparts[1], _user, _password, self.url_connection)
|
||||
|
||||
def cwd (self):
|
||||
"""Change to URL parent directory. Return filename of last path
|
||||
component.
|
||||
"""
|
||||
Change to URL parent directory. Return filename of last path
|
||||
component.
|
||||
"""
|
||||
dirname = self.urlparts[2].strip('/')
|
||||
dirs = dirname.split('/')
|
||||
|
|
@ -123,7 +130,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
return filename
|
||||
|
||||
def listfile (self):
|
||||
"""see if filename is in the current FTP directory"""
|
||||
"""
|
||||
See if filename is in the current FTP directory.
|
||||
"""
|
||||
if not self.filename:
|
||||
return
|
||||
files = self.get_files()
|
||||
|
|
@ -141,8 +150,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
raise ftplib.error_perm, "550 File not found"
|
||||
|
||||
def get_files (self):
|
||||
"""Get list of filenames in directory. Subdirectories have an
|
||||
ending slash.
|
||||
"""
|
||||
Get list of filenames in directory. Subdirectories have an
|
||||
ending slash.
|
||||
"""
|
||||
# Rudimentary LIST output parsing. An entry is assumed to have
|
||||
# the following form:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle Gopher links"""
|
||||
"""
|
||||
Handle Gopher links.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -19,5 +21,7 @@
|
|||
import urlbase
|
||||
|
||||
class GopherUrl (urlbase.UrlBase):
|
||||
"Url link with gopher scheme"
|
||||
"""
|
||||
Url link with gopher scheme.
|
||||
"""
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle https links"""
|
||||
"""
|
||||
Handle https links.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -19,10 +21,14 @@
|
|||
import httpurl
|
||||
|
||||
class HttpsUrl (httpurl.HttpUrl):
|
||||
"""Url link with https scheme"""
|
||||
"""
|
||||
Url link with https scheme.
|
||||
"""
|
||||
|
||||
def local_check (self):
|
||||
"""check connection if SSL is supported, else ignore"""
|
||||
"""
|
||||
Check connection if SSL is supported, else ignore.
|
||||
"""
|
||||
if httpurl.supportHttps:
|
||||
super(HttpsUrl, self).local_check()
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle http links"""
|
||||
"""
|
||||
Handle http links.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -43,11 +45,15 @@ _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
|
|||
|
||||
|
||||
class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
||||
"Url link with http scheme"
|
||||
"""
|
||||
Url link with http scheme.
|
||||
"""
|
||||
|
||||
def __init__ (self, base_url, recursion_level, consumer,
|
||||
parent_url=None, base_ref=None, line=0, column=0, name=u""):
|
||||
"""initialize basic url data and HTTP specific variables"""
|
||||
"""
|
||||
Initialize basic url data and HTTP specific variables.
|
||||
"""
|
||||
super(HttpUrl, self).__init__(base_url, recursion_level, consumer,
|
||||
parent_url=parent_url, base_ref=base_ref, line=line,
|
||||
column=column, name=name)
|
||||
|
|
@ -348,8 +354,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
self.add_info(_("Last modified %s.") % modified)
|
||||
|
||||
def _get_http_response (self):
|
||||
"""Put request and return (status code, status text, mime object).
|
||||
Host can be host:port format.
|
||||
"""
|
||||
Put request and return (status code, status text, mime object).
|
||||
Host can be host:port format.
|
||||
"""
|
||||
if self.proxy:
|
||||
host = self.proxy
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle for uncheckable application-specific links"""
|
||||
"""
|
||||
Handle for uncheckable application-specific links.
|
||||
"""
|
||||
# Copyright (C) 2001-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -19,7 +21,9 @@
|
|||
import urlbase
|
||||
|
||||
class IgnoredUrl (urlbase.UrlBase):
|
||||
"""Some schemes are defined in http://www.w3.org/Addressing/schemes"""
|
||||
"""
|
||||
Some schemes are defined in <http://www.w3.org/Addressing/schemes>.
|
||||
"""
|
||||
|
||||
def local_check (self):
|
||||
self.add_warning(_("%s URL ignored.") % self.scheme.capitalize())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle for mailto: links"""
|
||||
"""
|
||||
Handle for mailto: links.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -29,7 +31,9 @@ import linkcheck.dns.resolver
|
|||
|
||||
|
||||
class MailtoUrl (urlbase.UrlBase):
|
||||
"""Url link with mailto scheme"""
|
||||
"""
|
||||
Url link with mailto scheme.
|
||||
"""
|
||||
|
||||
def build_url (self):
|
||||
super(MailtoUrl, self).build_url()
|
||||
|
|
@ -83,7 +87,8 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
return addrs
|
||||
|
||||
def check_connection (self):
|
||||
"""Verify a list of email addresses. If one address fails,
|
||||
"""
|
||||
Verify a list of email addresses. If one address fails,
|
||||
the whole list will fail.
|
||||
For each mail address we check the following things:
|
||||
(1) Look up the MX DNS records. If we found no MX record,
|
||||
|
|
@ -103,7 +108,9 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
|
||||
|
||||
def check_smtp_domain (self, name, mail):
|
||||
"""Check a single mail address"""
|
||||
"""
|
||||
Check a single mail address.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"checking mail address %r", mail)
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "splitting address")
|
||||
|
|
@ -131,7 +138,9 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
self.check_smtp_connect(mxdata, username)
|
||||
|
||||
def check_smtp_connect (self, mxdata, username):
|
||||
"""mxdata is a list of (preference, host) tuples to check for"""
|
||||
"""
|
||||
mxdata is a list of (preference, host) tuples to check.
|
||||
"""
|
||||
smtpconnect = 0
|
||||
for preference, host in mxdata:
|
||||
try:
|
||||
|
|
@ -180,7 +189,9 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
_("Could not split the mail address"))
|
||||
|
||||
def close_connection (self):
|
||||
"""close a possibly opened SMTP connection"""
|
||||
"""
|
||||
Close a possibly opened SMTP connection.
|
||||
"""
|
||||
if self.url_connection is None:
|
||||
# no connection is open
|
||||
return
|
||||
|
|
@ -191,7 +202,9 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
self.url_connection = None
|
||||
|
||||
def set_cache_keys (self):
|
||||
"""The cache key is a comma separated list of emails."""
|
||||
"""
|
||||
The cache key is a comma separated list of emails.
|
||||
"""
|
||||
emails = [addr[1] for addr in self.addresses]
|
||||
emails.sort()
|
||||
self.cache_url_key = u"%s:%s" % (self.scheme, u",".join(emails))
|
||||
|
|
@ -199,7 +212,9 @@ class MailtoUrl (urlbase.UrlBase):
|
|||
# cache_content_key remains None, recursion is not allowed
|
||||
|
||||
def can_get_content (self):
|
||||
"""mailto: URLs do not have any content
|
||||
@return False
|
||||
"""
|
||||
mailto: URLs do not have any content
|
||||
|
||||
@return: C{False}
|
||||
"""
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle nntp: and news: links"""
|
||||
"""
|
||||
Handle nntp: and news: links.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -31,11 +33,14 @@ import linkcheck.log
|
|||
random.seed()
|
||||
|
||||
class NoNetrcNNTP (nntplib.NNTP):
|
||||
"""NNTP class ignoring possible entries in ~/.netrc"""
|
||||
"""
|
||||
NNTP class ignoring possible entries in ~/.netrc.
|
||||
"""
|
||||
|
||||
def __init__ (self, host, port=nntplib.NNTP_PORT, user=None,
|
||||
password=None, readermode=None):
|
||||
"""Initialize an instance. Arguments:
|
||||
"""
|
||||
Initialize an instance. Arguments:
|
||||
- host: hostname to connect to
|
||||
- port: port to connect to (default the standard NNTP port)
|
||||
- user: username to authenticate with
|
||||
|
|
@ -95,7 +100,9 @@ class NoNetrcNNTP (nntplib.NNTP):
|
|||
|
||||
|
||||
class NntpUrl (urlbase.UrlBase):
|
||||
"""Url link with NNTP scheme"""
|
||||
"""
|
||||
Url link with NNTP scheme.
|
||||
"""
|
||||
|
||||
def check_connection (self):
|
||||
nntpserver = self.host or self.consumer.config["nntpserver"]
|
||||
|
|
@ -124,9 +131,11 @@ class NntpUrl (urlbase.UrlBase):
|
|||
self.add_warning(_("No newsgroup specified in NNTP URL."))
|
||||
|
||||
def _connectNntp (self, nntpserver):
|
||||
"""This is done only once per checking task. Also, the newly
|
||||
"""
|
||||
This is done only once per checking task. Also, the newly
|
||||
introduced error codes 504 and 505 (both indicating "Too busy, retry
|
||||
later", are caught."""
|
||||
later") are caught.
|
||||
"""
|
||||
tries = 0
|
||||
nntp = value = None
|
||||
while tries < 5:
|
||||
|
|
|
|||
|
|
@ -18,10 +18,14 @@
|
|||
import urllib
|
||||
|
||||
class ProxySupport (object):
|
||||
"""get support for proxying and for urls with user:pass@host setting"""
|
||||
"""
|
||||
Get support for proxying and for urls with user:pass@host setting.
|
||||
"""
|
||||
|
||||
def set_proxy (self, proxy):
|
||||
"""parse given proxy information and store parsed values"""
|
||||
"""
|
||||
Parse given proxy information and store parsed values.
|
||||
"""
|
||||
self.proxy = proxy
|
||||
self.proxyauth = None
|
||||
if self.proxy:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Handle telnet: links"""
|
||||
"""
|
||||
Handle telnet: links.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -25,7 +27,9 @@ import urlbase
|
|||
|
||||
|
||||
class TelnetUrl (urlbase.UrlBase):
|
||||
"""Url link with telnet scheme"""
|
||||
"""
|
||||
Url link with telnet scheme.
|
||||
"""
|
||||
|
||||
def build_url (self):
|
||||
super(TelnetUrl, self).build_url()
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Base URL handler"""
|
||||
"""
|
||||
Base URL handler.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -40,7 +42,9 @@ import linkcheck.HtmlParser.htmlsax
|
|||
stderr = codecs.getwriter("iso8859-1")(sys.stderr, errors="ignore")
|
||||
|
||||
def internal_error ():
|
||||
"""print internal error message to stderr"""
|
||||
"""
|
||||
Print internal error message to stderr.
|
||||
"""
|
||||
print >> stderr, os.linesep
|
||||
print >> stderr, _("""********** Oops, I did it again. *************
|
||||
|
||||
|
|
@ -66,7 +70,9 @@ I can work with ;) .
|
|||
|
||||
|
||||
def print_app_info ():
|
||||
"""print system and application info to stderr"""
|
||||
"""
|
||||
Print system and application info to stderr.
|
||||
"""
|
||||
print >> stderr, _("System info:")
|
||||
print >> stderr, linkcheck.configuration.App
|
||||
print >> stderr, _("Python %s on %s") % (sys.version, sys.platform)
|
||||
|
|
@ -83,21 +89,24 @@ def urljoin (parent, url, scheme):
|
|||
|
||||
|
||||
class UrlBase (object):
|
||||
"""An URL with additional information like validity etc."""
|
||||
"""
|
||||
An URL with additional information like validity etc.
|
||||
"""
|
||||
|
||||
def __init__ (self, base_url, recursion_level, consumer,
|
||||
parent_url = None, base_ref = None,
|
||||
line = -1, column = -1, name = u""):
|
||||
"""Initialize check data, and store given variables.
|
||||
"""
|
||||
Initialize check data, and store given variables.
|
||||
|
||||
@base_url - unquoted and possibly unnormed url
|
||||
@recursion_level - on what check level lies the base url
|
||||
@config - Configuration instance
|
||||
@parent_url - quoted and normed url of parent or None
|
||||
@base_ref - quoted and normed url of <base href=""> or None
|
||||
@line - line number of url in parent content
|
||||
@column - column number of url in parent content
|
||||
@name - name of url or empty
|
||||
@base_url - unquoted and possibly unnormed url
|
||||
@recursion_level - on what check level lies the base url
|
||||
@consumer - url consumer (gives access to configuration and cache)
|
||||
@parent_url - quoted and normed url of parent or None
|
||||
@base_ref - quoted and normed url of <base href=""> or None
|
||||
@line - line number of url in parent content
|
||||
@column - column number of url in parent content
|
||||
@name - name of url or empty
|
||||
"""
|
||||
self.base_ref = base_ref
|
||||
# note that self.base_url must not be modified
|
||||
|
|
@ -157,36 +166,52 @@ class UrlBase (object):
|
|||
self.aliases = []
|
||||
|
||||
def set_result (self, msg, valid=True):
|
||||
"""set result string and validity"""
|
||||
"""
|
||||
Set result string and validity.
|
||||
"""
|
||||
self.result = msg
|
||||
self.valid = valid
|
||||
|
||||
def is_parseable (self):
|
||||
"""return True iff content of this url is parseable"""
|
||||
"""
|
||||
Return True iff content of this url is parseable.
|
||||
"""
|
||||
return False
|
||||
|
||||
def is_html (self):
|
||||
"""return True iff content of this url is HTML formatted"""
|
||||
"""
|
||||
Return True iff content of this url is HTML formatted.
|
||||
"""
|
||||
return False
|
||||
|
||||
def is_http (self):
|
||||
"""return True for http:// URLs"""
|
||||
"""
|
||||
Return True for http:// URLs.
|
||||
"""
|
||||
return False
|
||||
|
||||
def is_file (self):
|
||||
"""return True for file:// URLs"""
|
||||
"""
|
||||
Return True for file:// URLs.
|
||||
"""
|
||||
return False
|
||||
|
||||
def add_warning (self, s):
|
||||
"""add a warning string"""
|
||||
"""
|
||||
Add a warning string.
|
||||
"""
|
||||
self.warning.append(s)
|
||||
|
||||
def add_info (self, s):
|
||||
"""add an info string"""
|
||||
"""
|
||||
Add an info string.
|
||||
"""
|
||||
self.info.append(s)
|
||||
|
||||
def copy_from_cache (self, cache_data):
|
||||
"""fill attributes from cache data"""
|
||||
"""
|
||||
Fill attributes from cache data.
|
||||
"""
|
||||
self.result = cache_data["result"]
|
||||
self.warning.extend(cache_data["warning"])
|
||||
self.info.extend(cache_data["info"])
|
||||
|
|
@ -196,7 +221,9 @@ class UrlBase (object):
|
|||
self.cached = True
|
||||
|
||||
def get_cache_data (self):
|
||||
"""return all data values that should be put in the cache"""
|
||||
"""
|
||||
Return all data values that should be put in the cache.
|
||||
"""
|
||||
return {"result": self.result,
|
||||
"warning": self.warning,
|
||||
"info": self.info,
|
||||
|
|
@ -206,7 +233,9 @@ class UrlBase (object):
|
|||
}
|
||||
|
||||
def set_cache_keys (self):
|
||||
"""Set keys for URL checking and content recursion."""
|
||||
"""
|
||||
Set keys for URL checking and content recursion.
|
||||
"""
|
||||
# remove anchor from content cache key since we assume
|
||||
# URLs with different anchors to have the same content
|
||||
self.cache_content_key = urlparse.urlunsplit(self.urlparts[:4]+[u''])
|
||||
|
|
@ -230,10 +259,11 @@ class UrlBase (object):
|
|||
self.cache_url_key)
|
||||
|
||||
def check_syntax (self):
|
||||
"""Called before self.check(), this function inspects the
|
||||
url syntax. Success enables further checking, failure
|
||||
immediately logs this url. Syntax checks must not
|
||||
use any network resources.
|
||||
"""
|
||||
Called before self.check(), this function inspects the
|
||||
url syntax. Success enables further checking, failure
|
||||
immediately logs this url. Syntax checks must not
|
||||
use any network resources.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
|
||||
if not self.base_url:
|
||||
|
|
@ -249,8 +279,9 @@ class UrlBase (object):
|
|||
return True
|
||||
|
||||
def build_url (self):
|
||||
"""Construct self.url and self.urlparts out of the given base
|
||||
url information self.base_url, self.parent_url and self.base_ref.
|
||||
"""
|
||||
Construct self.url and self.urlparts out of the given base
|
||||
url information self.base_url, self.parent_url and self.base_ref.
|
||||
"""
|
||||
# norm base url
|
||||
base_url, is_idn = linkcheck.url.url_norm(self.base_url)
|
||||
|
|
@ -292,7 +323,9 @@ class UrlBase (object):
|
|||
self.port = int(self.port)
|
||||
|
||||
def check (self):
|
||||
"""main check function for checking this URL"""
|
||||
"""
|
||||
Main check function for checking this URL.
|
||||
"""
|
||||
try:
|
||||
self.local_check()
|
||||
self.consumer.checked(self)
|
||||
|
|
@ -313,7 +346,9 @@ class UrlBase (object):
|
|||
internal_error()
|
||||
|
||||
def local_check (self):
|
||||
"""local check function can be overridden in subclasses"""
|
||||
"""
|
||||
Local check function can be overridden in subclasses.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
|
||||
if self.recursion_level and self.consumer.config['wait']:
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
|
|
@ -375,7 +410,9 @@ class UrlBase (object):
|
|||
self.close_connection()
|
||||
|
||||
def close_connection (self):
|
||||
"""close an opened url connection"""
|
||||
"""
|
||||
Close an opened url connection.
|
||||
"""
|
||||
# brute force closing
|
||||
if self.url_connection is not None:
|
||||
try:
|
||||
|
|
@ -387,13 +424,16 @@ class UrlBase (object):
|
|||
self.url_connection = None
|
||||
|
||||
def check_connection (self):
|
||||
"""The basic connection check uses urllib2.urlopen to initialize
|
||||
a connection object.
|
||||
"""
|
||||
The basic connection check uses urllib2.urlopen to initialize
|
||||
a connection object.
|
||||
"""
|
||||
self.url_connection = urllib2.urlopen(self.url)
|
||||
|
||||
def allows_recursion (self):
|
||||
"""return True iff we can recurse into the url's content"""
|
||||
"""
|
||||
Return True iff we can recurse into the url's content.
|
||||
"""
|
||||
#linkcheck.log.debug(linkcheck.LOG_CHECK, "valid=%s, parseable=%s, "\
|
||||
# "content=%s, extern=%s, robots=%s",
|
||||
# self.valid, self.is_parseable(),
|
||||
|
|
@ -409,8 +449,9 @@ class UrlBase (object):
|
|||
not self.extern[0] and self.content_allows_robots()
|
||||
|
||||
def content_allows_robots (self):
|
||||
"""return True if the content of this URL forbids robots to
|
||||
search for recursive links.
|
||||
"""
|
||||
Return False if the content of this URL forbids robots to
|
||||
search for recursive links.
|
||||
"""
|
||||
if not self.is_html():
|
||||
return True
|
||||
|
|
@ -491,11 +532,15 @@ class UrlBase (object):
|
|||
return (1, 0)
|
||||
|
||||
def can_get_content (self):
|
||||
"""indicate wether url get_content() can be called"""
|
||||
"""
|
||||
Indicate whether url get_content() can be called.
|
||||
"""
|
||||
return True
|
||||
|
||||
def get_content (self):
|
||||
"""Precondition: url_connection is an opened URL."""
|
||||
"""
|
||||
Precondition: url_connection is an opened URL.
|
||||
"""
|
||||
if not self.has_content:
|
||||
t = time.time()
|
||||
self.data = self.url_connection.read()
|
||||
|
|
@ -505,8 +550,9 @@ class UrlBase (object):
|
|||
return self.data
|
||||
|
||||
def check_content (self, warningregex):
|
||||
"""If a warning expression was given, call this function to check it
|
||||
against the content of this url.
|
||||
"""
|
||||
If a warning expression was given, call this function to check it
|
||||
against the content of this url.
|
||||
"""
|
||||
if not self.can_get_content():
|
||||
return
|
||||
|
|
@ -515,8 +561,10 @@ class UrlBase (object):
|
|||
self.add_warning(_("Found %r in link contents.") % match.group())
|
||||
|
||||
def check_size (self):
|
||||
"""if a maximum size was given, call this function to check it
|
||||
against the content size of this url"""
|
||||
"""
|
||||
If a maximum size was given, call this function to check it
|
||||
against the content size of this url.
|
||||
"""
|
||||
maxbytes = self.consumer.config["warnsizebytes"]
|
||||
if maxbytes is not None and self.dlsize >= maxbytes:
|
||||
self.add_warning(_("Content size %s is larger than %s.") % \
|
||||
|
|
@ -524,16 +572,18 @@ class UrlBase (object):
|
|||
linkcheck.strformat.strsize(maxbytes)))
|
||||
|
||||
def parse_url (self):
|
||||
"""Parse url content and search for recursive links.
|
||||
Default parse type is html.
|
||||
"""
|
||||
Parse url content and search for recursive links.
|
||||
Default parse type is html.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"Parsing recursively into %s", self)
|
||||
self.parse_html()
|
||||
|
||||
def get_user_password (self):
|
||||
"""Get tuple (user, password) from configured authentication.
|
||||
Both user and password can be None if not specified.
|
||||
"""
|
||||
Get tuple (user, password) from configured authentication.
|
||||
Both user and password can be None if not specified.
|
||||
"""
|
||||
for auth in self.consumer.config["authentication"]:
|
||||
if auth['pattern'].match(self.url):
|
||||
|
|
@ -541,8 +591,9 @@ class UrlBase (object):
|
|||
return None, None
|
||||
|
||||
def parse_html (self):
|
||||
"""Parse into HTML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
Parse into HTML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
h = linkcheck.linkparse.LinkFinder(self.get_content())
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
|
|
@ -565,7 +616,9 @@ class UrlBase (object):
|
|||
self.consumer.append_url(url_data)
|
||||
|
||||
def parse_opera (self):
|
||||
"""parse an opera bookmark file"""
|
||||
"""
|
||||
Parse an opera bookmark file.
|
||||
"""
|
||||
name = ""
|
||||
lineno = 0
|
||||
lines = self.get_content().splitlines()
|
||||
|
|
@ -584,8 +637,9 @@ class UrlBase (object):
|
|||
name = ""
|
||||
|
||||
def parse_text (self):
|
||||
"""parse a text file with on url per line; comment and blank
|
||||
lines are ignored
|
||||
"""
|
||||
Parse a text file with one url per line; comment and blank
|
||||
lines are ignored.
|
||||
"""
|
||||
lineno = 0
|
||||
for line in self.get_content().splitlines():
|
||||
|
|
@ -599,7 +653,9 @@ class UrlBase (object):
|
|||
self.consumer.append_url(url_data)
|
||||
|
||||
def parse_css (self):
|
||||
"""parse a CSS file for url() patterns"""
|
||||
"""
|
||||
Parse a CSS file for url() patterns.
|
||||
"""
|
||||
lineno = 0
|
||||
for line in self.get_content().splitlines():
|
||||
lineno += 1
|
||||
|
|
@ -612,7 +668,9 @@ class UrlBase (object):
|
|||
self.consumer.append_url(url_data)
|
||||
|
||||
def serialized (self):
|
||||
"""return serialized url check data as unicode string"""
|
||||
"""
|
||||
Return serialized url check data as unicode string.
|
||||
"""
|
||||
sep = unicode(os.linesep)
|
||||
assert isinstance(self.base_url, unicode), self
|
||||
if self.parent_url is not None:
|
||||
|
|
|
|||
Loading…
Reference in a new issue