documentation

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2160 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-01-19 01:04:38 +00:00
parent 46de8b6ca0
commit b008747f39
14 changed files with 326 additions and 139 deletions

View file

@ -32,7 +32,9 @@ import linkcheck.threader
def _check_morsel (m, host, path):
"""check given cookie morsel against the desired host and path"""
"""
Check given cookie morsel against the desired host and path.
"""
# check domain (if its stored)
if m["domain"] and not host.endswith(m["domain"]):
return None
@ -48,15 +50,18 @@ def _check_morsel (m, host, path):
class Cache (object):
"""Store and provide routines for cached data. Currently there are
caches for cookies, checked urls, FTP connections and robots.txt
contents.
"""
Store and provide routines for cached data. Currently there are
caches for cookies, checked urls, FTP connections and robots.txt
contents.
All public operations (except __init__()) are thread-safe.
All public operations (except __init__()) are thread-safe.
"""
def __init__ (self):
"""Initialize the default options"""
"""
Initialize the default options.
"""
# one big lock for all caches and queues
self.lock = threading.Lock()
# already checked urls
@ -81,9 +86,11 @@ class Cache (object):
self.lock.release()
def incoming_get_url (self):
"""Get first not-in-progress url from the incoming queue and
return it. If no such url is available return None. The
url might be already cached."""
"""
Get first not-in-progress url from the incoming queue and
return it. If no such url is available return None. The
url might be already cached.
"""
self.lock.acquire()
try:
for i, url_data in enumerate(self.incoming):
@ -102,7 +109,9 @@ class Cache (object):
self.lock.release()
def incoming_len (self):
"""return number of entries in incoming queue"""
"""
Return number of entries in incoming queue.
"""
self.lock.acquire()
try:
return len(self.incoming)
@ -110,7 +119,9 @@ class Cache (object):
self.lock.release()
def incoming_add (self, url_data):
"""add a new URL to list of URLs to check"""
"""
Add a new URL to list of URLs to check.
"""
self.lock.acquire()
try:
linkcheck.log.debug(linkcheck.LOG_CACHE,
@ -171,10 +182,12 @@ class Cache (object):
self.lock.release()
def checked_redirect (self, redirect, url_data):
"""Check if redirect is already in cache. Used for URL redirections
to avoid double checking of already cached URLs.
If the redirect URL is found in the cache, the result data is
already copied."""
"""
Check if redirect is already in cache. Used for URL redirections
to avoid double checking of already cached URLs.
If the redirect URL is found in the cache, the result data is
already copied.
"""
self.lock.acquire()
try:
if redirect in self.checked:
@ -185,7 +198,9 @@ class Cache (object):
self.lock.release()
def robots_txt_allows_url (self, roboturl, url, user, password):
"""ask robots.txt allowance"""
"""
Ask robots.txt allowance.
"""
self.lock.acquire()
try:
if roboturl not in self.robots_txt:
@ -201,8 +216,9 @@ class Cache (object):
self.lock.release()
def get_ftp_connection (self, host, username, password):
"""Get open FTP connection to given host. Return None if no such
connection is available.
"""
Get open FTP connection to given host. Return None if no such
connection is available.
"""
self.lock.acquire()
try:
@ -218,7 +234,9 @@ class Cache (object):
self.lock.release()
def add_ftp_connection (self, host, username, password, conn):
"""Store open FTP connection into cache for reuse."""
"""
Store open FTP connection into cache for reuse.
"""
self.lock.acquire()
try:
key = (host, username, password)
@ -230,7 +248,9 @@ class Cache (object):
self.lock.release()
def release_ftp_connection (self, host, username, password):
"""Store open FTP connection into cache for reuse."""
"""
Store open FTP connection into cache for reuse.
"""
self.lock.acquire()
try:
key = (host, username, password)
@ -239,8 +259,9 @@ class Cache (object):
self.lock.release()
def store_cookies (self, headers, host):
"""Thread-safe cookie cache setter function. Can raise the
exception Cookie.CookieError.
"""
Thread-safe cookie cache setter function. Can raise the
exception Cookie.CookieError.
"""
self.lock.acquire()
try:
@ -255,7 +276,9 @@ class Cache (object):
self.lock.release()
def get_cookies (self, host, path):
"""Thread-safe cookie cache getter function."""
"""
Thread-safe cookie cache getter function.
"""
self.lock.acquire()
try:
linkcheck.log.debug(linkcheck.LOG_CACHE,

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""url consumer class"""
"""
Url consumer class.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -28,10 +30,14 @@ import linkcheck.log
from urlbase import stderr
class Consumer (object):
"""consume urls from the url queue in a threaded manner"""
"""
Consume urls from the url queue in a threaded manner.
"""
def __init__ (self, config, cache):
"""initialize consumer data and threads"""
"""
Initialize consumer data and threads.
"""
self.config = config
self.cache = cache
self.threader = linkcheck.threader.Threader()
@ -47,7 +53,9 @@ class Consumer (object):
self.warnings = False
def _set_threads (self, num):
"""set number of checker threads to start"""
"""
Set number of checker threads to start.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK,
"set threading with %d threads", num)
self.threader.threads_max = num
@ -57,7 +65,9 @@ class Consumer (object):
sys.setcheckinterval(100)
def append_url (self, url_data):
"""append url to incoming check list"""
"""
Append url to incoming check list.
"""
if not self.cache.incoming_add(url_data):
# can be logged
self.logger_new_url(url_data)
@ -78,7 +88,9 @@ class Consumer (object):
self.threader.start_thread(url_data.check, ())
def checked (self, url_data):
"""put checked url in cache and log it"""
"""
Put checked url in cache and log it.
"""
# log before putting it in the cache (otherwise we would see
# a "(cached)" after every url
self.logger_new_url(url_data)
@ -88,11 +100,15 @@ class Consumer (object):
self.cache.in_progress_remove(url_data)
def interrupted (self, url_data):
"""remove url from active list"""
"""
Remove url from active list.
"""
self.cache.in_progress_remove(url_data)
def finished (self):
"""return True if checking is finished"""
"""
Return True if checking is finished.
"""
self.lock.acquire()
try:
return self.threader.finished() and \
@ -101,7 +117,9 @@ class Consumer (object):
self.lock.release()
def no_more_threads (self):
"""return True if no more active threads are running"""
"""
Return True if no more active threads are running.
"""
self.lock.acquire()
try:
return self.threader.finished()
@ -109,7 +127,9 @@ class Consumer (object):
self.lock.release()
def abort (self):
"""abort checking and send of-of-output message to logger"""
"""
Abort checking and send end-of-output message to logger.
"""
while not self.no_more_threads():
linkcheck.log.warn(linkcheck.LOG_CHECK,
_("keyboard interrupt; waiting for %d active threads to finish"),
@ -123,7 +143,9 @@ class Consumer (object):
self.logger_end_output()
def print_status (self, curtime, start_time):
"""print check status looking at url queues"""
"""
Print check status looking at url queues.
"""
self.lock.acquire()
try:
active = self.threader.active_threads()
@ -137,7 +159,9 @@ class Consumer (object):
self.lock.release()
def logger_start_output (self):
"""start output of all configured loggers"""
"""
Start output of all configured loggers.
"""
self.lock.acquire()
try:
self.logger.start_output()
@ -147,7 +171,9 @@ class Consumer (object):
self.lock.release()
def logger_new_url (self, url_data):
"""send new url to all configured loggers"""
"""
Send new url to all configured loggers.
"""
self.lock.acquire()
try:
self.linknumber += 1
@ -168,7 +194,9 @@ class Consumer (object):
# self.filter_queue(self)
def logger_end_output (self):
"""end output of all configured loggers"""
"""
End output of all configured loggers.
"""
self.lock.acquire()
try:
self.logger.end_output(linknumber=self.linknumber)
@ -178,7 +206,9 @@ class Consumer (object):
self.lock.release()
def active_threads (self):
"""return number of active threads"""
"""
Return number of active threads.
"""
self.lock.acquire()
try:
return self.threader.active_threads()

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle for unknown links"""
"""
Handle for unknown links.
"""
# Copyright (C) 2001-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -21,7 +23,9 @@ import urlbase
import linkcheck
class ErrorUrl (urlbase.UrlBase):
"""Unknown URL links"""
"""
Unknown URL links.
"""
def check_syntax (self):
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
@ -31,6 +35,8 @@ class ErrorUrl (urlbase.UrlBase):
return False
def set_cache_keys (self):
"""cache key is forbidden"""
"""
Cache key is forbidden.
"""
raise NotImplementedError, "cache keys are forbidden"

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle local file: links"""
"""
Handle local file: links.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -37,8 +39,9 @@ contents = {
def get_files (dirname):
"""Get lists of files in directory. Does only allow regular files
and directories, no symlinks.
"""
Get lists of files in directory. Does only allow regular files
and directories, no symlinks.
"""
files = []
for entry in os.listdir(dirname):
@ -51,7 +54,9 @@ def get_files (dirname):
def get_nt_filename (path):
"""return case sensitive filename for NT path"""
"""
Return case sensitive filename for NT path.
"""
head, tail = os.path.split(path)
if not tail:
return path
@ -64,7 +69,9 @@ def get_nt_filename (path):
class FileUrl (urlbase.UrlBase):
"Url link with file scheme"
"""
Url link with file scheme.
"""
def __init__ (self, base_url, recursion_level, consumer,
parent_url = None,

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle FTP links"""
"""
Handle FTP links.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -28,7 +30,9 @@ import httpurl
class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
"""Url link with ftp scheme."""
"""
Url link with ftp scheme.
"""
def __init__ (self, base_url, recursion_level, consumer,
parent_url = None,
@ -75,7 +79,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
return super(FtpUrl, self).get_user_password()
def login (self):
"""log into ftp server and check the welcome message"""
"""
Log into ftp server and check the welcome message.
"""
_user, _password = self.get_user_password()
# ready to connect
conn = self.consumer.cache.get_ftp_connection(
@ -111,8 +117,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
self.urlparts[1], _user, _password, self.url_connection)
def cwd (self):
"""Change to URL parent directory. Return filename of last path
component.
"""
Change to URL parent directory. Return filename of last path
component.
"""
dirname = self.urlparts[2].strip('/')
dirs = dirname.split('/')
@ -123,7 +130,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
return filename
def listfile (self):
"""see if filename is in the current FTP directory"""
"""
See if filename is in the current FTP directory.
"""
if not self.filename:
return
files = self.get_files()
@ -141,8 +150,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
raise ftplib.error_perm, "550 File not found"
def get_files (self):
"""Get list of filenames in directory. Subdirectories have an
ending slash.
"""
Get list of filenames in directory. Subdirectories have an
ending slash.
"""
# Rudimentary LIST output parsing. An entry is assumed to have
# the following form:

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle Gopher links"""
"""
Handle Gopher links.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -19,5 +21,7 @@
import urlbase
class GopherUrl (urlbase.UrlBase):
"Url link with gopher scheme"
"""
Url link with gopher scheme.
"""
pass

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle https links"""
"""
Handle https links.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -19,10 +21,14 @@
import httpurl
class HttpsUrl (httpurl.HttpUrl):
"""Url link with https scheme"""
"""
Url link with https scheme.
"""
def local_check (self):
"""check connection if SSL is supported, else ignore"""
"""
Check connection if SSL is supported, else ignore.
"""
if httpurl.supportHttps:
super(HttpsUrl, self).local_check()
else:

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle http links"""
"""
Handle http links.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -43,11 +45,15 @@ _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
"Url link with http scheme"
"""
Url link with http scheme.
"""
def __init__ (self, base_url, recursion_level, consumer,
parent_url=None, base_ref=None, line=0, column=0, name=u""):
"""initialize basic url data and HTTP specific variables"""
"""
Initialize basic url data and HTTP specific variables.
"""
super(HttpUrl, self).__init__(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref, line=line,
column=column, name=name)
@ -348,8 +354,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
self.add_info(_("Last modified %s.") % modified)
def _get_http_response (self):
"""Put request and return (status code, status text, mime object).
Host can be host:port format.
"""
Put request and return (status code, status text, mime object).
Host can be host:port format.
"""
if self.proxy:
host = self.proxy

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle for uncheckable application-specific links"""
"""
Handle for uncheckable application-specific links.
"""
# Copyright (C) 2001-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -19,7 +21,9 @@
import urlbase
class IgnoredUrl (urlbase.UrlBase):
"""Some schemes are defined in http://www.w3.org/Addressing/schemes"""
"""
Some schemes are defined in <http://www.w3.org/Addressing/schemes>.
"""
def local_check (self):
self.add_warning(_("%s URL ignored.") % self.scheme.capitalize())

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle for mailto: links"""
"""
Handle for mailto: links.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -29,7 +31,9 @@ import linkcheck.dns.resolver
class MailtoUrl (urlbase.UrlBase):
"""Url link with mailto scheme"""
"""
Url link with mailto scheme.
"""
def build_url (self):
super(MailtoUrl, self).build_url()
@ -83,7 +87,8 @@ class MailtoUrl (urlbase.UrlBase):
return addrs
def check_connection (self):
"""Verify a list of email addresses. If one address fails,
"""
Verify a list of email addresses. If one address fails,
the whole list will fail.
For each mail address we check the following things:
(1) Look up the MX DNS records. If we found no MX record,
@ -103,7 +108,9 @@ class MailtoUrl (urlbase.UrlBase):
def check_smtp_domain (self, name, mail):
"""Check a single mail address"""
"""
Check a single mail address.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK,
"checking mail address %r", mail)
linkcheck.log.debug(linkcheck.LOG_CHECK, "splitting address")
@ -131,7 +138,9 @@ class MailtoUrl (urlbase.UrlBase):
self.check_smtp_connect(mxdata, username)
def check_smtp_connect (self, mxdata, username):
"""mxdata is a list of (preference, host) tuples to check for"""
"""
mxdata is a list of (preference, host) tuples to check for
"""
smtpconnect = 0
for preference, host in mxdata:
try:
@ -180,7 +189,9 @@ class MailtoUrl (urlbase.UrlBase):
_("Could not split the mail address"))
def close_connection (self):
"""close a possibly opened SMTP connection"""
"""
Close a possibly opened SMTP connection.
"""
if self.url_connection is None:
# no connection is open
return
@ -191,7 +202,9 @@ class MailtoUrl (urlbase.UrlBase):
self.url_connection = None
def set_cache_keys (self):
"""The cache key is a comma separated list of emails."""
"""
The cache key is a comma separated list of emails.
"""
emails = [addr[1] for addr in self.addresses]
emails.sort()
self.cache_url_key = u"%s:%s" % (self.scheme, u",".join(emails))
@ -199,7 +212,9 @@ class MailtoUrl (urlbase.UrlBase):
# cache_content_key remains None, recursion is not allowed
def can_get_content (self):
"""mailto: URLs do not have any content
@return False
"""
mailto: URLs do not have any content
@return: C{False}
"""
return False

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle nntp: and news: links"""
"""
Handle nntp: and news: links.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -31,11 +33,14 @@ import linkcheck.log
random.seed()
class NoNetrcNNTP (nntplib.NNTP):
"""NNTP class ignoring possible entries in ~/.netrc"""
"""
NNTP class ignoring possible entries in ~/.netrc.
"""
def __init__ (self, host, port=nntplib.NNTP_PORT, user=None,
password=None, readermode=None):
"""Initialize an instance. Arguments:
"""
Initialize an instance. Arguments:
- host: hostname to connect to
- port: port to connect to (default the standard NNTP port)
- user: username to authenticate with
@ -95,7 +100,9 @@ class NoNetrcNNTP (nntplib.NNTP):
class NntpUrl (urlbase.UrlBase):
"""Url link with NNTP scheme"""
"""
Url link with NNTP scheme.
"""
def check_connection (self):
nntpserver = self.host or self.consumer.config["nntpserver"]
@ -124,9 +131,11 @@ class NntpUrl (urlbase.UrlBase):
self.add_warning(_("No newsgroup specified in NNTP URL."))
def _connectNntp (self, nntpserver):
"""This is done only once per checking task. Also, the newly
"""
This is done only once per checking task. Also, the newly
introduced error codes 504 and 505 (both indicating "Too busy, retry
later") are caught."""
later") are caught.
"""
tries = 0
nntp = value = None
while tries < 5:

View file

@ -18,10 +18,14 @@
import urllib
class ProxySupport (object):
"""get support for proxying and for urls with user:pass@host setting"""
"""
Get support for proxying and for urls with user:pass@host setting.
"""
def set_proxy (self, proxy):
"""parse given proxy information and store parsed values"""
"""
Parse given proxy information and store parsed values.
"""
self.proxy = proxy
self.proxyauth = None
if self.proxy:

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Handle telnet: links"""
"""
Handle telnet: links.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -25,7 +27,9 @@ import urlbase
class TelnetUrl (urlbase.UrlBase):
"""Url link with telnet scheme"""
"""
Url link with telnet scheme.
"""
def build_url (self):
super(TelnetUrl, self).build_url()

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""Base URL handler"""
"""
Base URL handler.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -40,7 +42,9 @@ import linkcheck.HtmlParser.htmlsax
stderr = codecs.getwriter("iso8859-1")(sys.stderr, errors="ignore")
def internal_error ():
"""print internal error message to stderr"""
"""
Print internal error message to stderr.
"""
print >> stderr, os.linesep
print >> stderr, _("""********** Oops, I did it again. *************
@ -66,7 +70,9 @@ I can work with ;) .
def print_app_info ():
"""print system and application info to stderr"""
"""
Print system and application info to stderr.
"""
print >> stderr, _("System info:")
print >> stderr, linkcheck.configuration.App
print >> stderr, _("Python %s on %s") % (sys.version, sys.platform)
@ -83,21 +89,24 @@ def urljoin (parent, url, scheme):
class UrlBase (object):
"""An URL with additional information like validity etc."""
"""
An URL with additional information like validity etc.
"""
def __init__ (self, base_url, recursion_level, consumer,
parent_url = None, base_ref = None,
line = -1, column = -1, name = u""):
"""Initialize check data, and store given variables.
"""
Initialize check data, and store given variables.
@base_url - unquoted and possibly unnormed url
@recursion_level - on what check level lies the base url
@config - Configuration instance
@parent_url - quoted and normed url of parent or None
@base_ref - quoted and normed url of <base href=""> or None
@line - line number of url in parent content
@column - column number of url in parent content
@name - name of url or empty
@base_url - unquoted and possibly unnormed url
@recursion_level - on what check level lies the base url
@config - Configuration instance
@parent_url - quoted and normed url of parent or None
@base_ref - quoted and normed url of <base href=""> or None
@line - line number of url in parent content
@column - column number of url in parent content
@name - name of url or empty
"""
self.base_ref = base_ref
# note that self.base_url must not be modified
@ -157,36 +166,52 @@ class UrlBase (object):
self.aliases = []
def set_result (self, msg, valid=True):
"""set result string and validity"""
"""
Set result string and validity.
"""
self.result = msg
self.valid = valid
def is_parseable (self):
"""return True iff content of this url is parseable"""
"""
Return True iff content of this url is parseable.
"""
return False
def is_html (self):
"""return True iff content of this url is HTML formatted"""
"""
Return True iff content of this url is HTML formatted.
"""
return False
def is_http (self):
"""return True for http:// URLs"""
"""
Return True for http:// URLs.
"""
return False
def is_file (self):
"""return True for file:// URLs"""
"""
Return True for file:// URLs.
"""
return False
def add_warning (self, s):
"""add a warning string"""
"""
Add a warning string.
"""
self.warning.append(s)
def add_info (self, s):
"""add an info string"""
"""
Add an info string.
"""
self.info.append(s)
def copy_from_cache (self, cache_data):
"""fill attributes from cache data"""
"""
Fill attributes from cache data.
"""
self.result = cache_data["result"]
self.warning.extend(cache_data["warning"])
self.info.extend(cache_data["info"])
@ -196,7 +221,9 @@ class UrlBase (object):
self.cached = True
def get_cache_data (self):
"""return all data values that should be put in the cache"""
"""
Return all data values that should be put in the cache.
"""
return {"result": self.result,
"warning": self.warning,
"info": self.info,
@ -206,7 +233,9 @@ class UrlBase (object):
}
def set_cache_keys (self):
"""Set keys for URL checking and content recursion."""
"""
Set keys for URL checking and content recursion.
"""
# remove anchor from content cache key since we assume
# URLs with different anchors to have the same content
self.cache_content_key = urlparse.urlunsplit(self.urlparts[:4]+[u''])
@ -230,10 +259,11 @@ class UrlBase (object):
self.cache_url_key)
def check_syntax (self):
"""Called before self.check(), this function inspects the
url syntax. Success enables further checking, failure
immediately logs this url. Syntax checks must not
use any network resources.
"""
Called before self.check(), this function inspects the
url syntax. Success enables further checking, failure
immediately logs this url. Syntax checks must not
use any network resources.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
if not self.base_url:
@ -249,8 +279,9 @@ class UrlBase (object):
return True
def build_url (self):
"""Construct self.url and self.urlparts out of the given base
url information self.base_url, self.parent_url and self.base_ref.
"""
Construct self.url and self.urlparts out of the given base
url information self.base_url, self.parent_url and self.base_ref.
"""
# norm base url
base_url, is_idn = linkcheck.url.url_norm(self.base_url)
@ -292,7 +323,9 @@ class UrlBase (object):
self.port = int(self.port)
def check (self):
"""main check function for checking this URL"""
"""
Main check function for checking this URL.
"""
try:
self.local_check()
self.consumer.checked(self)
@ -313,7 +346,9 @@ class UrlBase (object):
internal_error()
def local_check (self):
"""local check function can be overridden in subclasses"""
"""
Local check function can be overridden in subclasses.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
if self.recursion_level and self.consumer.config['wait']:
linkcheck.log.debug(linkcheck.LOG_CHECK,
@ -375,7 +410,9 @@ class UrlBase (object):
self.close_connection()
def close_connection (self):
"""close an opened url connection"""
"""
Close an opened url connection.
"""
# brute force closing
if self.url_connection is not None:
try:
@ -387,13 +424,16 @@ class UrlBase (object):
self.url_connection = None
def check_connection (self):
"""The basic connection check uses urllib2.urlopen to initialize
a connection object.
"""
The basic connection check uses urllib2.urlopen to initialize
a connection object.
"""
self.url_connection = urllib2.urlopen(self.url)
def allows_recursion (self):
"""return True iff we can recurse into the url's content"""
"""
Return True iff we can recurse into the url's content.
"""
#linkcheck.log.debug(linkcheck.LOG_CHECK, "valid=%s, parseable=%s, "\
# "content=%s, extern=%s, robots=%s",
# self.valid, self.is_parseable(),
@ -409,8 +449,9 @@ class UrlBase (object):
not self.extern[0] and self.content_allows_robots()
def content_allows_robots (self):
"""return True if the content of this URL forbids robots to
search for recursive links.
"""
Return True if the content of this URL forbids robots to
search for recursive links.
"""
if not self.is_html():
return True
@ -491,11 +532,15 @@ class UrlBase (object):
return (1, 0)
def can_get_content (self):
"""indicate wether url get_content() can be called"""
"""
Indicate whether url get_content() can be called.
"""
return True
def get_content (self):
"""Precondition: url_connection is an opened URL."""
"""
Precondition: url_connection is an opened URL.
"""
if not self.has_content:
t = time.time()
self.data = self.url_connection.read()
@ -505,8 +550,9 @@ class UrlBase (object):
return self.data
def check_content (self, warningregex):
"""If a warning expression was given, call this function to check it
against the content of this url.
"""
If a warning expression was given, call this function to check it
against the content of this url.
"""
if not self.can_get_content():
return
@ -515,8 +561,10 @@ class UrlBase (object):
self.add_warning(_("Found %r in link contents.") % match.group())
def check_size (self):
"""if a maximum size was given, call this function to check it
against the content size of this url"""
"""
If a maximum size was given, call this function to check it
against the content size of this url.
"""
maxbytes = self.consumer.config["warnsizebytes"]
if maxbytes is not None and self.dlsize >= maxbytes:
self.add_warning(_("Content size %s is larger than %s.") % \
@ -524,16 +572,18 @@ class UrlBase (object):
linkcheck.strformat.strsize(maxbytes)))
def parse_url (self):
"""Parse url content and search for recursive links.
Default parse type is html.
"""
Parse url content and search for recursive links.
Default parse type is html.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK,
"Parsing recursively into %s", self)
self.parse_html()
def get_user_password (self):
"""Get tuple (user, password) from configured authentication.
Both user and password can be None if not specified.
"""
Get tuple (user, password) from configured authentication.
Both user and password can be None if not specified.
"""
for auth in self.consumer.config["authentication"]:
if auth['pattern'].match(self.url):
@ -541,8 +591,9 @@ class UrlBase (object):
return None, None
def parse_html (self):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
h = linkcheck.linkparse.LinkFinder(self.get_content())
p = linkcheck.HtmlParser.htmlsax.parser(h)
@ -565,7 +616,9 @@ class UrlBase (object):
self.consumer.append_url(url_data)
def parse_opera (self):
"""parse an opera bookmark file"""
"""
Parse an opera bookmark file.
"""
name = ""
lineno = 0
lines = self.get_content().splitlines()
@ -584,8 +637,9 @@ class UrlBase (object):
name = ""
def parse_text (self):
"""parse a text file with on url per line; comment and blank
lines are ignored
"""
Parse a text file with one url per line; comment and blank
lines are ignored.
"""
lineno = 0
for line in self.get_content().splitlines():
@ -599,7 +653,9 @@ class UrlBase (object):
self.consumer.append_url(url_data)
def parse_css (self):
"""parse a CSS file for url() patterns"""
"""
Parse a CSS file for url() patterns.
"""
lineno = 0
for line in self.get_content().splitlines():
lineno += 1
@ -612,7 +668,9 @@ class UrlBase (object):
self.consumer.append_url(url_data)
def serialized (self):
"""return serialized url check data as unicode string"""
"""
Return serialized url check data as unicode string.
"""
sep = unicode(os.linesep)
assert isinstance(self.base_url, unicode), self
if self.parent_url is not None: