documentation updates

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2148 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-01-18 01:00:45 +00:00
parent f779f065f9
commit 700d564be7
17 changed files with 726 additions and 261 deletions

View file

@ -39,12 +39,16 @@ lognamelist = ", ".join(["%r"%name for name in lognames.keys()])
class LinkCheckerError (Exception):
"""exception to be raised on linkchecker-specific check errors"""
"""
Exception to be raised on linkchecker-specific check errors.
"""
pass
def get_link_pat (arg, strict=False):
"""get a link pattern matcher for intern/extern links"""
"""
Get a link pattern matcher for intern/extern links.
"""
linkcheck.log.debug(LOG_CHECK, "Link pattern %r", arg)
if arg[0:1] == '!':
pattern = arg[1:]
@ -86,8 +90,12 @@ LoggerKeys = ", ".join(["%r"%name for name in Loggers.keys()])
def init_i18n ():
"""Initialize i18n with the configured locale dir. The environment
variable LOCPATH can also specify a locale dir."""
"""
Initialize i18n with the configured locale dir. The environment
variable LOCPATH can also specify a locale dir.
@return: C{None}
"""
locdir = os.environ.get('LOCPATH')
if locdir is None:
locdir = os.path.join(configdata.install_data, 'share', 'locale')

View file

@ -134,7 +134,9 @@ AnsiReset = esc_ansicolor("default")
def has_colors (fp):
"""see if given file is an ANSI color enabled tty"""
"""
Test if given file is an ANSI color enabled tty.
"""
# note: the isatty() function ensures that we do not colorize
# redirected streams, as this is almost never what we want
if hasattr(fp, "isatty") and fp.isatty():
@ -155,8 +157,9 @@ def has_colors (fp):
def has_colors_nt ():
"""windows has no curses; check if running in an environment
which supports ANSI colors
"""
Check if running in a Windows environment which supports ANSI colors.
Do this by searching for a loaded ANSI driver.
"""
_in = None
_out = None
@ -176,7 +179,10 @@ def has_colors_nt ():
def colorize (text, color=None):
"""return text colorized if color is given"""
"""
Colorize text with given color. If color is C{None}, leave the
text as-is.
"""
if color is not None:
return '%s%s%s' % (esc_ansicolor(color), text, AnsiReset)
else:
@ -184,17 +190,22 @@ def colorize (text, color=None):
class Colorizer (object):
"""prints colored messages to streams"""
"""
Prints colored messages to streams.
"""
def __init__ (self, fp):
"""initialize with given stream (file-like object)"""
"""
Initialize with given stream (file-like object).
"""
super(Colorizer, self).__init__()
self._fp = fp
def write (self, s, color=None):
"""Writes message s in color if output stream is
a console stream (stderr or stdout).
Else writes without color (i.e. black/white).
"""
Writes message s in color if output stream is
a console stream (stderr or stdout).
Else writes without color (i.e. black/white).
"""
if has_colors(self._fp):
# stdout or stderr can be colorized
@ -202,7 +213,9 @@ class Colorizer (object):
self._fp.write(s)
def __getattr__ (self, name):
"""delegate attribute access to the stored stream object"""
"""
Delegate attribute access to the stored stream object.
"""
return getattr(self._fp, name)
@ -210,8 +223,9 @@ class ColoredStreamHandler (logging.StreamHandler, object):
"""Send colored log messages to streams (file-like objects)."""
def __init__ (self, strm=None):
"""Log to given stream (a file-like object) or to stderr if
strm is None.
"""
Log to given stream (a file-like object) or to stderr if
strm is None.
"""
super(ColoredStreamHandler, self).__init__(strm=strm)
self.stream = Colorizer(self.stream)
@ -224,7 +238,9 @@ class ColoredStreamHandler (logging.StreamHandler, object):
}
def get_color (self, record):
"""get appropriate color according to log level"""
"""
Get appropriate color according to log level.
"""
return self.colors.get(record.levelno, 'default')
def emit (self, record):

View file

@ -97,8 +97,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
else:
self.url_connection.login(_user, _password)
except EOFError, msg:
msg = str(msg)
raise linkcheck.LinkCheckerError(
_("Remote host has closed connection")+": "+msg)
_("Remote host has closed connection: %s") % msg)
if not self.url_connection.getwelcome():
self.close_connection()
raise linkcheck.LinkCheckerError(

View file

@ -48,18 +48,23 @@ distribution."""
def norm (path):
"""norm given system path with all available norm funcs in os.path"""
"""
Norm given system path with all available norm funcs in os.path.
"""
return os.path.normcase(os.path.normpath(os.path.expanduser(path)))
# dynamic options
class Configuration (dict):
"""Storage for configuration options. Options can both be given from
the command line as well as from configuration files.
"""
Storage for configuration options. Options can both be given from
the command line as well as from configuration files.
"""
def __init__ (self):
"""Initialize the default options"""
"""
Initialize the default options.
"""
super(Configuration, self).__init__()
self["verbose"] = False
self["warnings"] = False
@ -130,12 +135,13 @@ class Configuration (dict):
self["threads"] = 10
def init_logging (self, debug=None):
"""Load logging.conf file settings to set up the
application logging (not to be confused with check loggers).
When debug is not None it is expected to be a list of
logger names for which debugging will be enabled.
"""
Load logging.conf file settings to set up the
application logging (not to be confused with check loggers).
When debug is not None it is expected to be a list of
logger names for which debugging will be enabled.
Activating debugging disables threading.
Activating debugging disables threading.
"""
config_dir = _linkchecker_configdata.config_dir
filename = norm(os.path.join(config_dir, "logging.conf"))
@ -154,21 +160,27 @@ class Configuration (dict):
logging.getLogger(name).setLevel(logging.DEBUG)
def logger_new (self, loggertype, **kwargs):
"""instantiate new logger and return it"""
"""
Instantiate new logger and return it.
"""
args = {}
args.update(self[loggertype])
args.update(kwargs)
return linkcheck.Loggers[loggertype](**args)
def logger_add (self, loggertype, loggerclass, loggerargs=None):
"""add a new logger type to the known loggers"""
"""
Add a new logger type to the known loggers.
"""
if loggerargs is None:
loggerargs = {}
linkcheck.Loggers[loggertype] = loggerclass
self[loggertype] = loggerargs
def read (self, files=None):
"""read settings from given config files"""
"""
Read settings from given config files.
"""
if files is None:
cfiles = []
else:
@ -184,7 +196,9 @@ class Configuration (dict):
self['logger'] = self.logger_new('text')
def read_config (self, files):
"""read all the configuration parameters from the given files"""
"""
Read all the configuration parameters from the given files.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK,
"reading configuration from %s", files)
try:
@ -199,7 +213,9 @@ class Configuration (dict):
self.read_filtering_config(cfgparser)
def read_output_config (self, cfgparser):
"""read configuration options in section "output"."""
"""
Read configuration options in section "output".
"""
section = "output"
for key in linkcheck.Loggers.keys():
if cfgparser.has_section(key):
@ -257,7 +273,9 @@ class Configuration (dict):
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
def read_checking_config (self, cfgparser):
"""read configuration options in section "checking"."""
"""
Read configuration options in section "checking".
"""
section = "checking"
try:
num = cfgparser.getint(section, "threads")
@ -303,7 +321,9 @@ class Configuration (dict):
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
def read_authentication_config (self, cfgparser):
"""read configuration options in section "authentication"."""
"""
Read configuration options in section "authentication".
"""
section = "authentication"
try:
i = 1
@ -320,7 +340,9 @@ class Configuration (dict):
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
def read_filtering_config (self, cfgparser):
"""read configuration options in section "filtering"."""
"""
Read configuration options in section "filtering".
"""
section = "filtering"
try:
i = 1

View file

@ -18,74 +18,99 @@
class SetList (list):
"""a list that eliminates all duplicates"""
"""
A list that eliminates all duplicates.
"""
def append (self, x):
"""append only if not already there"""
if x not in self:
super(SetList, self).append(x)
def append (self, item):
"""
Append only if not already there.
"""
if item not in self:
super(SetList, self).append(item)
def extend (self, x):
"""extend while eliminating duplicates by appending item for item"""
for i in x:
self.append(i)
def extend (self, itemlist):
"""
Extend while eliminating duplicates by appending item for item.
"""
for item in itemlist:
self.append(item)
def insert (self, i, x):
"""insert only if not already there"""
if x not in self:
super(SetList, self).insert(i, x)
def insert (self, index, item):
"""
Insert item at given index only if it is not already there.
"""
if item not in self:
super(SetList, self).insert(index, item)
def __setitem__ (self, key, value):
"""set new value, and eliminate a possible duplicate value"""
# search index idx with self[i] == value
idx = -1
def __setitem__ (self, index, item):
"""
Set new value, and eliminate a possible duplicate value.
"""
# search index i with self[i] == item
delidx = -1
for i in range(len(self)):
if self[i] == value and i != key:
idx = i
if self[i] == item and i != index:
delidx = i
# break here, there can be only one duplicate
break
# insert new value
super(SetList, self).__setitem__(key, value)
if idx != -1:
super(SetList, self).__setitem__(index, item)
if delidx != -1:
# remove duplicate
del self[idx]
del self[delidx]
class ListDict (dict):
"""a dictionary whose iterators reflect the order in which elements
were added
"""
A dictionary whose iterators reflect the order in which elements
were added.
"""
def __init__ (self):
"""initialize sorted key list"""
"""
Initialize sorted key list.
"""
# sorted list of keys
self._keys = []
def __setitem__ (self, key, value):
"""add key,value to dict, append key to sorted list"""
"""
Add key,value to dict, append key to sorted list.
"""
if not self.has_key(key):
self._keys.append(key)
super(ListDict, self).__setitem__(key, value)
def __delitem__ (self, key):
"""remove key from dict"""
"""
Remove key from dict.
"""
self._keys.remove(key)
super(ListDict, self).__delitem__(key)
def values (self):
"""return sorted list of values"""
"""
Return sorted list of values.
"""
return [self[k] for k in self._keys]
def items (self):
"""return sorted list of items"""
"""
Return sorted list of items.
"""
return [(k, self[k]) for k in self._keys]
def keys (self):
"""return sorted list of keys"""
"""
Return sorted list of keys.
"""
return self._keys[:]
def itervalues (self):
"""return iterator over sorted values"""
"""
Return iterator over sorted values.
"""
return iter(self.values())
def iteritems (self):
@ -93,11 +118,15 @@ class ListDict (dict):
return iter(self.items())
def iterkeys (self):
"""return iterator over sorted keys"""
"""
Return iterator over sorted keys.
"""
return iter(self.keys())
def clear (self):
"""remove all dict entires"""
"""
Remove all dict entries.
"""
self._keys = []
super(ListDict, self).clear()
@ -111,20 +140,28 @@ class LRU (object):
"""
class Node (object):
"""internal node with pointers to sisters"""
"""
Internal node with pointers to sisters.
"""
def __init__ (self, prev, me):
"""initialize pointers and data"""
"""
Initialize pointers and data.
"""
self.prev = prev
self.me = me
self.next = None
def __len__ (self):
"""number of stored objects in the queue"""
"""
Number of stored objects in the queue.
"""
return len(self.d)
def __init__ (self, count, pairs=None):
"""make new queue with given maximum count, and key/value pairs"""
"""
Make new queue with given maximum count, and key/value pairs.
"""
self.count = max(count, 1)
self.d = {}
self.first = None
@ -134,21 +171,29 @@ class LRU (object):
self[key] = value
def __contains__ (self, obj):
"""look if obj is in the queue"""
"""
Look if obj is in the queue.
"""
return obj in self.d
def has_key (self, obj):
"""look if obj is in the queue"""
"""
Look if obj is in the queue.
"""
return self.d.has_key(obj)
def __getitem__ (self, obj):
"""get stored object data, and mark it as LRU"""
"""
Get stored object data, and mark it as LRU.
"""
a = self.d[obj].me
self[a[0]] = a[1]
return a[1]
def __setitem__ (self, obj, val):
"""set given object data, and mark it as LRU"""
"""
Set given object data, and mark it as LRU.
"""
if obj in self.d:
del self[obj]
nobj = self.Node(self.last, (obj, val))
@ -171,7 +216,9 @@ class LRU (object):
del a
def __delitem__ (self, obj):
"""remove object from queue"""
"""
Remove object from queue.
"""
nobj = self.d[obj]
if nobj.prev:
nobj.prev.next = nobj.next
@ -184,7 +231,9 @@ class LRU (object):
del self.d[obj]
def __iter__ (self):
"""iterate over stored object values"""
"""
Iterate over stored object values.
"""
cur = self.first
while cur != None:
cur2 = cur.next
@ -192,7 +241,9 @@ class LRU (object):
cur = cur2
def iteritems (self):
"""iterate over stored object items"""
"""
Iterate over stored object items.
"""
cur = self.first
while cur != None:
cur2 = cur.next
@ -200,21 +251,29 @@ class LRU (object):
cur = cur2
def iterkeys (self):
"""iterate over stored object keys"""
"""
Iterate over stored object keys.
"""
return iter(self.d)
def itervalues (self):
"""iterate over stored object values"""
"""
Iterate over stored object values.
"""
for i, j in self.iteritems():
yield j
def keys (self):
"""iterate over stored object keys"""
"""
Iterate over stored object keys.
"""
return self.d.keys()
def setdefault (self, key, failobj=None):
"""get given object data, and mark it as LRU. If it is not already
stored, store the given failobj."""
"""
Get given object data, and mark it as LRU. If it is not already
stored, store the given failobj.
"""
if not self.has_key(key):
self[key] = failobj
return self[key]

View file

@ -87,24 +87,32 @@ error = 'fcgi.error'
# anywhere at the moment
def _error (msg):
"""Append a string to /tmp/err"""
"""
Append a string to /tmp/err.
"""
errf = file('/tmp/err', 'a+')
errf.write(msg+'\n')
errf.close()
class Record (object):
"""Class representing FastCGI records"""
"""
Class representing FastCGI records.
"""
def __init__ (self):
"""initialize record data"""
"""
Initialize record data.
"""
self.version = FCGI_VERSION_1
self.rec_type = FCGI_UNKNOWN_TYPE
self.req_id = FCGI_NULL_REQUEST_ID
self.content = ""
def read_record (self, sock):
"""read a FastCGI record from socket"""
"""
Read a FastCGI record from socket.
"""
s = [ord(x) for x in sock.recv(8)]
self.version, self.rec_type, padding_length = s[0], s[1], s[6]
self.req_id, content_length = (s[2]<<8)+s[3], (s[4]<<8)+s[5]
@ -136,7 +144,9 @@ class Record (object):
self.protocolStatus = ord(c[4])
def write_record (self, sock):
"""write a FastCGI request to socket"""
"""
Write a FastCGI request to socket.
"""
content = self.content
if self.rec_type == FCGI_BEGIN_REQUEST:
content = chr(self.role>>8) + chr(self.role & 255) + \
@ -219,60 +229,79 @@ def HandleManTypes (r, conn):
class FastCGIWriter (object):
"""File-like object writing FastCGI requests. All read operations
return empty data.
"""
File-like object writing FastCGI requests. All read operations
return empty data.
"""
def __init__ (self, rec, conn):
"""initialize with given record and connection"""
"""
Initialize with given record and connection.
"""
self.record = rec
self.conn = conn
self.closed = False
def close (self):
"""close this writer"""
"""
Close this writer.
"""
if not self.closed:
self.closed = True
self.record.content = ""
self.record.write_record(self.conn)
def isatty (self):
"""returns False"""
"""
Returns False.
"""
if self.closed:
raise ValueError, "I/O operation on closed file"
return False
def seek (self, pos, mode=0):
"""does nothing"""
"""
Does nothing.
"""
if self.closed:
raise ValueError, "I/O operation on closed file"
def tell (self):
"""Return zero"""
"""
Return zero.
"""
if self.closed:
raise ValueError, "I/O operation on closed file"
return 0
def read (self, n=-1):
"""return empty string"""
"""
Return empty string.
"""
if self.closed:
raise ValueError, "I/O operation on closed file"
return ""
def readline (self, length=None):
"""return empty string"""
"""
Return empty string.
"""
if self.closed:
raise ValueError, "I/O operation on closed file"
return ""
def readlines (self):
"""return empty list"""
"""
Return empty list.
"""
if self.closed:
raise ValueError, "I/O operation on closed file"
return []
def write (self, s):
"""write data in record for record to connection"""
"""
Write data in record for record to connection.
"""
if self.closed:
raise ValueError, "I/O operation on closed file"
while s:
@ -281,17 +310,23 @@ class FastCGIWriter (object):
self.record.write_record(self.conn)
def get_next_chunk (self, data):
"""return tuple (chunk of data, newdata)"""
"""
Return tuple (chunk of data, newdata).
"""
chunk = data[:8192]
data = data[8192:]
return chunk, data
def writelines (self, lines):
"""write given lines to the connection"""
"""
Write given lines to the connection.
"""
self.write(''.join(lines))
def flush (self):
"""does nothing"""
"""
Does nothing.
"""
if self.closed:
raise ValueError, "I/O operation on closed file"

View file

@ -26,7 +26,9 @@ supported_languages = ['en']
default_language = None
def install_builtin (translator, do_unicode):
"""install _() and _n() gettext methods into default namespace"""
"""
Install _() and _n() gettext methods into default namespace.
"""
import __builtin__
if do_unicode:
__builtin__.__dict__['_'] = translator.ugettext
@ -38,19 +40,35 @@ def install_builtin (translator, do_unicode):
__builtin__.__dict__['_n'] = translator.ngettext
class Translator (gettext.GNUTranslations):
"""
A translation class always installing its gettext methods into the
default namespace.
"""
def install (self, do_unicode):
"""
Install gettext methods into the default namespace.
"""
install_builtin(self, do_unicode)
class NullTranslator (gettext.NullTranslations):
"""
A translation class always installing its gettext methods into the
default namespace.
"""
def install (self, do_unicode):
"""
Install gettext methods into the default namespace.
"""
install_builtin(self, do_unicode)
def init (domain, directory):
"""initialize this gettext i18n module"""
"""
Initialize this gettext i18n module.
"""
global default_language
# get supported languages
for lang in os.listdir(directory):
@ -74,7 +92,9 @@ def get_translator (domain, directory, languages=None,
translatorklass=Translator,
fallback=False,
fallbackklass=NullTranslator):
"""search the appropriate GNUTranslations class"""
"""
Search the appropriate GNUTranslations class.
"""
translator = gettext.translation(domain, localedir=directory,
languages=languages, class_=translatorklass, fallback=fallback)
if not isinstance(translator, gettext.GNUTranslations):
@ -83,14 +103,18 @@ def get_translator (domain, directory, languages=None,
def get_lang (lang):
"""return lang if it is supported, or the default language"""
"""
Return lang if it is supported, or the default language.
"""
if lang in supported_languages:
return lang
return default_language
def get_headers_lang (headers):
"""return preferred supported language in given HTTP headers"""
"""
Return preferred supported language in given HTTP headers.
"""
if not headers.has_key('Accept-Language'):
return default_language
languages = headers['Accept-Language'].split(",")
@ -103,7 +127,9 @@ def get_headers_lang (headers):
def get_locale ():
"""return current configured locale"""
"""
Return current configured locale.
"""
loc = locale.getdefaultlocale()[0]
if loc is None:
loc = 'C'
@ -131,10 +157,14 @@ lang_transis = {
}
def lang_name (lang):
"""return full name of given language"""
"""
Return full name of given language.
"""
return lang_names[lang]
def lang_trans (lang, curlang):
"""return translated full name of given language"""
"""
Return translated full name of given language.
"""
return lang_transis[lang][curlang]

View file

@ -34,19 +34,25 @@ _supported_langs = ('de', 'fr', 'nl', 'C')
_is_level = re.compile(r'^[0123]$').match
class FormError (Exception):
"""form related errors"""
"""
Form related errors.
"""
pass
def startoutput (out=sys.stdout):
"""print leading HTML headers to given output stream"""
"""
Print leading HTML headers to given output stream.
"""
out.write("Content-type: text/html\r\n"
"Cache-Control: no-cache\r\n"
"Pragma: no-cache\r\n"
"\r\n")
def checkaccess (out=sys.stdout, hosts=[], servers=[], env=os.environ):
"""see if remote addr is allowed to access the CGI interface"""
"""
See if remote addr is allowed to access the CGI interface.
"""
if os.environ.get('REMOTE_ADDR') in hosts and \
os.environ.get('SERVER_ADDR') in servers:
return True
@ -56,7 +62,9 @@ def checkaccess (out=sys.stdout, hosts=[], servers=[], env=os.environ):
def checklink (out=sys.stdout, form={}, env=os.environ):
"""main cgi function, check the given links and print out the result"""
"""
Main cgi function, check the given links and print out the result.
"""
try:
checkform(form)
except FormError, why:
@ -96,14 +104,17 @@ def checklink (out=sys.stdout, form={}, env=os.environ):
def get_host_name (form):
"""return host name of given URL"""
"""
Return host name of given URL.
"""
return urlparse.urlparse(form["url"].value)[1]
def checkform (form):
"""check form data. throw exception on error
Be sure to NOT print out any user-given data as HTML code, so use
only plain strings as exception text.
"""
Check form data. Throw exception on error.
Be sure to NOT print out any user-given data as HTML code, so use
only plain strings as exception text.
"""
# check lang support
if form.has_key("language"):
@ -134,7 +145,9 @@ def checkform (form):
raise FormError(_("invalid %s option syntax") % option)
def logit (form, env):
"""log form errors"""
"""
Log form errors.
"""
global _logfile
if not _logfile:
return
@ -151,7 +164,9 @@ def logit (form, env):
def print_error (out, why):
"""print standard error page"""
"""
Print standard error page.
"""
out.write(_("""<html><head>
<title>LinkChecker Online Error</title></head>
<body text=#192c83 bgcolor=#fff7e5 link=#191c83 vlink=#191c83 alink=#191c83>

View file

@ -28,12 +28,16 @@ img_re = re.compile(r"""(?i)<\s*img\s+("[^"\n]*"|'[^'\n]*'|[^>])+>""")
endtag_re = re.compile(r"""(?i)</a\s*>""")
def _unquote (txt):
"""resolve entities and markup from txt"""
"""
Resolve entities and markup from txt.
"""
return linkcheck.HtmlParser.resolve_entities(
linkcheck.strformat.remove_markup(txt))
def image_name (txt):
"""return the alt part of the first <img alt=""> tag in txt"""
"""
Return the alt part of the first <img alt=""> tag in txt.
"""
mo = imgtag_re.search(txt)
if mo:
name = linkcheck.strformat.unquote(mo.group('name').strip())
@ -42,7 +46,9 @@ def image_name (txt):
def href_name (txt):
"""return the name part of the first <a href="">name</a> link in txt"""
"""
Return the name part of the first <a href="">name</a> link in txt.
"""
name = u""
endtag = endtag_re.search(txt)
if not endtag:

View file

@ -61,12 +61,15 @@ refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$")
css_url_re = re.compile(ur"url\((?P<url>[^\)]+)\)")
class TagFinder (object):
"""Base class storing HTML parse messages in a list.
TagFinder instances are to be used as HtmlParser handlers.
"""
Base class storing HTML parse messages in a list.
TagFinder instances are to be used as HtmlParser handlers.
"""
def __init__ (self, content):
"""store content in buffer"""
"""
Store content in buffer.
"""
super(TagFinder, self).__init__()
self.content = content
# warnings and errors during parsing
@ -76,28 +79,40 @@ class TagFinder (object):
self.parser = None
def _errorfun (self, msg, name):
"""append msg to error list"""
"""
Append msg to error list.
"""
self.parse_info.append("%s at line %d col %d: %s" % \
(name, self.parser.last_lineno(), self.parser.last_column(), msg))
def warning (self, msg):
"""signal a filter/parser warning"""
"""
Signal a filter/parser warning.
"""
self._errorfun(msg, "warning")
def error (self, msg):
"""signal a filter/parser error"""
"""
Signal a filter/parser error.
"""
self._errorfun(msg, "error")
def fatal_error (self, msg):
"""signal a fatal filter/parser error"""
"""
Signal a fatal filter/parser error.
"""
self._errorfun(msg, "fatal error")
class MetaRobotsFinder (TagFinder):
"""class for finding robots.txt meta values in HTML"""
"""
Class for finding robots.txt meta values in HTML.
"""
def __init__ (self, content):
"""store content in buffer and initialize flags"""
"""
Store content in buffer and initialize flags.
"""
super(MetaRobotsFinder, self).__init__(content)
self.follow = True
self.index = True
@ -105,7 +120,9 @@ class MetaRobotsFinder (TagFinder):
def start_element (self, tag, attrs):
"""search for meta robots.txt "nofollow" and "noindex" flags"""
"""
Search for meta robots.txt "nofollow" and "noindex" flags.
"""
if tag == 'meta':
if attrs.get('name') == 'robots':
val = attrs.get('content', u'').lower().split(u',')
@ -114,13 +131,16 @@ class MetaRobotsFinder (TagFinder):
class LinkFinder (TagFinder):
"""Find a list of links. After parsing, self.urls
"""
Find a list of links. After parsing, self.urls
will be a list of parsed links entries with the format
(url, lineno, column, name, codebase)
(url, lineno, column, name, codebase).
"""
def __init__ (self, content, tags=None):
"""store content in buffer and initialize URL list"""
"""
Store content in buffer and initialize URL list.
"""
super(LinkFinder, self).__init__(content)
if tags is None:
self.tags = LinkTags
@ -131,7 +151,9 @@ class LinkFinder (TagFinder):
linkcheck.log.debug(linkcheck.LOG_CHECK, "link finder")
def start_element (self, tag, attrs):
"""search for links and store found URLs in a list"""
"""
Search for links and store found URLs in a list.
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "LinkFinder tag %s attrs %s",
tag, attrs)
linkcheck.log.debug(linkcheck.LOG_CHECK,
@ -160,7 +182,9 @@ class LinkFinder (TagFinder):
"LinkFinder finished tag %s", tag)
def get_link_name (self, tag, attrs, attr):
"""Parse attrs for link name. Return name of link"""
"""
Parse attrs for link name. Return name of link.
"""
if tag == 'a' and attr == 'href':
name = linkcheck.strformat.unquote(attrs.get('title', u''))
if not name:
@ -176,7 +200,9 @@ class LinkFinder (TagFinder):
return name
def add_link (self, tag, attr, url, name, base):
"""add given url data to url list"""
"""
Add given url data to url list.
"""
urls = []
# look for meta refresh
if tag == 'meta':

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""logging and debug functions"""
"""
Logging and debug functions.
"""
# Copyright (C) 2003-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -26,30 +28,54 @@ import logging
#gc.enable()
#gc.set_debug(gc.DEBUG_LEAK)
def debug (log, msg, *args):
"""log a debug message"""
"""
Log a debug message.
@return: C{None}
"""
logging.getLogger(log).debug(msg, *args)
def info (log, msg, *args):
"""log an informational message"""
"""
Log an informational message.
@return: C{None}
"""
logging.getLogger(log).info(msg, *args)
def warn (log, msg, *args):
"""log a warning"""
"""
Log a warning.
@return: C{None}
"""
logging.getLogger(log).warn(msg, *args)
def error (log, msg, *args):
"""log an error"""
"""
Log an error.
@return: C{None}
"""
logging.getLogger(log).error(msg, *args)
def critical (log, msg, *args):
"""log a critical error"""
"""
Log a critical error.
@return: C{None}
"""
logging.getLogger(log).critical(msg, *args)
def exception (log, msg, *args):
"""log an exception"""
"""
Log an exception.
@return: C{None}
"""
logging.getLogger(log).exception(msg, *args)

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
"""a csv logger"""
"""A CSV logger."""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -25,8 +25,8 @@ import linkcheck.configuration
class CSVLogger (linkcheck.logger.Logger):
""" CSV output. CSV consists of one line per entry. Entries are
separated by a semicolon.
"""CSV output, consisting of one line per entry. Entries are
separated by a semicolon.
"""
def __init__ (self, **args):
"""store default separator and (os dependent) line terminator"""

View file

@ -18,7 +18,12 @@ _scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
'KB': 1024.0, 'MB': 1024.0*1024.0}
def _VmB (VmKey):
'''Parse /proc/<pid>/status file for given key.'''
"""
Parse /proc/<pid>/status file for given key.
@return: requested number value of status entry
@rtype: C{float}
"""
if os.name != 'posix':
# not supported
return 0.0
@ -41,15 +46,30 @@ def _VmB (VmKey):
def memory (since=0.0):
'''Return memory usage in bytes.'''
"""
Get memory usage.
@return: memory usage in bytes
@rtype: C{float}
"""
return _VmB('VmSize:') - since
def resident (since=0.0):
'''Return resident memory usage in bytes.'''
"""
Get resident memory usage.
@return: resident memory usage in bytes
@rtype: C{float}
"""
return _VmB('VmRSS:') - since
def stacksize (since=0.0):
'''Return stack size in bytes.'''
"""
Get stack size.
@return: stack size in bytes
@rtype: C{float}
"""
return _VmB('VmStk:') - since

View file

@ -1,14 +1,24 @@
""" robotparser.py
Copyright (C) 2000-2005 Bastian Kleineidam
You can choose between two licenses when using this package:
1) GNU GPLv2
2) PSF license for Python 2.2
The robots.txt Exclusion Protocol is implemented as specified in
http://www.robotstxt.org/wc/norobots-rfc.html
"""
robotparser.py
The robots.txt Exclusion Protocol is implemented as specified in
http://www.robotstxt.org/wc/norobots-rfc.html
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import urlparse
import httplib
import urllib
@ -27,34 +37,60 @@ __all__ = ["RobotFileParser"]
_debug = False
def _msg (prefix, msg):
"""print debug message"""
"""
Print given prefix and debug message to stderr if the _debug flag is
set.
@return: C{None}
"""
if _debug:
print >> sys.stderr, prefix, msg
# methods for debug, warning and error messages
debug = lambda txt: _msg("debug:", txt)
warn = lambda txt: _msg("warning:", txt)
error = lambda txt: _msg("error:", txt)
class PasswordManager (object):
"""
Simple password manager storing username and password. Suitable
for use as an AuthHandler instance in urllib2.
"""
def __init__ (self, user, password):
"""
Store given username and password.
"""
self.user = user
self.password = password
def add_password (self, realm, uri, user, passwd):
# we have already our password
"""
Does nothing since username and password are already stored.
@return: C{None}
"""
pass
def find_user_password (self, realm, authuri):
"""
Get stored username and password.
@return: A tuple (user, password)
@rtype: C{tuple}
"""
return self.user, self.password
class RobotFileParser (object):
""" This class provides a set of methods to read, parse and answer
"""
This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
"""
def __init__ (self, url='', user=None, password=None):
"""Initialize internal entry lists and store given url and
"""
Initialize internal entry lists and store given url and
credentials.
"""
self.set_url(url)
@ -63,7 +99,11 @@ class RobotFileParser (object):
self._reset()
def _reset (self):
"""reset internal entry lists"""
"""
Reset internal flags and entry lists.
@return: C{None}
"""
self.entries = []
self.default_entry = None
self.disallow_all = False
@ -71,28 +111,47 @@ class RobotFileParser (object):
self.last_checked = 0
def mtime (self):
"""Returns the time the robots.txt file was last fetched.
"""
Returns the time the robots.txt file was last fetched.
This is useful for long-running web spiders that need to
check for new robots.txt files periodically.
@return: last modified in time.time() format
@rtype: C{number}
"""
return self.last_checked
def modified (self):
"""Sets the time the robots.txt file was last fetched to the
current time.
"""
Sets the time the robots.txt file was last fetched to the
current time.
@return: C{None}
"""
import time
self.last_checked = time.time()
def set_url (self, url):
"""Sets the URL referring to a robots.txt file."""
"""
Sets the URL referring to a robots.txt file.
@return: C{None}
"""
self.url = url
self.host, self.path = urlparse.urlparse(url)[1:3]
def get_opener (self):
"""
Construct an URL opener object. It considers the given credentials
from the __init__() method and supports proxies.
@return: URL opener
@rtype: C{urllib2.OpenerDirector}
"""
pwd_manager = PasswordManager(self.user, self.password)
handlers = [urllib2.ProxyHandler(urllib.getproxies()),
handlers = [
urllib2.ProxyHandler(urllib.getproxies()),
urllib2.UnknownHandler,
HttpWithGzipHandler,
urllib2.HTTPBasicAuthHandler(pwd_manager),
@ -107,7 +166,11 @@ class RobotFileParser (object):
return urllib2.build_opener(*handlers)
def read (self):
"""Reads the robots.txt URL and feeds it to the parser."""
"""
Reads the robots.txt URL and feeds it to the parser.
@return: C{None}
"""
self._reset()
headers = {
'User-Agent': 'Python RobotFileParser/2.1',
@ -149,7 +212,11 @@ class RobotFileParser (object):
self.parse(lines)
def _add_entry (self, entry):
"""add entry to entry list"""
"""
Add a parsed entry to entry list.
@return: C{None}
"""
if "*" in entry.useragents:
# the default entry is considered last
self.default_entry = entry
@ -157,9 +224,12 @@ class RobotFileParser (object):
self.entries.append(entry)
def parse (self, lines):
"""parse the input lines from a robot.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines.
"""
Parse the input lines from a robots.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines.
@return: C{None}
"""
debug("robots.txt parse lines")
state = 0
@ -193,8 +263,8 @@ class RobotFileParser (object):
if line[0] == "user-agent":
if state == 2:
warn("line %d: you should insert a blank"
" line before any user-agent"
" directive" % linenumber)
" line before any user-agent"
" directive" % linenumber)
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
@ -221,7 +291,12 @@ class RobotFileParser (object):
debug("Parsed rules:\n%s" % str(self))
def can_fetch (self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
"""
Using the parsed robots.txt decide if useragent can fetch url.
@return: True if agent can fetch url, else False
@rtype: C{bool}
"""
debug("Checking robot.txt allowance for:\n"\
" user agent: %r\n url: %r" % (useragent, url))
if not isinstance(useragent, str):
@ -245,7 +320,13 @@ class RobotFileParser (object):
return True
def __str__ (self):
"""return string representation in robots.txt format"""
"""
Constructs string representation, usable as contents of a
robots.txt file.
@return: robots.txt format
@rtype: C{string}
"""
lines = [str(entry) for entry in self.entries]
if self.default_entry is not None:
lines.append(str(self.default_entry))
@ -253,11 +334,15 @@ class RobotFileParser (object):
class RuleLine (object):
"""A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path."""
"""
A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path.
"""
def __init__ (self, path, allowance):
"""initialize with given path and allowance info"""
"""
Initialize with given path and allowance info.
"""
if path == '' and not allowance:
# an empty value means allow all
allowance = True
@ -265,30 +350,54 @@ class RuleLine (object):
self.allowance = allowance
def applies_to (self, path):
"""return True if pathname applies to this rule"""
"""
Look if given path applies to this rule.
@return: True if pathname applies to this rule, else False
@rtype: C{bool}
"""
return self.path == "*" or path.startswith(self.path)
def __str__ (self):
"""return string representation in robots.txt format"""
"""
Construct string representation in robots.txt format.
@return: robots.txt format
@rtype: C{string}
"""
return (self.allowance and "Allow" or "Disallow")+": "+self.path
class Entry (object):
"""An entry has one or more user-agents and zero or more rulelines"""
"""
An entry has one or more user-agents and zero or more rulelines.
"""
def __init__ (self):
"""initialize user agent and rule list"""
"""
Initialize user agent and rule list.
"""
self.useragents = []
self.rulelines = []
def __str__ (self):
"""return string representation in robots.txt format"""
"""
String representation in robots.txt format.
@return: robots.txt format
@rtype: C{string}
"""
lines = ["User-agent: %r" % agent for agent in self.useragents]
lines.extend([str(line) for line in self.rulelines])
return "\n".join(lines)
def applies_to (self, useragent):
"""check if this entry applies to the specified agent"""
"""
Check if this entry applies to the specified agent.
@return: True if this entry applies to the agent, else False.
@rtype: C{bool}
"""
# split the name token and make it lower case
if not useragent:
return True
@ -303,9 +412,16 @@ class Entry (object):
return False
def allowance (self, filename):
"""Preconditions:
"""
Preconditions:
- our agent applies to this entry
- filename is URL decoded"""
- filename is URL decoded
Check if given filename is allowed to access this entry.
@return: True if allowed, else False
@rtype: C{bool}
"""
for line in self.rulelines:
debug("%s %s %s" % (filename, str(line), line.allowance))
if line.applies_to(filename):
@ -334,7 +450,9 @@ class Entry (object):
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
## SOFTWARE.
def decode (page):
"""gunzip or deflate a compressed page"""
"""
Gunzip or deflate a compressed page.
"""
debug("robots.txt page info %s" % str(page.info()))
encoding = page.info().get("Content-Encoding")
if encoding in ('gzip', 'x-gzip', 'deflate'):
@ -367,19 +485,26 @@ def decode (page):
class HttpWithGzipHandler (urllib2.HTTPHandler):
"support gzip encoding"
"""
Support gzip encoding.
"""
def http_open (self, req):
"""send request and decode answer"""
"""
Send request and decode answer.
"""
return decode(urllib2.HTTPHandler.http_open(self, req))
if hasattr(linkcheck.httplib2, 'HTTPS'):
class HttpsWithGzipHandler (urllib2.HTTPSHandler):
"support gzip encoding"
"""
Support gzip encoding.
"""
def http_open (self, req):
"""send request and decode answer"""
"""
Send request and decode answer.
"""
return decode(urllib2.HTTPSHandler.http_open(self, req))
# end of urlutils.py routines

View file

@ -1,6 +1,8 @@
# -*- coding: iso-8859-1 -*-
"""Various string utility functions. Note that these functions are not
necessarily optimised for large strings, so use with care."""
"""
Various string utility functions. Note that these functions are not
necessarily optimised for large strings, so use with care.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -25,7 +27,13 @@ import time
def unquote (s):
"""if string s is not empty, strip quotes from s"""
"""
Remove leading and ending quotes.
@return: if s evaluates to False, return s as is, else return
string with stripped quotes
@rtype: string or type of s if evaluating to False
"""
if not s:
return s
if len(s) < 2:
@ -43,9 +51,10 @@ _para_win = r"(?:%(sep)s)(?:(?:%(sep)s)\s*)+" % {'sep': '\r\n'}
_para_ro = re.compile("%s|%s|%s" % (_para_mac, _para_posix, _para_win))
def get_paragraphs (text):
"""A new paragraph is considered to start at a line which follows
one or more blank lines (lines containing nothing or just spaces).
The first line of the text also starts a paragraph.
"""
A new paragraph is considered to start at a line which follows
one or more blank lines (lines containing nothing or just spaces).
The first line of the text also starts a paragraph.
"""
if not text:
return []
@ -53,10 +62,11 @@ def get_paragraphs (text):
def wrap (text, width, **kwargs):
"""Adjust lines of text to be not longer than width. The text will be
returned unmodified if width <= 0.
See textwrap.wrap() for a list of supported kwargs.
Returns text with lines no longer than given width.
"""
Adjust lines of text to be not longer than width. The text will be
returned unmodified if width <= 0.
See textwrap.wrap() for a list of supported kwargs.
Returns text with lines no longer than given width.
"""
if width <= 0 or not text:
return text
@ -67,8 +77,10 @@ def wrap (text, width, **kwargs):
def get_line_number (s, index):
"""Return the line number of s[index]. Lines are assumed to be separated
by the ASCII character '\\n'"""
"""
Return the line number of s[index]. Lines are assumed to be separated
by the ASCII character '\\n'.
"""
i = 0
if index < 0:
index = 0
@ -81,7 +93,9 @@ def get_line_number (s, index):
def paginate (text, lines=22):
"""print text in pages of lines"""
"""
Print text in pages of lines.
"""
curline = 1
for line in text.splitlines():
print line
@ -96,7 +110,9 @@ def paginate (text, lines=22):
_markup_re = re.compile("<.*?>", re.DOTALL)
def remove_markup (s):
"""remove all <*> html markup tags from s"""
"""
Remove all <*> html markup tags from s.
"""
mo = _markup_re.search(s)
while mo:
s = s[0:mo.start()] + s[mo.end():]
@ -105,8 +121,9 @@ def remove_markup (s):
def strsize (b):
"""Return human representation of bytes b. A negative number of bytes
raises a value error.
"""
Return human representation of bytes b. A negative number of bytes
raises a value error.
"""
if b < 0:
raise ValueError("Invalid negative byte number")
@ -125,13 +142,17 @@ def strsize (b):
def strtime (t):
"""return ISO 8601 formatted time"""
"""
Return ISO 8601 formatted time.
"""
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
strtimezone()
def strduration (duration):
"""return translated and formatted time duration"""
"""
Return translated and formatted time duration.
"""
name = _("seconds")
if duration > 60:
duration = duration / 60
@ -143,7 +164,9 @@ def strduration (duration):
def strtimezone ():
"""return timezone info, %z on some platforms, but not supported on all"""
"""
Return timezone info, %z on some platforms, but not supported on all.
"""
if time.daylight:
zone = time.altzone
else:

View file

@ -23,11 +23,14 @@ except ImportError:
class Threader (object):
"""A thread generating class"""
"""
A thread generating class.
"""
def __init__ (self, num=5):
"""store maximum number of threads to generate, and initialize
an empty thread list
"""
Store maximum number of threads to generate, and initialize
an empty thread list.
"""
# this allows negative numbers
self.threads_max = max(num, 1)
@ -35,32 +38,44 @@ class Threader (object):
self.threads = []
def _acquire (self):
"""Wait until we are allowed to start a new thread"""
"""
Wait until we are allowed to start a new thread.
"""
while self.active_threads() >= self.threads_max:
self._reduce_threads()
time.sleep(0.1)
def _reduce_threads (self):
"""remove inactive threads"""
"""
Remove inactive threads.
"""
self.threads = [ t for t in self.threads if t.isAlive() ]
def active_threads (self):
"""return number of active threads"""
"""
Return number of active threads.
"""
return len(self.threads)
def finished (self):
"""return True if no active threads are left"""
"""
Return True if no active threads are left.
"""
if self.threads_max > 0:
self._reduce_threads()
return self.active_threads() == 0
def finish (self):
"""remove inactive threads"""
"""
Remove inactive threads.
"""
self._reduce_threads()
# XXX don't know how to stop a thread
def start_thread (self, func, args):
"""Generate a new thread"""
"""
Generate a new thread.
"""
if self.threads_max < 1:
# threading is disabled
func(*args)
@ -71,6 +86,8 @@ class Threader (object):
self.threads.append(t)
def __str__ (self):
"""string representation of threader state"""
"""
String representation of threader state.
"""
return "Threader with %d threads (max %d)" % \
(self.active_threads(), self.threads_max)

View file

@ -1,5 +1,7 @@
# -*- coding: iso-8859-1 -*-
"""url utils"""
"""
Functions for parsing and matching URL strings.
"""
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
@ -70,8 +72,9 @@ is_safe_fragment = re.compile("(?i)^%s$" % _safe_fragment_pattern).match
# snatched form urlparse.py
def splitparams (path):
"""Split off parameter part from path.
Returns tuple (path-without-param, param)
"""
Split off parameter part from path.
Returns tuple (path-without-param, param)
"""
if '/' in path:
i = path.find(';', path.rfind('/'))
@ -83,7 +86,9 @@ def splitparams (path):
def is_safe_js_url (urlstr):
"""test javascript URLs"""
"""
Test javascript URL strings.
"""
url = list(urlparse.urlsplit(urlstr))
if url[0].lower() != 'http':
return False
@ -103,7 +108,9 @@ def is_safe_js_url (urlstr):
def is_numeric_port (portstr):
"""return True iff portstr is a valid port number"""
"""
Return True iff portstr is a valid port number.
"""
if portstr.isdigit():
port = int(portstr)
# 65536 == 2**16
@ -112,20 +119,25 @@ def is_numeric_port (portstr):
def safe_host_pattern (host):
"""return regular expression pattern with given host for URL testing"""
"""
Return regular expression pattern with given host for URL testing.
"""
return "(?i)%s://%s%s(#%s)?" % \
(_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern)
# XXX better name/implementation for this function
def stripsite (url):
"""remove scheme and host from URL. return host, newurl"""
"""
Remove scheme and host from URL. Return host, newurl.
"""
url = urlparse.urlsplit(url)
return url[1], urlparse.urlunsplit((0, 0, url[2], url[3], url[4]))
def parse_qsl (qs, keep_blank_values=0, strict_parsing=0):
"""Parse a query given as a string argument.
"""
Parse a query given as a string argument.
Arguments:
@ -166,8 +178,10 @@ def parse_qsl (qs, keep_blank_values=0, strict_parsing=0):
def idna_encode (host):
"""Encode hostname as internationalized domain name (IDN) according
to RFC 3490."""
"""
Encode hostname as internationalized domain name (IDN) according
to RFC 3490.
"""
if host and isinstance(host, unicode):
uhost = host.encode('idna').decode('ascii')
return uhost, uhost != host
@ -175,7 +189,9 @@ def idna_encode (host):
def url_fix_host (urlparts):
"""Unquote and fix hostname. Returns is_idn."""
"""
Unquote and fix hostname. Returns is_idn.
"""
urlparts[1], is_idn = idna_encode(urllib.unquote(urlparts[1]).lower())
# a leading backslash in path causes urlsplit() to add the
# path components up to the first slash to host
@ -216,7 +232,9 @@ def url_fix_host (urlparts):
def url_fix_common_typos (url):
"""Fix common typos in given URL like forgotten colon."""
"""
Fix common typos in given URL like forgotten colon.
"""
if url.startswith("http//"):
url = "http://" + url[6:]
elif url.startswith("https//"):
@ -225,13 +243,17 @@ def url_fix_common_typos (url):
def url_fix_mailto_urlsplit (urlparts):
"""Split query part of mailto url if found."""
"""
Split query part of mailto url if found.
"""
if "?" in urlparts[2]:
urlparts[2], urlparts[3] = urlparts[2].split('?', 1)
def url_parse_query (query):
"""Parse and re-join the given CGI query."""
"""
Parse and re-join the given CGI query.
"""
# if ? is in the query, split it off, seen at msdn.microsoft.com
if '?' in query:
query, append = query.split('?', 1)
@ -253,10 +275,12 @@ def url_parse_query (query):
def url_norm (url):
"""Normalize the given URL which must be quoted. Supports unicode
hostnames (IDNA encoding) according to RFC 3490.
"""
Normalize the given URL which must be quoted. Supports unicode
hostnames (IDNA encoding) according to RFC 3490.
@return (normed url, idna flag)
@return: (normed url, idna flag)
@rtype: C{tuple} of length two
"""
urlparts = list(urlparse.urlsplit(url))
# scheme
@ -293,8 +317,9 @@ _samedir_ro = re.compile(r"/\./|/\.$")
_parentdir_ro = re.compile(r"^/(\.\./)+|/(?!\.\./)[^/]+/\.\.(/|$)")
_relparentdir_ro = re.compile(r"^(?!\.\./)[^/]+/\.\.(/|$)")
def collapse_segments (path):
"""Remove all redundant segments from the given URL path.
Precondition: path is an unquoted url path
"""
Remove all redundant segments from the given URL path.
Precondition: path is an unquoted url path
"""
# replace backslashes
# note: this is _against_ the specification (which would require
@ -329,7 +354,9 @@ url_is_absolute = re.compile("^[a-z]+:", re.I).match
def url_quote (url):
"""quote given URL"""
"""
Quote given URL.
"""
if not url_is_absolute(url):
return document_quote(url)
urlparts = list(urlparse.urlsplit(url))
@ -351,7 +378,9 @@ def url_quote (url):
def document_quote (document):
"""quote given document"""
"""
Quote given document.
"""
doc, query = urllib.splitquery(document)
doc = urllib.quote(doc, '/=,')
if query:
@ -360,15 +389,18 @@ def document_quote (document):
def match_url (url, domainlist):
"""return True if host part of url matches an entry in given domain
list"""
"""
Return True if host part of url matches an entry in given domain list.
"""
if not url:
return False
return match_host(url_split(url)[1], domainlist)
def match_host (host, domainlist):
"""return True if host matches an entry in given domain list"""
"""
Return True if host matches an entry in given domain list.
"""
if not host:
return False
for domain in domainlist:
@ -386,10 +418,11 @@ if os.name == 'nt':
_safe_url_chars = _nopathquote_chars + r"a-zA-Z0-9_:\.&#%\?"
_safe_url_chars_ro = re.compile(r"^[%s]*$" % _safe_url_chars)
def url_needs_quoting (url):
"""Check if url needs percent quoting. Note that the method does
only check basic character sets, and not any other syntax.
The URL might still be syntactically incorrect even when
it is properly quoted.
"""
Check if url needs percent quoting. Note that the method does
only check basic character sets, and not any other syntax.
The URL might still be syntactically incorrect even when
it is properly quoted.
"""
if url.rstrip() != url:
# handle trailing whitespace as a special case
@ -399,9 +432,10 @@ def url_needs_quoting (url):
def url_split (url):
"""Split url in a tuple (scheme, hostname, port, document) where
hostname is always lowercased.
Precondition: url is syntactically correct URI (eg has no whitespace)
"""
Split url in a tuple (scheme, hostname, port, document) where
hostname is always lowercased.
Precondition: url is syntactically correct URI (eg has no whitespace)
"""
scheme, netloc = urllib.splittype(url)
host, document = urllib.splithost(netloc)
@ -413,5 +447,7 @@ def url_split (url):
def url_unicode_split (url):
"""Like urlparse.urlsplit(), but always returning unicode parts."""
"""
Like urlparse.urlsplit(), but always returning unicode parts.
"""
return [unicode(s) for s in urlparse.urlsplit(url)]