mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-17 21:01:00 +00:00
documentation updates
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2148 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
f779f065f9
commit
700d564be7
17 changed files with 726 additions and 261 deletions
|
|
@ -39,12 +39,16 @@ lognamelist = ", ".join(["%r"%name for name in lognames.keys()])
|
|||
|
||||
|
||||
class LinkCheckerError (Exception):
|
||||
"""exception to be raised on linkchecker-specific check errors"""
|
||||
"""
|
||||
Exception to be raised on linkchecker-specific check errors.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def get_link_pat (arg, strict=False):
|
||||
"""get a link pattern matcher for intern/extern links"""
|
||||
"""
|
||||
Get a link pattern matcher for intern/extern links.
|
||||
"""
|
||||
linkcheck.log.debug(LOG_CHECK, "Link pattern %r", arg)
|
||||
if arg[0:1] == '!':
|
||||
pattern = arg[1:]
|
||||
|
|
@ -86,8 +90,12 @@ LoggerKeys = ", ".join(["%r"%name for name in Loggers.keys()])
|
|||
|
||||
|
||||
def init_i18n ():
|
||||
"""Initialize i18n with the configured locale dir. The environment
|
||||
variable LOCPATH can also specify a locale dir."""
|
||||
"""
|
||||
Initialize i18n with the configured locale dir. The environment
|
||||
variable LOCPATH can also specify a locale dir.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
locdir = os.environ.get('LOCPATH')
|
||||
if locdir is None:
|
||||
locdir = os.path.join(configdata.install_data, 'share', 'locale')
|
||||
|
|
|
|||
|
|
@ -134,7 +134,9 @@ AnsiReset = esc_ansicolor("default")
|
|||
|
||||
|
||||
def has_colors (fp):
|
||||
"""see if given file is an ANSI color enabled tty"""
|
||||
"""
|
||||
Test if given file is an ANSI color enabled tty.
|
||||
"""
|
||||
# note: the isatty() function ensures that we do not colorize
|
||||
# redirected streams, as this is almost never what we want
|
||||
if hasattr(fp, "isatty") and fp.isatty():
|
||||
|
|
@ -155,8 +157,9 @@ def has_colors (fp):
|
|||
|
||||
|
||||
def has_colors_nt ():
|
||||
"""windows has no curses; check if running in an environment
|
||||
which supports ANSI colors
|
||||
"""
|
||||
Check if running in a Windows environment which supports ANSI colors.
|
||||
Do this by searching for a loaded ANSI driver.
|
||||
"""
|
||||
_in = None
|
||||
_out = None
|
||||
|
|
@ -176,7 +179,10 @@ def has_colors_nt ():
|
|||
|
||||
|
||||
def colorize (text, color=None):
|
||||
"""return text colorized if color is given"""
|
||||
"""
|
||||
Colorize text with given color. If color is C{None}, leave the
|
||||
text as-is.
|
||||
"""
|
||||
if color is not None:
|
||||
return '%s%s%s' % (esc_ansicolor(color), text, AnsiReset)
|
||||
else:
|
||||
|
|
@ -184,17 +190,22 @@ def colorize (text, color=None):
|
|||
|
||||
|
||||
class Colorizer (object):
|
||||
"""prints colored messages to streams"""
|
||||
"""
|
||||
Prints colored messages to streams.
|
||||
"""
|
||||
|
||||
def __init__ (self, fp):
|
||||
"""initialize with given stream (file-like object)"""
|
||||
"""
|
||||
Initialize with given stream (file-like object).
|
||||
"""
|
||||
super(Colorizer, self).__init__()
|
||||
self._fp = fp
|
||||
|
||||
def write (self, s, color=None):
|
||||
"""Writes message s in color if output stream is
|
||||
a console stream (stderr or stdout).
|
||||
Else writes without color (i.e. black/white).
|
||||
"""
|
||||
Writes message s in color if output stream is
|
||||
a console stream (stderr or stdout).
|
||||
Else writes without color (i.e. black/white).
|
||||
"""
|
||||
if has_colors(self._fp):
|
||||
# stdout or stderr can be colorized
|
||||
|
|
@ -202,7 +213,9 @@ class Colorizer (object):
|
|||
self._fp.write(s)
|
||||
|
||||
def __getattr__ (self, name):
|
||||
"""delegate attribute access to the stored stream object"""
|
||||
"""
|
||||
Delegate attribute access to the stored stream object.
|
||||
"""
|
||||
return getattr(self._fp, name)
|
||||
|
||||
|
||||
|
|
@ -210,8 +223,9 @@ class ColoredStreamHandler (logging.StreamHandler, object):
|
|||
"""Send colored log messages to streams (file-like objects)."""
|
||||
|
||||
def __init__ (self, strm=None):
|
||||
"""Log to given stream (a file-like object) or to stderr if
|
||||
strm is None.
|
||||
"""
|
||||
Log to given stream (a file-like object) or to stderr if
|
||||
strm is None.
|
||||
"""
|
||||
super(ColoredStreamHandler, self).__init__(strm=strm)
|
||||
self.stream = Colorizer(self.stream)
|
||||
|
|
@ -224,7 +238,9 @@ class ColoredStreamHandler (logging.StreamHandler, object):
|
|||
}
|
||||
|
||||
def get_color (self, record):
|
||||
"""get appropriate color according to log level"""
|
||||
"""
|
||||
Get appropriate color according to log level.
|
||||
"""
|
||||
return self.colors.get(record.levelno, 'default')
|
||||
|
||||
def emit (self, record):
|
||||
|
|
|
|||
|
|
@ -97,8 +97,9 @@ class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
|
|||
else:
|
||||
self.url_connection.login(_user, _password)
|
||||
except EOFError, msg:
|
||||
msg = str(msg)
|
||||
raise linkcheck.LinkCheckerError(
|
||||
_("Remote host has closed connection")+": "+msg)
|
||||
_("Remote host has closed connection: %s") % msg)
|
||||
if not self.url_connection.getwelcome():
|
||||
self.close_connection()
|
||||
raise linkcheck.LinkCheckerError(
|
||||
|
|
|
|||
|
|
@ -48,18 +48,23 @@ distribution."""
|
|||
|
||||
|
||||
def norm (path):
|
||||
"""norm given system path with all available norm funcs in os.path"""
|
||||
"""
|
||||
Norm given system path with all available norm funcs in os.path.
|
||||
"""
|
||||
return os.path.normcase(os.path.normpath(os.path.expanduser(path)))
|
||||
|
||||
|
||||
# dynamic options
|
||||
class Configuration (dict):
|
||||
"""Storage for configuration options. Options can both be given from
|
||||
the command line as well as from configuration files.
|
||||
"""
|
||||
Storage for configuration options. Options can both be given from
|
||||
the command line as well as from configuration files.
|
||||
"""
|
||||
|
||||
def __init__ (self):
|
||||
"""Initialize the default options"""
|
||||
"""
|
||||
Initialize the default options.
|
||||
"""
|
||||
super(Configuration, self).__init__()
|
||||
self["verbose"] = False
|
||||
self["warnings"] = False
|
||||
|
|
@ -130,12 +135,13 @@ class Configuration (dict):
|
|||
self["threads"] = 10
|
||||
|
||||
def init_logging (self, debug=None):
|
||||
"""Load logging.conf file settings to set up the
|
||||
application logging (not to be confused with check loggers).
|
||||
When debug is not None it is expected to be a list of
|
||||
logger names for which debugging will be enabled.
|
||||
"""
|
||||
Load logging.conf file settings to set up the
|
||||
application logging (not to be confused with check loggers).
|
||||
When debug is not None it is expected to be a list of
|
||||
logger names for which debugging will be enabled.
|
||||
|
||||
Activating debugging disables threading.
|
||||
Activating debugging disables threading.
|
||||
"""
|
||||
config_dir = _linkchecker_configdata.config_dir
|
||||
filename = norm(os.path.join(config_dir, "logging.conf"))
|
||||
|
|
@ -154,21 +160,27 @@ class Configuration (dict):
|
|||
logging.getLogger(name).setLevel(logging.DEBUG)
|
||||
|
||||
def logger_new (self, loggertype, **kwargs):
|
||||
"""instantiate new logger and return it"""
|
||||
"""
|
||||
Instantiate new logger and return it.
|
||||
"""
|
||||
args = {}
|
||||
args.update(self[loggertype])
|
||||
args.update(kwargs)
|
||||
return linkcheck.Loggers[loggertype](**args)
|
||||
|
||||
def logger_add (self, loggertype, loggerclass, loggerargs=None):
|
||||
"""add a new logger type to the known loggers"""
|
||||
"""
|
||||
Add a new logger type to the known loggers.
|
||||
"""
|
||||
if loggerargs is None:
|
||||
loggerargs = {}
|
||||
linkcheck.Loggers[loggertype] = loggerclass
|
||||
self[loggertype] = loggerargs
|
||||
|
||||
def read (self, files=None):
|
||||
"""read settings from given config files"""
|
||||
"""
|
||||
Read settings from given config files.
|
||||
"""
|
||||
if files is None:
|
||||
cfiles = []
|
||||
else:
|
||||
|
|
@ -184,7 +196,9 @@ class Configuration (dict):
|
|||
self['logger'] = self.logger_new('text')
|
||||
|
||||
def read_config (self, files):
|
||||
"""read all the configuration parameters from the given files"""
|
||||
"""
|
||||
Read all the configuration parameters from the given files.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"reading configuration from %s", files)
|
||||
try:
|
||||
|
|
@ -199,7 +213,9 @@ class Configuration (dict):
|
|||
self.read_filtering_config(cfgparser)
|
||||
|
||||
def read_output_config (self, cfgparser):
|
||||
"""read configuration options in section "output"."""
|
||||
"""
|
||||
Read configuration options in section "output".
|
||||
"""
|
||||
section = "output"
|
||||
for key in linkcheck.Loggers.keys():
|
||||
if cfgparser.has_section(key):
|
||||
|
|
@ -257,7 +273,9 @@ class Configuration (dict):
|
|||
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
|
||||
|
||||
def read_checking_config (self, cfgparser):
|
||||
"""read configuration options in section "checking"."""
|
||||
"""
|
||||
Read configuration options in section "checking".
|
||||
"""
|
||||
section = "checking"
|
||||
try:
|
||||
num = cfgparser.getint(section, "threads")
|
||||
|
|
@ -303,7 +321,9 @@ class Configuration (dict):
|
|||
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
|
||||
|
||||
def read_authentication_config (self, cfgparser):
|
||||
"""read configuration options in section "authentication"."""
|
||||
"""
|
||||
Read configuration options in section "authentication".
|
||||
"""
|
||||
section = "authentication"
|
||||
try:
|
||||
i = 1
|
||||
|
|
@ -320,7 +340,9 @@ class Configuration (dict):
|
|||
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
|
||||
|
||||
def read_filtering_config (self, cfgparser):
|
||||
"""read configuration options in section "filtering"."""
|
||||
"""
|
||||
Read configuration options in section "filtering".
|
||||
"""
|
||||
section = "filtering"
|
||||
try:
|
||||
i = 1
|
||||
|
|
|
|||
|
|
@ -18,74 +18,99 @@
|
|||
|
||||
|
||||
class SetList (list):
|
||||
"""a list that eliminates all duplicates"""
|
||||
"""
|
||||
A list that eliminates all duplicates.
|
||||
"""
|
||||
|
||||
def append (self, x):
|
||||
"""append only if not already there"""
|
||||
if x not in self:
|
||||
super(SetList, self).append(x)
|
||||
def append (self, item):
|
||||
"""
|
||||
Append only if not already there.
|
||||
"""
|
||||
if item not in self:
|
||||
super(SetList, self).append(item)
|
||||
|
||||
def extend (self, x):
|
||||
"""extend while eliminating duplicates by appending item for item"""
|
||||
for i in x:
|
||||
self.append(i)
|
||||
def extend (self, itemlist):
|
||||
"""
|
||||
Extend while eliminating duplicates by appending item for item.
|
||||
"""
|
||||
for item in itemlist:
|
||||
self.append(item)
|
||||
|
||||
def insert (self, i, x):
|
||||
"""insert only if not already there"""
|
||||
if x not in self:
|
||||
super(SetList, self).insert(i, x)
|
||||
def insert (self, index, item):
|
||||
"""
|
||||
Insert item at given index only if it is not already there.
|
||||
"""
|
||||
if item not in self:
|
||||
super(SetList, self).insert(index, item)
|
||||
|
||||
def __setitem__ (self, key, value):
|
||||
"""set new value, and eliminate a possible duplicate value"""
|
||||
# search index idx with self[i] == value
|
||||
idx = -1
|
||||
def __setitem__ (self, index, item):
|
||||
"""
|
||||
Set new value, and eliminate a possible duplicate value.
|
||||
"""
|
||||
# search index i with self[i] == item
|
||||
delidx = -1
|
||||
for i in range(len(self)):
|
||||
if self[i] == value and i != key:
|
||||
idx = i
|
||||
if self[i] == item and i != index:
|
||||
delidx = i
|
||||
# break here, there can be only one duplicate
|
||||
break
|
||||
# insert new value
|
||||
super(SetList, self).__setitem__(key, value)
|
||||
if idx != -1:
|
||||
super(SetList, self).__setitem__(index, item)
|
||||
if delidx != -1:
|
||||
# remove duplicate
|
||||
del self[idx]
|
||||
del self[delidx]
|
||||
|
||||
|
||||
class ListDict (dict):
|
||||
"""a dictionary whose iterators reflect the order in which elements
|
||||
were added
|
||||
"""
|
||||
A dictionary whose iterators reflect the order in which elements
|
||||
were added.
|
||||
"""
|
||||
|
||||
def __init__ (self):
|
||||
"""initialize sorted key list"""
|
||||
"""
|
||||
Initialize sorted key list.
|
||||
"""
|
||||
# sorted list of keys
|
||||
self._keys = []
|
||||
|
||||
def __setitem__ (self, key, value):
|
||||
"""add key,value to dict, append key to sorted list"""
|
||||
"""
|
||||
Add key,value to dict, append key to sorted list.
|
||||
"""
|
||||
if not self.has_key(key):
|
||||
self._keys.append(key)
|
||||
super(ListDict, self).__setitem__(key, value)
|
||||
|
||||
def __delitem__ (self, key):
|
||||
"""remove key from dict"""
|
||||
"""
|
||||
Remove key from dict.
|
||||
"""
|
||||
self._keys.remove(key)
|
||||
super(ListDict, self).__delitem__(key)
|
||||
|
||||
def values (self):
|
||||
"""return sorted list of values"""
|
||||
"""
|
||||
Return sorted list of values.
|
||||
"""
|
||||
return [self[k] for k in self._keys]
|
||||
|
||||
def items (self):
|
||||
"""return sorted list of items"""
|
||||
"""
|
||||
Return sorted list of items.
|
||||
"""
|
||||
return [(k, self[k]) for k in self._keys]
|
||||
|
||||
def keys (self):
|
||||
"""return sorted list of keys"""
|
||||
"""
|
||||
Return sorted list of keys.
|
||||
"""
|
||||
return self._keys[:]
|
||||
|
||||
def itervalues (self):
|
||||
"""return iterator over sorted values"""
|
||||
"""
|
||||
Return iterator over sorted values.
|
||||
"""
|
||||
return iter(self.values())
|
||||
|
||||
def iteritems (self):
|
||||
|
|
@ -93,11 +118,15 @@ class ListDict (dict):
|
|||
return iter(self.items())
|
||||
|
||||
def iterkeys (self):
|
||||
"""return iterator over sorted keys"""
|
||||
"""
|
||||
Return iterator over sorted keys.
|
||||
"""
|
||||
return iter(self.keys())
|
||||
|
||||
def clear (self):
|
||||
"""remove all dict entires"""
|
||||
"""
|
||||
Remove all dict entries.
|
||||
"""
|
||||
self._keys = []
|
||||
super(ListDict, self).clear()
|
||||
|
||||
|
|
@ -111,20 +140,28 @@ class LRU (object):
|
|||
"""
|
||||
|
||||
class Node (object):
|
||||
"""internal node with pointers to sisters"""
|
||||
"""
|
||||
Internal node with pointers to sisters.
|
||||
"""
|
||||
|
||||
def __init__ (self, prev, me):
|
||||
"""initialize pointers and data"""
|
||||
"""
|
||||
Initialize pointers and data.
|
||||
"""
|
||||
self.prev = prev
|
||||
self.me = me
|
||||
self.next = None
|
||||
|
||||
def __len__ (self):
|
||||
"""number of stored objects in the queue"""
|
||||
"""
|
||||
Number of stored objects in the queue.
|
||||
"""
|
||||
return len(self.d)
|
||||
|
||||
def __init__ (self, count, pairs=None):
|
||||
"""make new queue with given maximum count, and key/value pairs"""
|
||||
"""
|
||||
Make new queue with given maximum count, and key/value pairs.
|
||||
"""
|
||||
self.count = max(count, 1)
|
||||
self.d = {}
|
||||
self.first = None
|
||||
|
|
@ -134,21 +171,29 @@ class LRU (object):
|
|||
self[key] = value
|
||||
|
||||
def __contains__ (self, obj):
|
||||
"""look if obj is in the queue"""
|
||||
"""
|
||||
Look if obj is in the queue.
|
||||
"""
|
||||
return obj in self.d
|
||||
|
||||
def has_key (self, obj):
|
||||
"""look if obj is in the queue"""
|
||||
"""
|
||||
Look if obj is in the queue.
|
||||
"""
|
||||
return self.d.has_key(obj)
|
||||
|
||||
def __getitem__ (self, obj):
|
||||
"""get stored object data, and mark it as LRU"""
|
||||
"""
|
||||
Get stored object data, and mark it as LRU.
|
||||
"""
|
||||
a = self.d[obj].me
|
||||
self[a[0]] = a[1]
|
||||
return a[1]
|
||||
|
||||
def __setitem__ (self, obj, val):
|
||||
"""set given object data, and mark it as LRU"""
|
||||
"""
|
||||
Set given object data, and mark it as LRU.
|
||||
"""
|
||||
if obj in self.d:
|
||||
del self[obj]
|
||||
nobj = self.Node(self.last, (obj, val))
|
||||
|
|
@ -171,7 +216,9 @@ class LRU (object):
|
|||
del a
|
||||
|
||||
def __delitem__ (self, obj):
|
||||
"""remove object from queue"""
|
||||
"""
|
||||
Remove object from queue.
|
||||
"""
|
||||
nobj = self.d[obj]
|
||||
if nobj.prev:
|
||||
nobj.prev.next = nobj.next
|
||||
|
|
@ -184,7 +231,9 @@ class LRU (object):
|
|||
del self.d[obj]
|
||||
|
||||
def __iter__ (self):
|
||||
"""iterate over stored object values"""
|
||||
"""
|
||||
Iterate over stored object values.
|
||||
"""
|
||||
cur = self.first
|
||||
while cur != None:
|
||||
cur2 = cur.next
|
||||
|
|
@ -192,7 +241,9 @@ class LRU (object):
|
|||
cur = cur2
|
||||
|
||||
def iteritems (self):
|
||||
"""iterate over stored object items"""
|
||||
"""
|
||||
Iterate over stored object items.
|
||||
"""
|
||||
cur = self.first
|
||||
while cur != None:
|
||||
cur2 = cur.next
|
||||
|
|
@ -200,21 +251,29 @@ class LRU (object):
|
|||
cur = cur2
|
||||
|
||||
def iterkeys (self):
|
||||
"""iterate over stored object keys"""
|
||||
"""
|
||||
Iterate over stored object keys.
|
||||
"""
|
||||
return iter(self.d)
|
||||
|
||||
def itervalues (self):
|
||||
"""iterate over stored object values"""
|
||||
"""
|
||||
Iterate over stored object values.
|
||||
"""
|
||||
for i, j in self.iteritems():
|
||||
yield j
|
||||
|
||||
def keys (self):
|
||||
"""iterate over stored object keys"""
|
||||
"""
|
||||
Return list of stored object keys.
|
||||
"""
|
||||
return self.d.keys()
|
||||
|
||||
def setdefault (self, key, failobj=None):
|
||||
"""get given object data, and mark it as LRU. If it is not already
|
||||
stored, store the given failobj."""
|
||||
"""
|
||||
Get given object data, and mark it as LRU. If it is not already
|
||||
stored, store the given failobj.
|
||||
"""
|
||||
if not self.has_key(key):
|
||||
self[key] = failobj
|
||||
return self[key]
|
||||
|
|
|
|||
|
|
@ -87,24 +87,32 @@ error = 'fcgi.error'
|
|||
# anywhere at the moment
|
||||
|
||||
def _error (msg):
|
||||
"""Append a string to /tmp/err"""
|
||||
"""
|
||||
Append a string to /tmp/err.
|
||||
"""
|
||||
errf = file('/tmp/err', 'a+')
|
||||
errf.write(msg+'\n')
|
||||
errf.close()
|
||||
|
||||
|
||||
class Record (object):
|
||||
"""Class representing FastCGI records"""
|
||||
"""
|
||||
Class representing FastCGI records.
|
||||
"""
|
||||
|
||||
def __init__ (self):
|
||||
"""initialize record data"""
|
||||
"""
|
||||
Initialize record data.
|
||||
"""
|
||||
self.version = FCGI_VERSION_1
|
||||
self.rec_type = FCGI_UNKNOWN_TYPE
|
||||
self.req_id = FCGI_NULL_REQUEST_ID
|
||||
self.content = ""
|
||||
|
||||
def read_record (self, sock):
|
||||
"""read a FastCGI record from socket"""
|
||||
"""
|
||||
Read a FastCGI record from socket.
|
||||
"""
|
||||
s = [ord(x) for x in sock.recv(8)]
|
||||
self.version, self.rec_type, padding_length = s[0], s[1], s[6]
|
||||
self.req_id, content_length = (s[2]<<8)+s[3], (s[4]<<8)+s[5]
|
||||
|
|
@ -136,7 +144,9 @@ class Record (object):
|
|||
self.protocolStatus = ord(c[4])
|
||||
|
||||
def write_record (self, sock):
|
||||
"""write a FastCGI request to socket"""
|
||||
"""
|
||||
Write a FastCGI request to socket.
|
||||
"""
|
||||
content = self.content
|
||||
if self.rec_type == FCGI_BEGIN_REQUEST:
|
||||
content = chr(self.role>>8) + chr(self.role & 255) + \
|
||||
|
|
@ -219,60 +229,79 @@ def HandleManTypes (r, conn):
|
|||
|
||||
|
||||
class FastCGIWriter (object):
|
||||
"""File-like object writing FastCGI requests. All read operations
|
||||
return empty data.
|
||||
"""
|
||||
File-like object writing FastCGI requests. All read operations
|
||||
return empty data.
|
||||
"""
|
||||
|
||||
def __init__ (self, rec, conn):
|
||||
"""initialize with given record and connection"""
|
||||
"""
|
||||
Initialize with given record and connection.
|
||||
"""
|
||||
self.record = rec
|
||||
self.conn = conn
|
||||
self.closed = False
|
||||
|
||||
def close (self):
|
||||
"""close this writer"""
|
||||
"""
|
||||
Close this writer.
|
||||
"""
|
||||
if not self.closed:
|
||||
self.closed = True
|
||||
self.record.content = ""
|
||||
self.record.write_record(self.conn)
|
||||
|
||||
def isatty (self):
|
||||
"""returns False"""
|
||||
"""
|
||||
Returns False.
|
||||
"""
|
||||
if self.closed:
|
||||
raise ValueError, "I/O operation on closed file"
|
||||
return False
|
||||
|
||||
def seek (self, pos, mode=0):
|
||||
"""does nothing"""
|
||||
"""
|
||||
Does nothing.
|
||||
"""
|
||||
if self.closed:
|
||||
raise ValueError, "I/O operation on closed file"
|
||||
|
||||
def tell (self):
|
||||
"""Return zero"""
|
||||
"""
|
||||
Return zero.
|
||||
"""
|
||||
if self.closed:
|
||||
raise ValueError, "I/O operation on closed file"
|
||||
return 0
|
||||
|
||||
def read (self, n=-1):
|
||||
"""return empty string"""
|
||||
"""
|
||||
Return empty string.
|
||||
"""
|
||||
if self.closed:
|
||||
raise ValueError, "I/O operation on closed file"
|
||||
return ""
|
||||
|
||||
def readline (self, length=None):
|
||||
"""return empty string"""
|
||||
"""
|
||||
Return empty string.
|
||||
"""
|
||||
if self.closed:
|
||||
raise ValueError, "I/O operation on closed file"
|
||||
return ""
|
||||
|
||||
def readlines (self):
|
||||
"""return empty list"""
|
||||
"""
|
||||
Return empty list.
|
||||
"""
|
||||
if self.closed:
|
||||
raise ValueError, "I/O operation on closed file"
|
||||
return []
|
||||
|
||||
def write (self, s):
|
||||
"""write data in record for record to connection"""
|
||||
"""
|
||||
Write data in record for record to connection.
|
||||
"""
|
||||
if self.closed:
|
||||
raise ValueError, "I/O operation on closed file"
|
||||
while s:
|
||||
|
|
@ -281,17 +310,23 @@ class FastCGIWriter (object):
|
|||
self.record.write_record(self.conn)
|
||||
|
||||
def get_next_chunk (self, data):
|
||||
"""return tuple (chunk of data, newdata)"""
|
||||
"""
|
||||
Return tuple (chunk of data, newdata).
|
||||
"""
|
||||
chunk = data[:8192]
|
||||
data = data[8192:]
|
||||
return chunk, data
|
||||
|
||||
def writelines (self, lines):
|
||||
"""write given lines to the connection"""
|
||||
"""
|
||||
Write given lines to the connection.
|
||||
"""
|
||||
self.write(''.join(lines))
|
||||
|
||||
def flush (self):
|
||||
"""does nothing"""
|
||||
"""
|
||||
Does nothing.
|
||||
"""
|
||||
if self.closed:
|
||||
raise ValueError, "I/O operation on closed file"
|
||||
|
||||
|
|
|
|||
|
|
@ -26,7 +26,9 @@ supported_languages = ['en']
|
|||
default_language = None
|
||||
|
||||
def install_builtin (translator, do_unicode):
|
||||
"""install _() and _n() gettext methods into default namespace"""
|
||||
"""
|
||||
Install _() and _n() gettext methods into default namespace.
|
||||
"""
|
||||
import __builtin__
|
||||
if do_unicode:
|
||||
__builtin__.__dict__['_'] = translator.ugettext
|
||||
|
|
@ -38,19 +40,35 @@ def install_builtin (translator, do_unicode):
|
|||
__builtin__.__dict__['_n'] = translator.ngettext
|
||||
|
||||
class Translator (gettext.GNUTranslations):
|
||||
"""
|
||||
A translation class always installing its gettext methods into the
|
||||
default namespace.
|
||||
"""
|
||||
|
||||
def install (self, do_unicode):
|
||||
"""
|
||||
Install gettext methods into the default namespace.
|
||||
"""
|
||||
install_builtin(self, do_unicode)
|
||||
|
||||
|
||||
class NullTranslator (gettext.NullTranslations):
|
||||
"""
|
||||
A translation class always installing its gettext methods into the
|
||||
default namespace.
|
||||
"""
|
||||
|
||||
def install (self, do_unicode):
|
||||
"""
|
||||
Install gettext methods into the default namespace.
|
||||
"""
|
||||
install_builtin(self, do_unicode)
|
||||
|
||||
|
||||
def init (domain, directory):
|
||||
"""initialize this gettext i18n module"""
|
||||
"""
|
||||
Initialize this gettext i18n module.
|
||||
"""
|
||||
global default_language
|
||||
# get supported languages
|
||||
for lang in os.listdir(directory):
|
||||
|
|
@ -74,7 +92,9 @@ def get_translator (domain, directory, languages=None,
|
|||
translatorklass=Translator,
|
||||
fallback=False,
|
||||
fallbackklass=NullTranslator):
|
||||
"""search the appropriate GNUTranslations class"""
|
||||
"""
|
||||
Search the appropriate GNUTranslations class.
|
||||
"""
|
||||
translator = gettext.translation(domain, localedir=directory,
|
||||
languages=languages, class_=translatorklass, fallback=fallback)
|
||||
if not isinstance(translator, gettext.GNUTranslations):
|
||||
|
|
@ -83,14 +103,18 @@ def get_translator (domain, directory, languages=None,
|
|||
|
||||
|
||||
def get_lang (lang):
|
||||
"""return lang if it is supported, or the default language"""
|
||||
"""
|
||||
Return lang if it is supported, or the default language.
|
||||
"""
|
||||
if lang in supported_languages:
|
||||
return lang
|
||||
return default_language
|
||||
|
||||
|
||||
def get_headers_lang (headers):
|
||||
"""return preferred supported language in given HTTP headers"""
|
||||
"""
|
||||
Return preferred supported language in given HTTP headers.
|
||||
"""
|
||||
if not headers.has_key('Accept-Language'):
|
||||
return default_language
|
||||
languages = headers['Accept-Language'].split(",")
|
||||
|
|
@ -103,7 +127,9 @@ def get_headers_lang (headers):
|
|||
|
||||
|
||||
def get_locale ():
|
||||
"""return current configured locale"""
|
||||
"""
|
||||
Return current configured locale.
|
||||
"""
|
||||
loc = locale.getdefaultlocale()[0]
|
||||
if loc is None:
|
||||
loc = 'C'
|
||||
|
|
@ -131,10 +157,14 @@ lang_transis = {
|
|||
}
|
||||
|
||||
def lang_name (lang):
|
||||
"""return full name of given language"""
|
||||
"""
|
||||
Return full name of given language.
|
||||
"""
|
||||
return lang_names[lang]
|
||||
|
||||
|
||||
def lang_trans (lang, curlang):
|
||||
"""return translated full name of given language"""
|
||||
"""
|
||||
Return translated full name of given language.
|
||||
"""
|
||||
return lang_transis[lang][curlang]
|
||||
|
|
|
|||
|
|
@ -34,19 +34,25 @@ _supported_langs = ('de', 'fr', 'nl', 'C')
|
|||
_is_level = re.compile(r'^[0123]$').match
|
||||
|
||||
class FormError (Exception):
|
||||
"""form related errors"""
|
||||
"""
|
||||
Form related errors.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def startoutput (out=sys.stdout):
|
||||
"""print leading HTML headers to given output stream"""
|
||||
"""
|
||||
Print leading HTML headers to given output stream.
|
||||
"""
|
||||
out.write("Content-type: text/html\r\n"
|
||||
"Cache-Control: no-cache\r\n"
|
||||
"Pragma: no-cache\r\n"
|
||||
"\r\n")
|
||||
|
||||
def checkaccess (out=sys.stdout, hosts=[], servers=[], env=os.environ):
|
||||
"""see if remote addr is allowed to access the CGI interface"""
|
||||
"""
|
||||
See if remote addr is allowed to access the CGI interface.
|
||||
"""
|
||||
if os.environ.get('REMOTE_ADDR') in hosts and \
|
||||
os.environ.get('SERVER_ADDR') in servers:
|
||||
return True
|
||||
|
|
@ -56,7 +62,9 @@ def checkaccess (out=sys.stdout, hosts=[], servers=[], env=os.environ):
|
|||
|
||||
|
||||
def checklink (out=sys.stdout, form={}, env=os.environ):
|
||||
"""main cgi function, check the given links and print out the result"""
|
||||
"""
|
||||
Main cgi function, check the given links and print out the result.
|
||||
"""
|
||||
try:
|
||||
checkform(form)
|
||||
except FormError, why:
|
||||
|
|
@ -96,14 +104,17 @@ def checklink (out=sys.stdout, form={}, env=os.environ):
|
|||
|
||||
|
||||
def get_host_name (form):
|
||||
"""return host name of given URL"""
|
||||
"""
|
||||
Return host name of given URL.
|
||||
"""
|
||||
return urlparse.urlparse(form["url"].value)[1]
|
||||
|
||||
|
||||
def checkform (form):
|
||||
"""check form data. throw exception on error
|
||||
Be sure to NOT print out any user-given data as HTML code, so use
|
||||
only plain strings as exception text.
|
||||
"""
|
||||
Check form data. Throw exception on error.
|
||||
Be sure to NOT print out any user-given data as HTML code, so use
|
||||
only plain strings as exception text.
|
||||
"""
|
||||
# check lang support
|
||||
if form.has_key("language"):
|
||||
|
|
@ -134,7 +145,9 @@ def checkform (form):
|
|||
raise FormError(_("invalid %s option syntax") % option)
|
||||
|
||||
def logit (form, env):
|
||||
"""log form errors"""
|
||||
"""
|
||||
Log form errors.
|
||||
"""
|
||||
global _logfile
|
||||
if not _logfile:
|
||||
return
|
||||
|
|
@ -151,7 +164,9 @@ def logit (form, env):
|
|||
|
||||
|
||||
def print_error (out, why):
|
||||
"""print standard error page"""
|
||||
"""
|
||||
Print standard error page.
|
||||
"""
|
||||
out.write(_("""<html><head>
|
||||
<title>LinkChecker Online Error</title></head>
|
||||
<body text=#192c83 bgcolor=#fff7e5 link=#191c83 vlink=#191c83 alink=#191c83>
|
||||
|
|
|
|||
|
|
@ -28,12 +28,16 @@ img_re = re.compile(r"""(?i)<\s*img\s+("[^"\n]*"|'[^'\n]*'|[^>])+>""")
|
|||
endtag_re = re.compile(r"""(?i)</a\s*>""")
|
||||
|
||||
def _unquote (txt):
|
||||
"""resolve entities and markup from txt"""
|
||||
"""
|
||||
Resolve entities and markup from txt.
|
||||
"""
|
||||
return linkcheck.HtmlParser.resolve_entities(
|
||||
linkcheck.strformat.remove_markup(txt))
|
||||
|
||||
def image_name (txt):
|
||||
"""return the alt part of the first <img alt=""> tag in txt"""
|
||||
"""
|
||||
Return the alt part of the first <img alt=""> tag in txt.
|
||||
"""
|
||||
mo = imgtag_re.search(txt)
|
||||
if mo:
|
||||
name = linkcheck.strformat.unquote(mo.group('name').strip())
|
||||
|
|
@ -42,7 +46,9 @@ def image_name (txt):
|
|||
|
||||
|
||||
def href_name (txt):
|
||||
"""return the name part of the first <a href="">name</a> link in txt"""
|
||||
"""
|
||||
Return the name part of the first <a href="">name</a> link in txt.
|
||||
"""
|
||||
name = u""
|
||||
endtag = endtag_re.search(txt)
|
||||
if not endtag:
|
||||
|
|
|
|||
|
|
@ -61,12 +61,15 @@ refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$")
|
|||
css_url_re = re.compile(ur"url\((?P<url>[^\)]+)\)")
|
||||
|
||||
class TagFinder (object):
|
||||
"""Base class storing HTML parse messages in a list.
|
||||
TagFinder instances are to be used as HtmlParser handlers.
|
||||
"""
|
||||
Base class storing HTML parse messages in a list.
|
||||
TagFinder instances are to be used as HtmlParser handlers.
|
||||
"""
|
||||
|
||||
def __init__ (self, content):
|
||||
"""store content in buffer"""
|
||||
"""
|
||||
Store content in buffer.
|
||||
"""
|
||||
super(TagFinder, self).__init__()
|
||||
self.content = content
|
||||
# warnings and errors during parsing
|
||||
|
|
@ -76,28 +79,40 @@ class TagFinder (object):
|
|||
self.parser = None
|
||||
|
||||
def _errorfun (self, msg, name):
|
||||
"""append msg to error list"""
|
||||
"""
|
||||
Append msg to error list.
|
||||
"""
|
||||
self.parse_info.append("%s at line %d col %d: %s" % \
|
||||
(name, self.parser.last_lineno(), self.parser.last_column(), msg))
|
||||
|
||||
def warning (self, msg):
|
||||
"""signal a filter/parser warning"""
|
||||
"""
|
||||
Signal a filter/parser warning.
|
||||
"""
|
||||
self._errorfun(msg, "warning")
|
||||
|
||||
def error (self, msg):
|
||||
"""signal a filter/parser error"""
|
||||
"""
|
||||
Signal a filter/parser error.
|
||||
"""
|
||||
self._errorfun(msg, "error")
|
||||
|
||||
def fatal_error (self, msg):
|
||||
"""signal a fatal filter/parser error"""
|
||||
"""
|
||||
Signal a fatal filter/parser error.
|
||||
"""
|
||||
self._errorfun(msg, "fatal error")
|
||||
|
||||
|
||||
class MetaRobotsFinder (TagFinder):
|
||||
"""class for finding robots.txt meta values in HTML"""
|
||||
"""
|
||||
Class for finding robots.txt meta values in HTML.
|
||||
"""
|
||||
|
||||
def __init__ (self, content):
|
||||
"""store content in buffer and initialize flags"""
|
||||
"""
|
||||
Store content in buffer and initialize flags.
|
||||
"""
|
||||
super(MetaRobotsFinder, self).__init__(content)
|
||||
self.follow = True
|
||||
self.index = True
|
||||
|
|
@ -105,7 +120,9 @@ class MetaRobotsFinder (TagFinder):
|
|||
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
"""search for meta robots.txt "nofollow" and "noindex" flags"""
|
||||
"""
|
||||
Search for meta robots.txt "nofollow" and "noindex" flags.
|
||||
"""
|
||||
if tag == 'meta':
|
||||
if attrs.get('name') == 'robots':
|
||||
val = attrs.get('content', u'').lower().split(u',')
|
||||
|
|
@ -114,13 +131,16 @@ class MetaRobotsFinder (TagFinder):
|
|||
|
||||
|
||||
class LinkFinder (TagFinder):
|
||||
"""Find a list of links. After parsing, self.urls
|
||||
"""
|
||||
Find a list of links. After parsing, self.urls
|
||||
will be a list of parsed links entries with the format
|
||||
(url, lineno, column, name, codebase)
|
||||
(url, lineno, column, name, codebase).
|
||||
"""
|
||||
|
||||
def __init__ (self, content, tags=None):
|
||||
"""store content in buffer and initialize URL list"""
|
||||
"""
|
||||
Store content in buffer and initialize URL list.
|
||||
"""
|
||||
super(LinkFinder, self).__init__(content)
|
||||
if tags is None:
|
||||
self.tags = LinkTags
|
||||
|
|
@ -131,7 +151,9 @@ class LinkFinder (TagFinder):
|
|||
linkcheck.log.debug(linkcheck.LOG_CHECK, "link finder")
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
"""search for links and store found URLs in a list"""
|
||||
"""
|
||||
Search for links and store found URLs in a list.
|
||||
"""
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "LinkFinder tag %s attrs %s",
|
||||
tag, attrs)
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
|
|
@ -160,7 +182,9 @@ class LinkFinder (TagFinder):
|
|||
"LinkFinder finished tag %s", tag)
|
||||
|
||||
def get_link_name (self, tag, attrs, attr):
|
||||
"""Parse attrs for link name. Return name of link"""
|
||||
"""
|
||||
Parse attrs for link name. Return name of link.
|
||||
"""
|
||||
if tag == 'a' and attr == 'href':
|
||||
name = linkcheck.strformat.unquote(attrs.get('title', u''))
|
||||
if not name:
|
||||
|
|
@ -176,7 +200,9 @@ class LinkFinder (TagFinder):
|
|||
return name
|
||||
|
||||
def add_link (self, tag, attr, url, name, base):
|
||||
"""add given url data to url list"""
|
||||
"""
|
||||
Add given url data to url list.
|
||||
"""
|
||||
urls = []
|
||||
# look for meta refresh
|
||||
if tag == 'meta':
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""logging and debug functions"""
|
||||
"""
|
||||
Logging and debug functions.
|
||||
"""
|
||||
# Copyright (C) 2003-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -26,30 +28,54 @@ import logging
|
|||
#gc.enable()
|
||||
#gc.set_debug(gc.DEBUG_LEAK)
|
||||
def debug (log, msg, *args):
|
||||
"""log a debug message"""
|
||||
"""
|
||||
Log a debug message.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
logging.getLogger(log).debug(msg, *args)
|
||||
|
||||
|
||||
def info (log, msg, *args):
|
||||
"""log an informational message"""
|
||||
"""
|
||||
Log an informational message.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
logging.getLogger(log).info(msg, *args)
|
||||
|
||||
|
||||
def warn (log, msg, *args):
|
||||
"""log a warning"""
|
||||
"""
|
||||
Log a warning.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
logging.getLogger(log).warn(msg, *args)
|
||||
|
||||
|
||||
def error (log, msg, *args):
|
||||
"""log an error"""
|
||||
"""
|
||||
Log an error.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
logging.getLogger(log).error(msg, *args)
|
||||
|
||||
|
||||
def critical (log, msg, *args):
|
||||
"""log a critical error"""
|
||||
"""
|
||||
Log a critical error.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
logging.getLogger(log).critical(msg, *args)
|
||||
|
||||
|
||||
def exception (log, msg, *args):
|
||||
"""log an exception"""
|
||||
"""
|
||||
Log an exception.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
logging.getLogger(log).exception(msg, *args)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""a csv logger"""
|
||||
"""A CSV logger."""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -25,8 +25,8 @@ import linkcheck.configuration
|
|||
|
||||
|
||||
class CSVLogger (linkcheck.logger.Logger):
|
||||
""" CSV output. CSV consists of one line per entry. Entries are
|
||||
separated by a semicolon.
|
||||
"""CSV output, consisting of one line per entry. Entries are
|
||||
separated by a semicolon.
|
||||
"""
|
||||
def __init__ (self, **args):
|
||||
"""store default separator and (os dependent) line terminator"""
|
||||
|
|
|
|||
|
|
@ -18,7 +18,12 @@ _scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
|
|||
'KB': 1024.0, 'MB': 1024.0*1024.0}
|
||||
|
||||
def _VmB (VmKey):
|
||||
'''Parse /proc/<pid>/status file for given key.'''
|
||||
"""
|
||||
Parse /proc/<pid>/status file for given key.
|
||||
|
||||
@return: requested number value of status entry
|
||||
@rtype: C{float}
|
||||
"""
|
||||
if os.name != 'posix':
|
||||
# not supported
|
||||
return 0.0
|
||||
|
|
@ -41,15 +46,30 @@ def _VmB (VmKey):
|
|||
|
||||
|
||||
def memory (since=0.0):
|
||||
'''Return memory usage in bytes.'''
|
||||
"""
|
||||
Get memory usage.
|
||||
|
||||
@return: memory usage in bytes
|
||||
@rtype: C{float}
|
||||
"""
|
||||
return _VmB('VmSize:') - since
|
||||
|
||||
|
||||
def resident (since=0.0):
|
||||
'''Return resident memory usage in bytes.'''
|
||||
"""
|
||||
Get resident memory usage.
|
||||
|
||||
@return: resident memory usage in bytes
|
||||
@rtype: C{float}
|
||||
"""
|
||||
return _VmB('VmRSS:') - since
|
||||
|
||||
|
||||
def stacksize (since=0.0):
|
||||
'''Return stack size in bytes.'''
|
||||
"""
|
||||
Get stack size.
|
||||
|
||||
@return: stack size in bytes
|
||||
@rtype: C{float}
|
||||
"""
|
||||
return _VmB('VmStk:') - since
|
||||
|
|
|
|||
|
|
@ -1,14 +1,24 @@
|
|||
""" robotparser.py
|
||||
|
||||
Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
|
||||
You can choose between two licenses when using this package:
|
||||
1) GNU GPLv2
|
||||
2) PSF license for Python 2.2
|
||||
|
||||
The robots.txt Exclusion Protocol is implemented as specified in
|
||||
http://www.robotstxt.org/wc/norobots-rfc.html
|
||||
"""
|
||||
robotparser.py
|
||||
|
||||
The robots.txt Exclusion Protocol is implemented as specified in
|
||||
http://www.robotstxt.org/wc/norobots-rfc.html
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
import urlparse
|
||||
import httplib
|
||||
import urllib
|
||||
|
|
@ -27,34 +37,60 @@ __all__ = ["RobotFileParser"]
|
|||
_debug = False
|
||||
|
||||
def _msg (prefix, msg):
|
||||
"""print debug message"""
|
||||
"""
|
||||
Print given prefix and debug message to stderr if the _debug flag is
|
||||
set.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
if _debug:
|
||||
print >> sys.stderr, prefix, msg
|
||||
|
||||
# methods for debug, warning and error messages
|
||||
debug = lambda txt: _msg("debug:", txt)
|
||||
warn = lambda txt: _msg("warning:", txt)
|
||||
error = lambda txt: _msg("error:", txt)
|
||||
|
||||
class PasswordManager (object):
|
||||
"""
|
||||
Simple password manager storing username and password. Suitable
|
||||
for use as an AuthHandler instance in urllib2.
|
||||
"""
|
||||
|
||||
def __init__ (self, user, password):
|
||||
"""
|
||||
Store given username and password.
|
||||
"""
|
||||
self.user = user
|
||||
self.password = password
|
||||
|
||||
def add_password (self, realm, uri, user, passwd):
|
||||
# we have already our password
|
||||
"""
|
||||
Does nothing since username and password are already stored.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
pass
|
||||
|
||||
def find_user_password (self, realm, authuri):
|
||||
"""
|
||||
Get stored username and password.
|
||||
|
||||
@return: A tuple (user, password)
|
||||
@rtype: C{tuple}
|
||||
"""
|
||||
return self.user, self.password
|
||||
|
||||
|
||||
class RobotFileParser (object):
|
||||
""" This class provides a set of methods to read, parse and answer
|
||||
"""
|
||||
This class provides a set of methods to read, parse and answer
|
||||
questions about a single robots.txt file.
|
||||
"""
|
||||
|
||||
def __init__ (self, url='', user=None, password=None):
|
||||
"""Initialize internal entry lists and store given url and
|
||||
"""
|
||||
Initialize internal entry lists and store given url and
|
||||
credentials.
|
||||
"""
|
||||
self.set_url(url)
|
||||
|
|
@ -63,7 +99,11 @@ class RobotFileParser (object):
|
|||
self._reset()
|
||||
|
||||
def _reset (self):
|
||||
"""reset internal entry lists"""
|
||||
"""
|
||||
Reset internal flags and entry lists.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
self.entries = []
|
||||
self.default_entry = None
|
||||
self.disallow_all = False
|
||||
|
|
@ -71,28 +111,47 @@ class RobotFileParser (object):
|
|||
self.last_checked = 0
|
||||
|
||||
def mtime (self):
|
||||
"""Returns the time the robots.txt file was last fetched.
|
||||
"""
|
||||
Returns the time the robots.txt file was last fetched.
|
||||
|
||||
This is useful for long-running web spiders that need to
|
||||
check for new robots.txt files periodically.
|
||||
|
||||
@return: last modified in time.time() format
|
||||
@rtype: C{number}
|
||||
"""
|
||||
return self.last_checked
|
||||
|
||||
def modified (self):
|
||||
"""Sets the time the robots.txt file was last fetched to the
|
||||
current time.
|
||||
"""
|
||||
Sets the time the robots.txt file was last fetched to the
|
||||
current time.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
import time
|
||||
self.last_checked = time.time()
|
||||
|
||||
def set_url (self, url):
|
||||
"""Sets the URL referring to a robots.txt file."""
|
||||
"""
|
||||
Sets the URL referring to a robots.txt file.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
self.url = url
|
||||
self.host, self.path = urlparse.urlparse(url)[1:3]
|
||||
|
||||
def get_opener (self):
|
||||
"""
|
||||
Construct an URL opener object. It considers the given credentials
|
||||
from the __init__() method and supports proxies.
|
||||
|
||||
@return: URL opener
|
||||
@rtype: C{urllib2.OpenerDirector}
|
||||
"""
|
||||
pwd_manager = PasswordManager(self.user, self.password)
|
||||
handlers = [urllib2.ProxyHandler(urllib.getproxies()),
|
||||
handlers = [
|
||||
urllib2.ProxyHandler(urllib.getproxies()),
|
||||
urllib2.UnknownHandler,
|
||||
HttpWithGzipHandler,
|
||||
urllib2.HTTPBasicAuthHandler(pwd_manager),
|
||||
|
|
@ -107,7 +166,11 @@ class RobotFileParser (object):
|
|||
return urllib2.build_opener(*handlers)
|
||||
|
||||
def read (self):
|
||||
"""Reads the robots.txt URL and feeds it to the parser."""
|
||||
"""
|
||||
Reads the robots.txt URL and feeds it to the parser.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
self._reset()
|
||||
headers = {
|
||||
'User-Agent': 'Python RobotFileParser/2.1',
|
||||
|
|
@ -149,7 +212,11 @@ class RobotFileParser (object):
|
|||
self.parse(lines)
|
||||
|
||||
def _add_entry (self, entry):
|
||||
"""add entry to entry list"""
|
||||
"""
|
||||
Add a parsed entry to entry list.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
if "*" in entry.useragents:
|
||||
# the default entry is considered last
|
||||
self.default_entry = entry
|
||||
|
|
@ -157,9 +224,12 @@ class RobotFileParser (object):
|
|||
self.entries.append(entry)
|
||||
|
||||
def parse (self, lines):
|
||||
"""parse the input lines from a robot.txt file.
|
||||
We allow that a user-agent: line is not preceded by
|
||||
one or more blank lines.
|
||||
"""
|
||||
Parse the input lines from a robots.txt file.
|
||||
We allow that a user-agent: line is not preceded by
|
||||
one or more blank lines.
|
||||
|
||||
@return: C{None}
|
||||
"""
|
||||
debug("robots.txt parse lines")
|
||||
state = 0
|
||||
|
|
@ -193,8 +263,8 @@ class RobotFileParser (object):
|
|||
if line[0] == "user-agent":
|
||||
if state == 2:
|
||||
warn("line %d: you should insert a blank"
|
||||
" line before any user-agent"
|
||||
" directive" % linenumber)
|
||||
" line before any user-agent"
|
||||
" directive" % linenumber)
|
||||
self._add_entry(entry)
|
||||
entry = Entry()
|
||||
entry.useragents.append(line[1])
|
||||
|
|
@ -221,7 +291,12 @@ class RobotFileParser (object):
|
|||
debug("Parsed rules:\n%s" % str(self))
|
||||
|
||||
def can_fetch (self, useragent, url):
|
||||
"""using the parsed robots.txt decide if useragent can fetch url"""
|
||||
"""
|
||||
Using the parsed robots.txt decide if useragent can fetch url.
|
||||
|
||||
@return: True if agent can fetch url, else False
|
||||
@rtype: C{bool}
|
||||
"""
|
||||
debug("Checking robot.txt allowance for:\n"\
|
||||
" user agent: %r\n url: %r" % (useragent, url))
|
||||
if not isinstance(useragent, str):
|
||||
|
|
@ -245,7 +320,13 @@ class RobotFileParser (object):
|
|||
return True
|
||||
|
||||
def __str__ (self):
|
||||
"""return string representation in robots.txt format"""
|
||||
"""
|
||||
Constructs string representation, usable as contents of a
|
||||
robots.txt file.
|
||||
|
||||
@return: robots.txt format
|
||||
@rtype: C{string}
|
||||
"""
|
||||
lines = [str(entry) for entry in self.entries]
|
||||
if self.default_entry is not None:
|
||||
lines.append(str(self.default_entry))
|
||||
|
|
@ -253,11 +334,15 @@ class RobotFileParser (object):
|
|||
|
||||
|
||||
class RuleLine (object):
|
||||
"""A rule line is a single "Allow:" (allowance==1) or "Disallow:"
|
||||
(allowance==0) followed by a path."""
|
||||
"""
|
||||
A rule line is a single "Allow:" (allowance==1) or "Disallow:"
|
||||
(allowance==0) followed by a path.
|
||||
"""
|
||||
|
||||
def __init__ (self, path, allowance):
|
||||
"""initialize with given path and allowance info"""
|
||||
"""
|
||||
Initialize with given path and allowance info.
|
||||
"""
|
||||
if path == '' and not allowance:
|
||||
# an empty value means allow all
|
||||
allowance = True
|
||||
|
|
@ -265,30 +350,54 @@ class RuleLine (object):
|
|||
self.allowance = allowance
|
||||
|
||||
def applies_to (self, path):
|
||||
"""return True if pathname applies to this rule"""
|
||||
"""
|
||||
Look if given path applies to this rule.
|
||||
|
||||
@return: True if pathname applies to this rule, else False
|
||||
@rtype: C{bool}
|
||||
"""
|
||||
return self.path == "*" or path.startswith(self.path)
|
||||
|
||||
def __str__ (self):
|
||||
"""return string representation in robots.txt format"""
|
||||
"""
|
||||
Construct string representation in robots.txt format.
|
||||
|
||||
@return: robots.txt format
|
||||
@rtype: C{string}
|
||||
"""
|
||||
return (self.allowance and "Allow" or "Disallow")+": "+self.path
|
||||
|
||||
|
||||
class Entry (object):
|
||||
"""An entry has one or more user-agents and zero or more rulelines"""
|
||||
"""
|
||||
An entry has one or more user-agents and zero or more rulelines.
|
||||
"""
|
||||
|
||||
def __init__ (self):
|
||||
"""initialize user agent and rule list"""
|
||||
"""
|
||||
Initialize user agent and rule list.
|
||||
"""
|
||||
self.useragents = []
|
||||
self.rulelines = []
|
||||
|
||||
def __str__ (self):
|
||||
"""return string representation in robots.txt format"""
|
||||
"""
|
||||
Construct string representation in robots.txt format.
|
||||
|
||||
@return: robots.txt format
|
||||
@rtype: C{string}
|
||||
"""
|
||||
lines = ["User-agent: %r" % agent for agent in self.useragents]
|
||||
lines.extend([str(line) for line in self.rulelines])
|
||||
return "\n".join(lines)
|
||||
|
||||
def applies_to (self, useragent):
|
||||
"""check if this entry applies to the specified agent"""
|
||||
"""
|
||||
Check if this entry applies to the specified agent.
|
||||
|
||||
@return: True if this entry applies to the agent, else False.
|
||||
@rtype: C{bool}
|
||||
"""
|
||||
# split the name token and make it lower case
|
||||
if not useragent:
|
||||
return True
|
||||
|
|
@ -303,9 +412,16 @@ class Entry (object):
|
|||
return False
|
||||
|
||||
def allowance (self, filename):
|
||||
"""Preconditions:
|
||||
"""
|
||||
Preconditions:
|
||||
- our agent applies to this entry
|
||||
- filename is URL decoded"""
|
||||
- filename is URL decoded
|
||||
|
||||
Check if given filename is allowed to access this entry.
|
||||
|
||||
@return: True if allowed, else False
|
||||
@rtype: C{bool}
|
||||
"""
|
||||
for line in self.rulelines:
|
||||
debug("%s %s %s" % (filename, str(line), line.allowance))
|
||||
if line.applies_to(filename):
|
||||
|
|
@ -334,7 +450,9 @@ class Entry (object):
|
|||
## ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
|
||||
## SOFTWARE.
|
||||
def decode (page):
|
||||
"""gunzip or deflate a compressed page"""
|
||||
"""
|
||||
Gunzip or deflate a compressed page.
|
||||
"""
|
||||
debug("robots.txt page info %s" % str(page.info()))
|
||||
encoding = page.info().get("Content-Encoding")
|
||||
if encoding in ('gzip', 'x-gzip', 'deflate'):
|
||||
|
|
@ -367,19 +485,26 @@ def decode (page):
|
|||
|
||||
|
||||
class HttpWithGzipHandler (urllib2.HTTPHandler):
|
||||
"support gzip encoding"
|
||||
|
||||
"""
|
||||
Support gzip encoding.
|
||||
"""
|
||||
def http_open (self, req):
|
||||
"""send request and decode answer"""
|
||||
"""
|
||||
Send request and decode answer.
|
||||
"""
|
||||
return decode(urllib2.HTTPHandler.http_open(self, req))
|
||||
|
||||
|
||||
if hasattr(linkcheck.httplib2, 'HTTPS'):
|
||||
class HttpsWithGzipHandler (urllib2.HTTPSHandler):
|
||||
"support gzip encoding"
|
||||
"""
|
||||
Support gzip encoding.
|
||||
"""
|
||||
|
||||
def http_open (self, req):
|
||||
"""send request and decode answer"""
|
||||
"""
|
||||
Send request and decode answer.
|
||||
"""
|
||||
return decode(urllib2.HTTPSHandler.http_open(self, req))
|
||||
|
||||
# end of urlutils.py routines
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""Various string utility functions. Note that these functions are not
|
||||
necessarily optimised for large strings, so use with care."""
|
||||
"""
|
||||
Various string utility functions. Note that these functions are not
|
||||
necessarily optimised for large strings, so use with care.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -25,7 +27,13 @@ import time
|
|||
|
||||
|
||||
def unquote (s):
|
||||
"""if string s is not empty, strip quotes from s"""
|
||||
"""
|
||||
Remove leading and ending quotes.
|
||||
|
||||
@return: if s evaluates to False, return s as is, else return
|
||||
string with stripped quotes
|
||||
@rtype: string or type of s if evaluating to False
|
||||
"""
|
||||
if not s:
|
||||
return s
|
||||
if len(s) < 2:
|
||||
|
|
@ -43,9 +51,10 @@ _para_win = r"(?:%(sep)s)(?:(?:%(sep)s)\s*)+" % {'sep': '\r\n'}
|
|||
_para_ro = re.compile("%s|%s|%s" % (_para_mac, _para_posix, _para_win))
|
||||
|
||||
def get_paragraphs (text):
|
||||
"""A new paragraph is considered to start at a line which follows
|
||||
one or more blank lines (lines containing nothing or just spaces).
|
||||
The first line of the text also starts a paragraph.
|
||||
"""
|
||||
A new paragraph is considered to start at a line which follows
|
||||
one or more blank lines (lines containing nothing or just spaces).
|
||||
The first line of the text also starts a paragraph.
|
||||
"""
|
||||
if not text:
|
||||
return []
|
||||
|
|
@ -53,10 +62,11 @@ def get_paragraphs (text):
|
|||
|
||||
|
||||
def wrap (text, width, **kwargs):
|
||||
"""Adjust lines of text to be not longer than width. The text will be
|
||||
returned unmodified if width <= 0.
|
||||
See textwrap.wrap() for a list of supported kwargs.
|
||||
Returns text with lines no longer than given width.
|
||||
"""
|
||||
Adjust lines of text to be not longer than width. The text will be
|
||||
returned unmodified if width <= 0.
|
||||
See textwrap.wrap() for a list of supported kwargs.
|
||||
Returns text with lines no longer than given width.
|
||||
"""
|
||||
if width <= 0 or not text:
|
||||
return text
|
||||
|
|
@ -67,8 +77,10 @@ def wrap (text, width, **kwargs):
|
|||
|
||||
|
||||
def get_line_number (s, index):
|
||||
"""Return the line number of s[index]. Lines are assumed to be separated
|
||||
by the ASCII character '\\n'"""
|
||||
"""
|
||||
Return the line number of s[index]. Lines are assumed to be separated
|
||||
by the ASCII character '\\n'.
|
||||
"""
|
||||
i = 0
|
||||
if index < 0:
|
||||
index = 0
|
||||
|
|
@ -81,7 +93,9 @@ def get_line_number (s, index):
|
|||
|
||||
|
||||
def paginate (text, lines=22):
|
||||
"""print text in pages of lines"""
|
||||
"""
|
||||
Print text in pages of lines.
|
||||
"""
|
||||
curline = 1
|
||||
for line in text.splitlines():
|
||||
print line
|
||||
|
|
@ -96,7 +110,9 @@ def paginate (text, lines=22):
|
|||
_markup_re = re.compile("<.*?>", re.DOTALL)
|
||||
|
||||
def remove_markup (s):
|
||||
"""remove all <*> html markup tags from s"""
|
||||
"""
|
||||
Remove all <*> html markup tags from s.
|
||||
"""
|
||||
mo = _markup_re.search(s)
|
||||
while mo:
|
||||
s = s[0:mo.start()] + s[mo.end():]
|
||||
|
|
@ -105,8 +121,9 @@ def remove_markup (s):
|
|||
|
||||
|
||||
def strsize (b):
|
||||
"""Return human representation of bytes b. A negative number of bytes
|
||||
raises a value error.
|
||||
"""
|
||||
Return human representation of bytes b. A negative number of bytes
|
||||
raises a value error.
|
||||
"""
|
||||
if b < 0:
|
||||
raise ValueError("Invalid negative byte number")
|
||||
|
|
@ -125,13 +142,17 @@ def strsize (b):
|
|||
|
||||
|
||||
def strtime (t):
|
||||
"""return ISO 8601 formatted time"""
|
||||
"""
|
||||
Return ISO 8601 formatted time.
|
||||
"""
|
||||
return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t)) + \
|
||||
strtimezone()
|
||||
|
||||
|
||||
def strduration (duration):
|
||||
"""return translated and formatted time duration"""
|
||||
"""
|
||||
Return translated and formatted time duration.
|
||||
"""
|
||||
name = _("seconds")
|
||||
if duration > 60:
|
||||
duration = duration / 60
|
||||
|
|
@ -143,7 +164,9 @@ def strduration (duration):
|
|||
|
||||
|
||||
def strtimezone ():
|
||||
"""return timezone info, %z on some platforms, but not supported on all"""
|
||||
"""
|
||||
Return timezone info, %z on some platforms, but not supported on all.
|
||||
"""
|
||||
if time.daylight:
|
||||
zone = time.altzone
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -23,11 +23,14 @@ except ImportError:
|
|||
|
||||
|
||||
class Threader (object):
|
||||
"""A thread generating class"""
|
||||
"""
|
||||
A thread generating class.
|
||||
"""
|
||||
|
||||
def __init__ (self, num=5):
|
||||
"""store maximum number of threads to generate, and initialize
|
||||
an empty thread list
|
||||
"""
|
||||
Store maximum number of threads to generate, and initialize
|
||||
an empty thread list.
|
||||
"""
|
||||
# this allows negative numbers
|
||||
self.threads_max = max(num, 1)
|
||||
|
|
@ -35,32 +38,44 @@ class Threader (object):
|
|||
self.threads = []
|
||||
|
||||
def _acquire (self):
|
||||
"""Wait until we are allowed to start a new thread"""
|
||||
"""
|
||||
Wait until we are allowed to start a new thread.
|
||||
"""
|
||||
while self.active_threads() >= self.threads_max:
|
||||
self._reduce_threads()
|
||||
time.sleep(0.1)
|
||||
|
||||
def _reduce_threads (self):
|
||||
"""remove inactive threads"""
|
||||
"""
|
||||
Remove inactive threads.
|
||||
"""
|
||||
self.threads = [ t for t in self.threads if t.isAlive() ]
|
||||
|
||||
def active_threads (self):
|
||||
"""return number of active threads"""
|
||||
"""
|
||||
Return number of active threads.
|
||||
"""
|
||||
return len(self.threads)
|
||||
|
||||
def finished (self):
|
||||
"""return True if no active threads are left"""
|
||||
"""
|
||||
Return True if no active threads are left.
|
||||
"""
|
||||
if self.threads_max > 0:
|
||||
self._reduce_threads()
|
||||
return self.active_threads() == 0
|
||||
|
||||
def finish (self):
|
||||
"""remove inactive threads"""
|
||||
"""
|
||||
Remove inactive threads.
|
||||
"""
|
||||
self._reduce_threads()
|
||||
# XXX don't know how to stop a thread
|
||||
|
||||
def start_thread (self, func, args):
|
||||
"""Generate a new thread"""
|
||||
"""
|
||||
Generate a new thread.
|
||||
"""
|
||||
if self.threads_max < 1:
|
||||
# threading is disabled
|
||||
func(*args)
|
||||
|
|
@ -71,6 +86,8 @@ class Threader (object):
|
|||
self.threads.append(t)
|
||||
|
||||
def __str__ (self):
|
||||
"""string representation of threader state"""
|
||||
"""
|
||||
String representation of threader state.
|
||||
"""
|
||||
return "Threader with %d threads (max %d)" % \
|
||||
(self.active_threads(), self.threads_max)
|
||||
|
|
|
|||
100
linkcheck/url.py
100
linkcheck/url.py
|
|
@ -1,5 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
"""url utils"""
|
||||
"""
|
||||
Functions for parsing and matching URL strings.
|
||||
"""
|
||||
# Copyright (C) 2000-2005 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
|
|
@ -70,8 +72,9 @@ is_safe_fragment = re.compile("(?i)^%s$" % _safe_fragment_pattern).match
|
|||
|
||||
# snatched from urlparse.py
|
||||
def splitparams (path):
|
||||
"""Split off parameter part from path.
|
||||
Returns tuple (path-without-param, param)
|
||||
"""
|
||||
Split off parameter part from path.
|
||||
Returns tuple (path-without-param, param)
|
||||
"""
|
||||
if '/' in path:
|
||||
i = path.find(';', path.rfind('/'))
|
||||
|
|
@ -83,7 +86,9 @@ def splitparams (path):
|
|||
|
||||
|
||||
def is_safe_js_url (urlstr):
|
||||
"""test javascript URLs"""
|
||||
"""
|
||||
Test javascript URL strings.
|
||||
"""
|
||||
url = list(urlparse.urlsplit(urlstr))
|
||||
if url[0].lower() != 'http':
|
||||
return False
|
||||
|
|
@ -103,7 +108,9 @@ def is_safe_js_url (urlstr):
|
|||
|
||||
|
||||
def is_numeric_port (portstr):
|
||||
"""return True iff portstr is a valid port number"""
|
||||
"""
|
||||
Return True iff portstr is a valid port number.
|
||||
"""
|
||||
if portstr.isdigit():
|
||||
port = int(portstr)
|
||||
# 65536 == 2**16
|
||||
|
|
@ -112,20 +119,25 @@ def is_numeric_port (portstr):
|
|||
|
||||
|
||||
def safe_host_pattern (host):
|
||||
"""return regular expression pattern with given host for URL testing"""
|
||||
"""
|
||||
Return regular expression pattern with given host for URL testing.
|
||||
"""
|
||||
return "(?i)%s://%s%s(#%s)?" % \
|
||||
(_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern)
|
||||
|
||||
|
||||
# XXX better name/implementation for this function
|
||||
def stripsite (url):
|
||||
"""remove scheme and host from URL. return host, newurl"""
|
||||
"""
|
||||
Remove scheme and host from URL. Return host, newurl.
|
||||
"""
|
||||
url = urlparse.urlsplit(url)
|
||||
return url[1], urlparse.urlunsplit((0, 0, url[2], url[3], url[4]))
|
||||
|
||||
|
||||
def parse_qsl (qs, keep_blank_values=0, strict_parsing=0):
|
||||
"""Parse a query given as a string argument.
|
||||
"""
|
||||
Parse a query given as a string argument.
|
||||
|
||||
Arguments:
|
||||
|
||||
|
|
@ -166,8 +178,10 @@ def parse_qsl (qs, keep_blank_values=0, strict_parsing=0):
|
|||
|
||||
|
||||
def idna_encode (host):
|
||||
"""Encode hostname as internationalized domain name (IDN) according
|
||||
to RFC 3490."""
|
||||
"""
|
||||
Encode hostname as internationalized domain name (IDN) according
|
||||
to RFC 3490.
|
||||
"""
|
||||
if host and isinstance(host, unicode):
|
||||
uhost = host.encode('idna').decode('ascii')
|
||||
return uhost, uhost != host
|
||||
|
|
@ -175,7 +189,9 @@ def idna_encode (host):
|
|||
|
||||
|
||||
def url_fix_host (urlparts):
|
||||
"""Unquote and fix hostname. Returns is_idn."""
|
||||
"""
|
||||
Unquote and fix hostname. Returns is_idn.
|
||||
"""
|
||||
urlparts[1], is_idn = idna_encode(urllib.unquote(urlparts[1]).lower())
|
||||
# a leading backslash in path causes urlsplit() to add the
|
||||
# path components up to the first slash to host
|
||||
|
|
@ -216,7 +232,9 @@ def url_fix_host (urlparts):
|
|||
|
||||
|
||||
def url_fix_common_typos (url):
|
||||
"""Fix common typos in given URL like forgotten colon."""
|
||||
"""
|
||||
Fix common typos in given URL like forgotten colon.
|
||||
"""
|
||||
if url.startswith("http//"):
|
||||
url = "http://" + url[6:]
|
||||
elif url.startswith("https//"):
|
||||
|
|
@ -225,13 +243,17 @@ def url_fix_common_typos (url):
|
|||
|
||||
|
||||
def url_fix_mailto_urlsplit (urlparts):
|
||||
"""Split query part of mailto url if found."""
|
||||
"""
|
||||
Split query part of mailto url if found.
|
||||
"""
|
||||
if "?" in urlparts[2]:
|
||||
urlparts[2], urlparts[3] = urlparts[2].split('?', 1)
|
||||
|
||||
|
||||
def url_parse_query (query):
|
||||
"""Parse and re-join the given CGI query."""
|
||||
"""
|
||||
Parse and re-join the given CGI query.
|
||||
"""
|
||||
# if ? is in the query, split it off, seen at msdn.microsoft.com
|
||||
if '?' in query:
|
||||
query, append = query.split('?', 1)
|
||||
|
|
@ -253,10 +275,12 @@ def url_parse_query (query):
|
|||
|
||||
|
||||
def url_norm (url):
|
||||
"""Normalize the given URL which must be quoted. Supports unicode
|
||||
hostnames (IDNA encoding) according to RFC 3490.
|
||||
"""
|
||||
Normalize the given URL which must be quoted. Supports unicode
|
||||
hostnames (IDNA encoding) according to RFC 3490.
|
||||
|
||||
@return (normed url, idna flag)
|
||||
@return: (normed url, idna flag)
|
||||
@rtype: C{tuple} of length two
|
||||
"""
|
||||
urlparts = list(urlparse.urlsplit(url))
|
||||
# scheme
|
||||
|
|
@ -293,8 +317,9 @@ _samedir_ro = re.compile(r"/\./|/\.$")
|
|||
_parentdir_ro = re.compile(r"^/(\.\./)+|/(?!\.\./)[^/]+/\.\.(/|$)")
|
||||
_relparentdir_ro = re.compile(r"^(?!\.\./)[^/]+/\.\.(/|$)")
|
||||
def collapse_segments (path):
|
||||
"""Remove all redundant segments from the given URL path.
|
||||
Precondition: path is an unquoted url path
|
||||
"""
|
||||
Remove all redundant segments from the given URL path.
|
||||
Precondition: path is an unquoted url path
|
||||
"""
|
||||
# replace backslashes
|
||||
# note: this is _against_ the specification (which would require
|
||||
|
|
@ -329,7 +354,9 @@ url_is_absolute = re.compile("^[a-z]+:", re.I).match
|
|||
|
||||
|
||||
def url_quote (url):
|
||||
"""quote given URL"""
|
||||
"""
|
||||
Quote given URL.
|
||||
"""
|
||||
if not url_is_absolute(url):
|
||||
return document_quote(url)
|
||||
urlparts = list(urlparse.urlsplit(url))
|
||||
|
|
@ -351,7 +378,9 @@ def url_quote (url):
|
|||
|
||||
|
||||
def document_quote (document):
|
||||
"""quote given document"""
|
||||
"""
|
||||
Quote given document.
|
||||
"""
|
||||
doc, query = urllib.splitquery(document)
|
||||
doc = urllib.quote(doc, '/=,')
|
||||
if query:
|
||||
|
|
@ -360,15 +389,18 @@ def document_quote (document):
|
|||
|
||||
|
||||
def match_url (url, domainlist):
|
||||
"""return True if host part of url matches an entry in given domain
|
||||
list"""
|
||||
"""
|
||||
Return True if host part of url matches an entry in given domain list.
|
||||
"""
|
||||
if not url:
|
||||
return False
|
||||
return match_host(url_split(url)[1], domainlist)
|
||||
|
||||
|
||||
def match_host (host, domainlist):
|
||||
"""return True if host matches an entry in given domain list"""
|
||||
"""
|
||||
Return True if host matches an entry in given domain list.
|
||||
"""
|
||||
if not host:
|
||||
return False
|
||||
for domain in domainlist:
|
||||
|
|
@ -386,10 +418,11 @@ if os.name == 'nt':
|
|||
_safe_url_chars = _nopathquote_chars + r"a-zA-Z0-9_:\.&#%\?"
|
||||
_safe_url_chars_ro = re.compile(r"^[%s]*$" % _safe_url_chars)
|
||||
def url_needs_quoting (url):
|
||||
"""Check if url needs percent quoting. Note that the method does
|
||||
only check basic character sets, and not any other syntax.
|
||||
The URL might still be syntactically incorrect even when
|
||||
it is properly quoted.
|
||||
"""
|
||||
Check if url needs percent quoting. Note that this method
|
||||
only checks basic character sets, and not any other syntax.
|
||||
The URL might still be syntactically incorrect even when
|
||||
it is properly quoted.
|
||||
"""
|
||||
if url.rstrip() != url:
|
||||
# handle trailing whitespace as a special case
|
||||
|
|
@ -399,9 +432,10 @@ def url_needs_quoting (url):
|
|||
|
||||
|
||||
def url_split (url):
|
||||
"""Split url in a tuple (scheme, hostname, port, document) where
|
||||
hostname is always lowercased.
|
||||
Precondition: url is syntactically correct URI (eg has no whitespace)
|
||||
"""
|
||||
Split url in a tuple (scheme, hostname, port, document) where
|
||||
hostname is always lowercased.
|
||||
Precondition: url is a syntactically correct URI (e.g. has no whitespace)
|
||||
"""
|
||||
scheme, netloc = urllib.splittype(url)
|
||||
host, document = urllib.splithost(netloc)
|
||||
|
|
@ -413,5 +447,7 @@ def url_split (url):
|
|||
|
||||
|
||||
def url_unicode_split (url):
|
||||
"""Like urlparse.urlsplit(), but always returning unicode parts."""
|
||||
"""
|
||||
Like urlparse.urlsplit(), but always returning unicode parts.
|
||||
"""
|
||||
return [unicode(s) for s in urlparse.urlsplit(url)]
|
||||
|
|
|
|||
Loading…
Reference in a new issue