2004-07-07 18:15:17 +00:00
|
|
|
# -*- coding: iso-8859-1 -*-
|
2010-03-07 10:59:18 +00:00
|
|
|
# Copyright (C) 2000-2010 Bastian Kleineidam
|
2004-07-07 18:15:17 +00:00
|
|
|
#
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
# (at your option) any later version.
|
|
|
|
|
#
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
#
|
2009-07-24 21:58:20 +00:00
|
|
|
# You should have received a copy of the GNU General Public License along
|
|
|
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
2005-01-19 15:08:02 +00:00
|
|
|
"""
|
|
|
|
|
Main functions for link checking.
|
|
|
|
|
"""
|
2004-07-07 18:15:17 +00:00
|
|
|
|
2004-09-20 17:48:52 +00:00
|
|
|
import os
|
|
|
|
|
import cgi
|
2008-04-24 09:44:18 +00:00
|
|
|
import logging
|
2004-09-20 17:48:52 +00:00
|
|
|
import urllib
|
2008-05-09 06:16:03 +00:00
|
|
|
from .. import strformat, url as urlutil
|
2004-07-07 18:15:17 +00:00
|
|
|
|
2004-07-19 08:58:59 +00:00
|
|
|
|
2004-08-16 19:20:53 +00:00
|
|
|
def absolute_url (base_url, base_ref, parent_url):
    """
    Search for the absolute url to detect the link type. This does not
    join any url fragments together!

    @param base_url: base url from a link tag
    @type base_url: string or None
    @param base_ref: base url from <base> tag
    @type base_ref: string or None
    @param parent_url: url of parent document
    @type parent_url: string or None
    """
    # Precedence: link url, then <base> url, then parent document url.
    # The first candidate that is both non-empty and absolute wins.
    for candidate in (base_url, base_ref, parent_url):
        if candidate and urlutil.url_is_absolute(candidate):
            return candidate
    # No absolute url found among the candidates.
    return u""
|
2004-07-26 11:43:11 +00:00
|
|
|
|
|
|
|
|
|
2006-05-13 13:44:52 +00:00
|
|
|
def get_url_from (base_url, recursion_level, aggregate,
                  parent_url=None, base_ref=None, line=0, column=0,
                  name=u""):
    """
    Get url data from given base data.

    @param base_url: base url from a link tag
    @type base_url: string or None
    @param recursion_level: current recursion level
    @type recursion_level: number
    @param aggregate: aggregate object
    @type aggregate: aggregate.Consumer
    @param parent_url: parent url
    @type parent_url: string or None
    @param base_ref: base url from <base> tag
    @type base_ref: string or None
    @param line: line number
    @type line: number
    @param column: column number
    @type column: number
    @param name: link name
    @type name: string
    """
    # Normalize all incoming url strings to unicode before any comparison.
    if base_url is not None:
        base_url = strformat.unicode_safe(base_url)
    if parent_url is not None:
        parent_url = strformat.unicode_safe(parent_url)
    if base_ref is not None:
        base_ref = strformat.unicode_safe(base_ref)
    name = strformat.unicode_safe(name)
    # Lowercased only for scheme-prefix matching below; the original
    # (case-preserving) base_url is what gets passed to the checker class.
    url = absolute_url(base_url, base_ref, parent_url).lower()
    if not (url or name):
        # use filename as base url, with slash as path separator
        # Guard against base_url being None here, which would otherwise
        # raise AttributeError on .replace().
        name = (base_url or u"").replace("\\", "/")
    klass = get_urlclass_from(url)
    return klass(base_url, recursion_level, aggregate,
                 parent_url=parent_url, base_ref=base_ref,
                 line=line, column=column, name=name)
|
|
|
|
|
|
|
|
|
|
|
2007-11-28 18:46:50 +00:00
|
|
|
def get_urlclass_from (url):
    """Return checker class for given URL."""
    # Scheme prefix -> checker class, checked in the same order as the
    # original dispatch chain. startswith() accepts tuples, which covers
    # the three news-style schemes in one entry.
    scheme_classes = (
        ("http:", httpurl.HttpUrl),
        ("ftp:", ftpurl.FtpUrl),
        ("file:", fileurl.FileUrl),
        ("telnet:", telneturl.TelnetUrl),
        ("mailto:", mailtourl.MailtoUrl),
        ("https:", httpsurl.HttpsUrl),
        (("nntp:", "news:", "snews:"), nntpurl.NntpUrl),
    )
    for prefix, klass in scheme_classes:
        if url.startswith(prefix):
            return klass
    if unknownurl.is_unknown_url(url):
        # unknown url
        return unknownurl.UnknownUrl
    # assume local file
    return fileurl.FileUrl
|
2004-09-20 17:48:52 +00:00
|
|
|
|
2004-11-03 21:29:25 +00:00
|
|
|
|
2004-09-20 17:48:52 +00:00
|
|
|
def get_index_html (urls):
    """
    Construct artificial index.html from given URLs.

    @param urls: URL strings
    @type urls: iterator of string
    """
    anchors = []
    for entry in urls:
        # Escaped once for use as the visible link text.
        text = cgi.escape(entry)
        try:
            # Percent-quote first, then HTML-escape for the href attribute.
            href = cgi.escape(urllib.quote(entry))
        except KeyError:
            # Some unicode entries raise KeyError.
            href = text
        anchors.append('<a href="%s">%s</a>' % (href, text))
    document = ["<html>", "<body>"] + anchors + ["</body>", "</html>"]
    return os.linesep.join(document)
|
2008-04-24 09:44:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class StoringHandler (logging.Handler):
    """Store all emitted log messages in a size-limited list.

    Used by the CSS syntax checker."""

    def __init__ (self, maxrecords=100):
        """Initialize an empty record store holding at most maxrecords entries.

        @param maxrecords: maximum number of stored records
        @type maxrecords: number
        """
        logging.Handler.__init__(self)
        # List of stored logging.LogRecord objects, oldest first.
        self.storage = []
        # Upper bound on len(self.storage).
        self.maxrecords = maxrecords

    def emit (self, record):
        """Store the given record, keeping the list within maxrecords.

        NOTE(review): when the list is full this pops from the END, i.e.
        it discards the most recently stored record rather than the
        oldest — confirm this is the intended eviction policy.
        """
        is_full = len(self.storage) >= self.maxrecords
        if is_full:
            self.storage.pop()
        self.storage.append(record)
|
2008-05-09 06:16:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# all the URL classes
|
2009-01-24 17:34:18 +00:00
|
|
|
from . import (fileurl, unknownurl, ftpurl, httpurl,
|
2008-05-09 06:16:03 +00:00
|
|
|
httpsurl, mailtourl, telneturl, nntpurl)
|