2001-03-15 01:19:35 +00:00
|
|
|
"""Base URL handler"""
|
2001-05-23 21:20:44 +00:00
|
|
|
# Copyright (C) 2000,2001 Bastian Kleineidam
|
2001-03-15 01:19:35 +00:00
|
|
|
#
|
2001-05-23 21:20:44 +00:00
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
# (at your option) any later version.
|
2001-03-15 01:19:35 +00:00
|
|
|
#
|
2001-05-23 21:20:44 +00:00
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
2001-03-15 01:19:35 +00:00
|
|
|
#
|
2001-05-23 21:20:44 +00:00
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
|
# along with this program; if not, write to the Free Software
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
2000-11-11 00:38:04 +00:00
|
|
|
|
2002-09-26 15:03:11 +00:00
|
|
|
import sys, re, urlparse, urllib, time, traceback, socket, select
|
2002-11-26 23:27:43 +00:00
|
|
|
#try:
|
|
|
|
|
# from linkcheck import DNS
|
|
|
|
|
#except ImportError:
|
|
|
|
|
# print >>sys.stderr, "You have to install PyDNS from http://pydns.sf.net/"
|
|
|
|
|
# raise SystemExit
|
|
|
|
|
from linkcheck import DNS
|
2002-09-26 15:03:11 +00:00
|
|
|
DNS.DiscoverNameServers()
|
|
|
|
|
|
2002-06-26 20:46:42 +00:00
|
|
|
import Config, StringUtil, linkcheck, linkname, test_support, timeoutsocket
|
2002-11-24 19:53:37 +00:00
|
|
|
from linkparse import LinkParser
|
2001-05-23 21:20:44 +00:00
|
|
|
from debuglevels import *
|
2002-02-24 12:29:35 +00:00
|
|
|
debug = Config.debug
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2002-01-29 21:20:37 +00:00
|
|
|
# helper function for internal errors
def internal_error ():
    """Print a bug report request (including the triggering exception
    and system information) to stderr and exit with status 1.

    Must be called from inside an ``except`` block: it reads the
    current exception via sys.exc_info().
    """
    print >> sys.stderr, linkcheck._("""\n********** Oops, I did it again. *************

You have found an internal error in LinkChecker. Please write a bug report
at http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913
or send mail to %s and include the following information:
1) The URL or file you are testing
2) Your commandline arguments and/or configuration.
3) The system information below.

If you disclose some information because its too private to you thats ok.
I will try to help you nontheless (but you have to give me *something*
I can work with ;).
""") % Config.Email
    # the exception that triggered this report
    type,value = sys.exc_info()[:2]
    print >> sys.stderr, type, value
    traceback.print_exc()
    print_app_info()
    print >> sys.stderr, linkcheck._("\n******** LinkChecker internal error, bailing out ********")
    sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_app_info ():
    """Dump application, Python and relevant environment information
    to stderr (used when assembling an internal error report)."""
    import os
    out = sys.stderr
    print >> out, linkcheck._("System info:")
    print >> out, Config.App
    print >> out, "Python %s on %s" % (sys.version, sys.platform)
    # environment variables that influence locale and proxy behaviour
    for name in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"):
        val = os.getenv(name)
        if val is None:
            continue
        print >> out, name, "=", repr(val)
|
|
|
|
|
|
|
|
|
|
|
2001-07-30 18:46:28 +00:00
|
|
|
# we catch these exceptions, all other exceptions are internal
# or system errors
ExcList = [
    IOError,
    ValueError, # from httplib.py
    linkcheck.error,
    DNS.Error,
    timeoutsocket.Timeout,
    socket.error,
    select.error,
]

# ssl support is optional in the socket module
if hasattr(socket, "sslerror"):
    ExcList.append(socket.sslerror)

# regular expression for port numbers
# BUGFIX: anchored at the end -- port_re is used with .match(), which
# only anchors at the string start, so without "$" an invalid port
# like "80abc" was accepted by buildUrl()
port_re = re.compile(r"\d+$")
|
2001-01-07 13:28:38 +00:00
|
|
|
|
2000-02-26 10:24:46 +00:00
|
|
|
class UrlData:
|
|
|
|
|
"Representing a URL with additional information like validity etc"
|
2001-12-10 13:51:07 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
    def __init__ (self,
                  urlName,
                  recursionLevel,
                  config,
                  parentName = None,
                  baseRef = None,
                  line = 0,
                  column = 0,
                  name = ""):
        """Store the check parameters and initialize the result state.

        urlName -- the url as written in the document (may be relative)
        recursionLevel -- current recursion depth (0 for start urls)
        config -- the configuration object (cache, loggers, options)
        parentName -- url of the document this link was found in
        baseRef -- value of a <base href=...> tag in the parent document
        line, column -- position of the link in the parent document
        name -- the link name/text, if any
        """
        self.urlName = urlName
        self.recursionLevel = recursionLevel
        self.config = config
        self.parentName = parentName
        self.baseRef = baseRef
        # check result strings, updated by setError()/setValid() etc.
        self.errorString = linkcheck._("Error")
        self.validString = linkcheck._("Valid")
        self.warningString = None
        self.infoString = None
        # assume valid until a check fails
        self.valid = 1
        # absolute url, filled in by buildUrl()
        self.url = None
        self.line = line
        self.column = column
        self.name = name
        # timing statistics (seconds)
        self.downloadtime = 0
        self.checktime = 0
        self.cached = 0
        self.urlConnection = None
        # (extern, strict) flags, recomputed by _getExtern()
        self.extern = (1, 0)
        # downloaded content cache, see getContent()
        self.data = None
        self.has_content = 0
        url = get_absolute_url(self.urlName, self.baseRef, self.parentName)
        # assume file link if no scheme is found
        self.scheme = url.split(":", 1)[0] or "file"
|
2001-12-10 13:51:07 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def setError (self, s):
|
2000-02-26 10:24:46 +00:00
|
|
|
self.valid=0
|
2002-02-24 12:29:35 +00:00
|
|
|
self.errorString = linkcheck._("Error")+": "+s
|
2001-12-10 13:51:07 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def setValid (self, s):
|
2000-02-26 10:24:46 +00:00
|
|
|
self.valid=1
|
2002-02-24 12:29:35 +00:00
|
|
|
self.validString = linkcheck._("Valid")+": "+s
|
2001-12-10 13:51:07 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
    def isHtml (self):
        """Return true if the content of this url is parseable HTML.
        The base handler never treats content as HTML (returns 0);
        scheme-specific handlers can override this."""
        return 0
|
2001-12-10 13:51:07 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def setWarning (self, s):
|
2000-02-26 10:24:46 +00:00
|
|
|
if self.warningString:
|
2002-11-25 22:29:07 +00:00
|
|
|
self.warningString += "\n"+s
|
2000-02-26 10:24:46 +00:00
|
|
|
else:
|
|
|
|
|
self.warningString = s
|
|
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def setInfo (self, s):
|
2000-02-26 10:24:46 +00:00
|
|
|
if self.infoString:
|
2001-01-22 23:02:54 +00:00
|
|
|
self.infoString += "\n"+s
|
2000-02-26 10:24:46 +00:00
|
|
|
else:
|
|
|
|
|
self.infoString = s
|
2001-04-28 18:37:10 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def copyFrom (self, urlData):
|
2000-02-26 10:24:46 +00:00
|
|
|
self.errorString = urlData.errorString
|
|
|
|
|
self.validString = urlData.validString
|
|
|
|
|
self.warningString = urlData.warningString
|
|
|
|
|
self.infoString = urlData.infoString
|
|
|
|
|
self.valid = urlData.valid
|
2000-03-19 14:24:33 +00:00
|
|
|
self.downloadtime = urlData.downloadtime
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2001-04-28 18:37:10 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
    def buildUrl (self):
        """Resolve self.urlName against baseRef/parentName into the
        absolute self.url and the parsed self.urlTuple.

        Raises linkcheck.error if the port part of host:port is not
        numeric.
        """
        if self.baseRef:
            # a base reference without a scheme is itself relative and
            # must first be resolved against the parent url
            if ":" not in self.baseRef:
                self.baseRef = urlparse.urljoin(self.parentName, self.baseRef)
            self.url = urlparse.urljoin(self.baseRef, self.urlName)
        elif self.parentName:
            self.url = urlparse.urljoin(self.parentName, self.urlName)
        else:
            # no context: the url name must already be absolute
            self.url = self.urlName
        self.urlTuple = urlparse.urlparse(self.url)
        # make host lowercase
        self.urlTuple = (self.urlTuple[0], self.urlTuple[1].lower(),
                         self.urlTuple[2], self.urlTuple[3], self.urlTuple[4],
                         self.urlTuple[5])
        # re-assemble the url from the normalized tuple
        self.url = urlparse.urlunparse(self.urlTuple)
        # resolve HTML entities
        self.url = StringUtil.unhtmlify(self.url)
        # check host:port syntax
        host = self.urlTuple[1]
        if ":" in host:
            host,port = host.split(":", 1)
            if not port_re.match(port):
                raise linkcheck.error(linkcheck._("URL has invalid port number"))
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def logMe (self):
|
2001-05-23 21:20:44 +00:00
|
|
|
debug(BRING_IT_ON, "logging url")
|
2002-05-04 13:27:02 +00:00
|
|
|
self.config.incrementLinknumber()
|
|
|
|
|
if self.config["verbose"] or not self.valid or \
|
|
|
|
|
(self.warningString and self.config["warnings"]):
|
|
|
|
|
self.config.log_newUrl(self)
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def check (self):
|
2002-01-29 21:20:37 +00:00
|
|
|
try:
|
2002-05-04 13:27:02 +00:00
|
|
|
self._check()
|
2002-01-29 21:20:37 +00:00
|
|
|
except KeyboardInterrupt:
|
2002-10-19 11:46:41 +00:00
|
|
|
raise
|
2002-02-14 15:33:53 +00:00
|
|
|
except (socket.error, select.error):
|
2002-02-08 19:08:28 +00:00
|
|
|
# on Unix, ctrl-c can raise
|
|
|
|
|
# error: (4, 'Interrupted system call')
|
|
|
|
|
type, value = sys.exc_info()[:2]
|
|
|
|
|
if type!=4:
|
|
|
|
|
raise
|
2002-03-28 18:58:54 +00:00
|
|
|
except test_support.Error:
|
|
|
|
|
raise
|
2002-01-29 21:20:37 +00:00
|
|
|
except:
|
|
|
|
|
internal_error()
|
|
|
|
|
|
2002-02-08 16:22:55 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
    def _check (self):
        """The real check logic: syntax, cache lookup, domain filter,
        connection, content, anchors and recursion. Results are stored
        on self, logged via logMe() and cached via putInCache()."""
        debug(BRING_IT_ON, "Checking", self)
        # optional throttling between checks (not for the start url)
        if self.recursionLevel and self.config['wait']:
            debug(BRING_IT_ON, "sleeping for", self.config['wait'], "seconds")
            time.sleep(self.config['wait'])
        t = time.time()
        # check syntax
        debug(BRING_IT_ON, "checking syntax")
        if not self.urlName or self.urlName=="":
            self.setError(linkcheck._("URL is null or empty"))
            self.logMe()
            return
        try:
            self.buildUrl()
            self.extern = self._getExtern()
        except tuple(ExcList):
            type, value, tb = sys.exc_info()
            debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
            self.setError(str(value))
            self.logMe()
            return

        # check the cache
        debug(BRING_IT_ON, "checking cache")
        if self.config.urlCache_has_key(self.getCacheKey()):
            # reuse the previous result for this absolute url
            self.copyFrom(self.config.urlCache_get(self.getCacheKey()))
            self.cached = 1
            self.logMe()
            return

        # apply filter
        debug(BRING_IT_ON, "extern =", self.extern)
        if self.extern[0] and (self.config["strict"] or self.extern[1]):
            # external url: only the syntax was checked above
            self.setWarning(
                  linkcheck._("outside of domain filter, checked only syntax"))
            self.logMe()
            return

        # check connection
        debug(BRING_IT_ON, "checking connection")
        try:
            self.checkConnection()
            if self.urlTuple and self.config["anchors"]:
                # urlTuple[5] is the fragment (anchor) part
                self.checkAnchors(self.urlTuple[5])
        except tuple(ExcList):
            type, value, tb = sys.exc_info()
            debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
            self.setError(str(value))

        # check content
        warningregex = self.config["warningregex"]
        if warningregex and self.valid:
            debug(BRING_IT_ON, "checking content")
            try: self.checkContent(warningregex)
            except tuple(ExcList):
                type, value, tb = sys.exc_info()
                debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
                self.setError(str(value))

        self.checktime = time.time() - t
        # check recursion
        debug(BRING_IT_ON, "checking recursion")
        if self.allowsRecursion():
            try: self.parseUrl()
            except tuple(ExcList):
                type, value, tb = sys.exc_info()
                debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
                self.setError(str(value))
        self.closeConnection()
        self.logMe()
        debug(BRING_IT_ON, "caching")
        self.putInCache()
|
2000-02-26 10:24:46 +00:00
|
|
|
|
|
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def closeConnection (self):
|
2000-02-26 10:24:46 +00:00
|
|
|
# brute force closing
|
2000-03-21 11:38:22 +00:00
|
|
|
if self.urlConnection is not None:
|
|
|
|
|
try: self.urlConnection.close()
|
|
|
|
|
except: pass
|
|
|
|
|
# release variable for garbage collection
|
|
|
|
|
self.urlConnection = None
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def putInCache (self):
|
2000-02-26 10:24:46 +00:00
|
|
|
cacheKey = self.getCacheKey()
|
|
|
|
|
if cacheKey and not self.cached:
|
2002-05-04 13:27:02 +00:00
|
|
|
self.config.urlCache_set(cacheKey, self)
|
2000-02-26 10:24:46 +00:00
|
|
|
self.cached = 1
|
|
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def getCacheKey (self):
|
2000-02-26 10:24:46 +00:00
|
|
|
if self.urlTuple:
|
|
|
|
|
return urlparse.urlunparse(self.urlTuple)
|
|
|
|
|
return None
|
|
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
    def checkConnection (self):
        """Open self.url and store the connection object on
        self.urlConnection (closed later by closeConnection())."""
        self.urlConnection = urllib.urlopen(self.url)
|
|
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def allowsRecursion (self):
|
2002-11-24 19:53:37 +00:00
|
|
|
# note: isHtml() might not be working if valid is false, so be
|
|
|
|
|
# sure to test it first.
|
2000-02-26 10:24:46 +00:00
|
|
|
return self.valid and \
|
|
|
|
|
self.isHtml() and \
|
|
|
|
|
not self.cached and \
|
2002-05-04 13:27:02 +00:00
|
|
|
self.recursionLevel < self.config["recursionlevel"] and \
|
2002-09-26 14:50:50 +00:00
|
|
|
not self.extern[0]
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def checkAnchors (self, anchor):
|
2002-11-24 19:53:37 +00:00
|
|
|
debug(HURT_ME_PLENTY, "checking anchor", anchor)
|
|
|
|
|
if not (self.valid and anchor and self.isHtml()):
|
2000-02-26 10:24:46 +00:00
|
|
|
return
|
2002-11-24 19:53:37 +00:00
|
|
|
h = LinkParser(self.getContent(), {'a': ['name']})
|
2002-11-25 22:29:07 +00:00
|
|
|
for cur_anchor,line,column,name,base in h.urls:
|
2000-02-26 10:24:46 +00:00
|
|
|
if cur_anchor == anchor:
|
|
|
|
|
return
|
2002-05-14 21:19:38 +00:00
|
|
|
self.setWarning(linkcheck._("anchor #%s not found") % anchor)
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def _getExtern (self):
|
|
|
|
|
if not (self.config["externlinks"] or self.config["internlinks"]):
|
2002-09-26 14:50:50 +00:00
|
|
|
return (0, 0)
|
2000-02-26 10:24:46 +00:00
|
|
|
# deny and allow external checking
|
2002-05-05 15:02:58 +00:00
|
|
|
Config.debug(HURT_ME_PLENTY, "Url", self.url)
|
2002-05-04 13:27:02 +00:00
|
|
|
if self.config["denyallow"]:
|
2002-05-05 15:02:58 +00:00
|
|
|
for entry in self.config["externlinks"]:
|
|
|
|
|
Config.debug(HURT_ME_PLENTY, "Extern entry", entry)
|
|
|
|
|
match = entry['pattern'].search(self.url)
|
|
|
|
|
if (entry['negate'] and not match) or \
|
|
|
|
|
(match and not entry['negate']):
|
|
|
|
|
return (1, entry['strict'])
|
|
|
|
|
for entry in self.config["internlinks"]:
|
|
|
|
|
Config.debug(HURT_ME_PLENTY, "Intern entry", entry)
|
|
|
|
|
match = entry['pattern'].search(self.url)
|
|
|
|
|
if (entry['negate'] and not match) or \
|
|
|
|
|
(match and not entry['negate']):
|
2002-09-26 14:50:50 +00:00
|
|
|
return (1, 0)
|
|
|
|
|
return (0, 0)
|
2000-11-20 22:25:55 +00:00
|
|
|
else:
|
2002-05-05 15:02:58 +00:00
|
|
|
for entry in self.config["internlinks"]:
|
|
|
|
|
Config.debug(HURT_ME_PLENTY, "Intern entry", entry)
|
|
|
|
|
match = entry['pattern'].search(self.url)
|
|
|
|
|
if (entry['negate'] and not match) or \
|
|
|
|
|
(match and not entry['negate']):
|
2002-09-26 14:50:50 +00:00
|
|
|
return (0, 0)
|
2002-05-05 15:02:58 +00:00
|
|
|
for entry in self.config["externlinks"]:
|
|
|
|
|
Config.debug(HURT_ME_PLENTY, "Extern entry", entry)
|
|
|
|
|
match = entry['pattern'].search(self.url)
|
|
|
|
|
if (entry['negate'] and not match) or \
|
|
|
|
|
(match and not entry['negate']):
|
|
|
|
|
return (1, entry['strict'])
|
2000-11-20 22:25:55 +00:00
|
|
|
return (1,0)
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
    def getContent (self):
        """Precondition: urlConnection is an opened URL.

        Read the url content on first call (measuring the download
        time) and return the cached data on subsequent calls."""
        if not self.has_content:
            self.has_content = 1
            t = time.time()
            self.data = self.urlConnection.read()
            self.downloadtime = time.time() - t
        return self.data
|
|
|
|
|
|
|
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
    def checkContent (self, warningregex):
        """Add a warning if the configured warning regex matches
        anywhere in the downloaded content."""
        match = warningregex.search(self.getContent())
        if match:
            self.setWarning(linkcheck._("Found %s in link contents") % \
              `match.group()`)
|
2000-04-24 22:07:48 +00:00
|
|
|
|
|
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
    def parseUrl (self):
        """Parse the downloaded content for links and append each one
        to the config's url queue for recursive checking."""
        debug(BRING_IT_ON, "Parsing recursively into", self)
        # search for a possible base reference
        h = LinkParser(self.getContent(), {'base': ['href']})
        baseRef = None
        if len(h.urls)>=1:
            # first tuple element of a parsed link is its url
            baseRef = h.urls[0][0]
            if len(h.urls)>1:
                self.setWarning(linkcheck._(
                         "more than one <base> tag found, using only the first one"))
        # now parse all links in the document
        h = LinkParser(self.getContent())
        for url,line,column,name,codebase in h.urls:
            # a codebase attribute overrides the document's <base> ref
            if codebase:
                base = codebase
            else:
                base = baseRef
            self.config.appendUrl(GetUrlDataFrom(url,
                      self.recursionLevel+1, self.config,
                      parentName=self.url, baseRef=base,
                      line=line, column=column, name=name))
|
2001-01-07 13:28:38 +00:00
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def __str__ (self):
|
2001-01-07 13:28:38 +00:00
|
|
|
return ("%s link\n"
|
|
|
|
|
"urlname=%s\n"
|
|
|
|
|
"parentName=%s\n"
|
|
|
|
|
"baseRef=%s\n"
|
|
|
|
|
"cached=%s\n"
|
|
|
|
|
"recursionLevel=%s\n"
|
|
|
|
|
"urlConnection=%s\n"
|
|
|
|
|
"line=%s\n"
|
2002-11-25 22:29:07 +00:00
|
|
|
"column=%s\n"
|
2001-01-07 13:28:38 +00:00
|
|
|
"name=%s" % \
|
2001-12-10 13:51:07 +00:00
|
|
|
(self.scheme, self.urlName, self.parentName, self.baseRef,
|
2002-11-25 22:29:07 +00:00
|
|
|
self.cached, self.recursionLevel, self.urlConnection, self.line,
|
|
|
|
|
self.column, self.name))
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2000-04-24 22:07:48 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def _getUserPassword (self):
|
|
|
|
|
for auth in self.config["authentication"]:
|
2001-05-23 21:20:44 +00:00
|
|
|
if auth['pattern'].match(self.url):
|
|
|
|
|
return auth['user'], auth['password']
|
2000-06-19 08:43:18 +00:00
|
|
|
return None,None
|
2000-03-07 22:47:50 +00:00
|
|
|
|
|
|
|
|
|
2000-02-26 10:24:46 +00:00
|
|
|
from FileUrlData import FileUrlData
|
2001-12-10 13:51:07 +00:00
|
|
|
from IgnoredUrlData import IgnoredUrlData, ignored_schemes_re
|
2000-02-26 10:24:46 +00:00
|
|
|
from FtpUrlData import FtpUrlData
|
|
|
|
|
from GopherUrlData import GopherUrlData
|
|
|
|
|
from HttpUrlData import HttpUrlData
|
|
|
|
|
from HttpsUrlData import HttpsUrlData
|
|
|
|
|
from MailtoUrlData import MailtoUrlData
|
|
|
|
|
from TelnetUrlData import TelnetUrlData
|
2000-03-30 17:10:35 +00:00
|
|
|
from NntpUrlData import NntpUrlData
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2001-12-10 13:51:07 +00:00
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def get_absolute_url (urlName, baseRef, parentName):
    """Search for the absolute url to detect the link type. This does not
    join any url fragments together! Returns the url in lower case to
    simplify urltype matching."""
    # first candidate that already carries a scheme wins
    for candidate in (urlName, baseRef, parentName):
        if candidate and ":" in candidate:
            return candidate.lower()
    return ""
|
|
|
|
|
|
|
|
|
|
|
2002-05-04 13:27:02 +00:00
|
|
|
def GetUrlDataFrom (urlName, recursionLevel, config, parentName=None,
                    baseRef=None, line=0, column=0, name=None):
    """Factory function: select the UrlData subclass appropriate for
    the url's scheme and construct it with the given arguments."""
    url = get_absolute_url(urlName, baseRef, parentName)
    # test scheme
    if url.startswith("http:"):
        klass = HttpUrlData
    elif url.startswith("ftp:"):
        klass = FtpUrlData
    elif url.startswith("file:"):
        klass = FileUrlData
    elif url.startswith("telnet:"):
        klass = TelnetUrlData
    elif url.startswith("mailto:"):
        klass = MailtoUrlData
    elif url.startswith("gopher:"):
        klass = GopherUrlData
    elif url.startswith("https:"):
        klass = HttpsUrlData
    # BUGFIX: also accept the correctly spelled "nntp:" scheme; the
    # old code only tested the misspelled "nttp:", which is kept for
    # backwards compatibility
    elif url.startswith("nntp:") or \
         url.startswith("nttp:") or \
         url.startswith("news:") or \
         url.startswith("snews:"):
        klass = NntpUrlData
    # application specific links are ignored
    elif ignored_schemes_re.search(url):
        klass = IgnoredUrlData
    # assume local file
    else:
        klass = FileUrlData
    return klass(urlName, recursionLevel, config, parentName, baseRef,
                 line=line, column=column, name=name)
|