new base test, other tests updated

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@602 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2002-11-18 19:48:28 +00:00
parent d44718f1f3
commit 89d2d18e8d
7 changed files with 75 additions and 30 deletions

View file

@ -1,3 +1,10 @@
1.6.7
* Removed the check for the <applet> tag's codebase attribute, but honor it
when checking applet links
* Handle the <applet> tag's archive attribute as a comma-separated list
* Fix a deep flaw in tag searching, which ignored tags with more
than one link attribute in them.
1.6.6
* Use the new HTTPConnection/HTTPResponse interface of httplib
Closes: SF bug #634679

View file

@ -100,7 +100,7 @@ _linkMatcher = r"""
# ripped mainly from HTML::Tagset.pm
LinkTags = (
(['a'], ['href']),
(['applet'], ['archive', 'codebase', 'src']),
(['applet'], ['archive', 'src']),
(['area'], ['href']),
(['bgsound'], ['src']),
(['blockquote'], ['cite']),
@ -117,7 +117,7 @@ LinkTags = (
(['isindex'], ['action']),
(['layer'], ['background', 'src']),
(['link'], ['href']),
(['object'], ['classid', 'codebase', 'data', 'archive', 'usemap']),
(['object'], ['classid', 'data', 'archive', 'usemap']),
(['q'], ['cite']),
(['script'], ['src', 'for']),
(['body', 'table', 'td', 'th', 'tr'], ['background']),
@ -131,11 +131,11 @@ _refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
LinkPatterns = []
for _tags,_attrs in LinkTags:
_tag = '(%s)'%'|'.join(_tags)
_attr = '(%s)'%'|'.join(_attrs)
LinkPatterns.append({'pattern': re.compile(_linkMatcher % (_tag, _attr),
re.VERBOSE),
'tags': _tags,
'attrs': _attrs})
for _attr in _attrs:
LinkPatterns.append({'pattern': re.compile(_linkMatcher %
(_tag, _attr), re.VERBOSE),
'tags': _tags,
'attrs': _attrs})
AnchorPattern = {
'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE),
'tags': ['a'],
@ -148,8 +148,6 @@ BasePattern = {
'attrs': ['href'],
}
#CommentPattern = re.compile("<!--.*?--\s*>", re.DOTALL)
# Workaround for Python 2.0 re module bug
CommentPatternBegin = re.compile(r"<!--")
CommentPatternEnd = re.compile(r"--\s*>")
@ -226,6 +224,8 @@ class UrlData:
def buildUrl (self):
if self.baseRef:
if ":" not in self.baseRef:
self.baseRef = urlparse.urljoin(self.parentName, self.baseRef)
self.url = urlparse.urljoin(self.baseRef, self.urlName)
elif self.parentName:
self.url = urlparse.urljoin(self.parentName, self.urlName)
@ -385,7 +385,7 @@ class UrlData:
if not (anchor!="" and self.isHtml() and self.valid):
return
self.getContent()
for cur_anchor,line,name in self.searchInForTag(AnchorPattern):
for cur_anchor,line,name,base in self.searchInForTag(AnchorPattern):
if cur_anchor == anchor:
return
self.setWarning(linkcheck._("anchor #%s not found") % anchor)
@ -479,27 +479,40 @@ class UrlData:
if len(bases)>=1:
baseRef = bases[0][0]
if len(bases)>1:
self.setWarning(linkcheck._("more than one base tag found"))
self.setWarning(linkcheck._("more than one <base> tag found, using only the first one"))
# search for tags and add found tags to URL queue
for pattern in LinkPatterns:
urls = self.searchInForTag(pattern)
for url,line,name in urls:
for url,line,name,codebase in urls:
if codebase:
base = codebase
else:
base = baseRef
self.config.appendUrl(GetUrlDataFrom(url,
self.recursionLevel+1, self.config, self.url, baseRef, line, name))
self.recursionLevel+1, self.config,
self.url, base, line, name))
def searchInForTag (self, pattern):
debug(HURT_ME_PLENTY, "Searching for tags", `pattern['tags']`,
"attributes", `pattern['attrs']`)
tags = pattern['tags']
attrs = pattern['attrs']
debug(HURT_ME_PLENTY, "Searching for <%s %s=value>"%(tags, attrs))
urls = []
index = 0
if 'a' in pattern['tags'] and 'href' in pattern['attrs']:
if 'applet' in tags and ('archive' in attrs or 'src' in attrs):
codebasetag = 'applet'
elif 'object' in tags and \
('classid' in attrs or 'data' in attrs or 'archive' in attrs):
codebasetag = 'object'
else:
codebasetag = None
if 'a' in tags and 'href' in attrs:
tag = 'a'
elif 'img' in pattern['tags']:
elif 'img' in tags:
tag = 'img'
else:
tag = ''
index = 0
while 1:
try:
match = pattern['pattern'].search(self.getContent(), index)
@ -507,14 +520,27 @@ class UrlData:
self.setError(linkcheck._("""Could not parse HTML content (%s).
You may have a syntax error.
LinkChecker is skipping the remaining content for the link type
<%s %s>.""") % (msg, "|".join(pattern['tags']), "|".join(pattern['attrs'])))
<%s %s>.""") % (msg, "|".join(tags), "|".join(attrs)))
break
if not match: break
index = match.end()
if self.is_in_comment(match.start()): continue
start = match.start()
if self.is_in_comment(start): continue
# strip quotes
url = StringUtil.stripQuotes(match.group('value'))
if 'meta' in pattern['tags']:
# look for applet and object codebase
codebase = None
if codebasetag:
cbr = re.compile(_linkMatcher%(codebasetag, "codebase"),
re.VERBOSE)
codebase = cbr.search(self.getContent()[start:])
if codebase and codebase.start()==0:
codebase = StringUtil.stripQuotes(codebase.group('value'))
codebase = StringUtil.unhtmlify(codebase)
else:
codebase = None
# look for meta refresh
elif 'meta' in pattern['tags']:
metamatch = _refresh_re.match(url)
if metamatch:
url = metamatch.group("url")
@ -528,7 +554,7 @@ LinkChecker is skipping the remaining content for the link type
name = self.searchInForName(tag, match.start(), match.end())
debug(HURT_ME_PLENTY, "Found", `url`, "name", `name`,
"at line", lineno)
urls.append((url, lineno, name))
urls.append((url, lineno, name, codebase))
return urls
@ -575,7 +601,9 @@ from NntpUrlData import NntpUrlData
def get_absolute_url (urlName, baseRef, parentName):
"""search for the absolute url"""
"""Search for the absolute url to detect the link type. This does not
join any url fragments together! Returns the url in lower case to
simplify urltype matching."""
if urlName and ":" in urlName:
return urlName.lower()
elif baseRef and ":" in baseRef:

View file

@ -4,7 +4,7 @@
<!-- legal -->
<a href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>
<a href="mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla">2</a>
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com">3</a>
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40wummel.company.com">3</a>
<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>">4</a>
<a href="mailto:">6</a>
<a href="mailto:o'hara@cs.uni-sb.de">5</a>

View file

@ -1,8 +1,19 @@
test_base
url file:///home/calvin/projects/linkchecker/test/html/base1.html
valid
url file:///home/calvin/projects/linkchecker/test/html/base2.html
valid
url file:///home/calvin/projects/linkchecker/test/html/codebase.html
valid
url misc.html
valid
url misc.html
cached
valid
url test.txt
baseurl file:///home/calvin/projects/linkchecker/test/html/base/
valid
url test.txt
cached
baseurl file:///home/calvin/projects/linkchecker/test/html/base/
valid

View file

@ -4,13 +4,13 @@ valid
url ftp:/ftp.debian.org/pub
error
url ftp://ftp.debian.org/pub
info 220 saens.debian.org FTP server (vsftpd)
info 220 raff.debian.org FTP server (vsftpd)
valid
url ftp://ftp.debian.org//pub
info 220 saens.debian.org FTP server (vsftpd)
info 220 raff.debian.org FTP server (vsftpd)
valid
url ftp://ftp.debian.org////////pub
info 220 saens.debian.org FTP server (vsftpd)
info 220 raff.debian.org FTP server (vsftpd)
valid
url ftp:///ftp.debian.org/pub
cached

View file

@ -8,9 +8,8 @@ valid
url mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla
name 2
valid
url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com
url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40wummel.company.com
name 3
warning No MX mail host for company.com found
valid
url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>
name 4

View file

@ -7,7 +7,7 @@ config["anchors"] = 1
config["verbose"] = 1
config.disableThreading()
htmldir = "test/html"
for file in ('base1.html',):
for file in ('base1.html','base2.html', 'codebase.html'):
url = os.path.join(htmldir, file)
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0, config))
linkcheck.checkUrls(config)