new base test, other tests updated

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@602 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-09 15:14:45 +00:00 · 2002-11-18 19:48:28 +00:00 · 2002-11-18 19:48:28 +00:00 · 89d2d18e8d
commit 89d2d18e8d
parent d44718f1f3
7 changed files with 75 additions and 30 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+1.6.7
+  * Removed check for the <applet> tag's codebase attribute, but honor it
+    when checking applet links
+  * Handle the <applet> tag's archive attribute as a comma-separated list
+  * Fix a deep flaw in tag searching, which ignored tags with more
+    than one link attribute in them.
+
 1.6.6
  * Use the new HTTPConnection/HTTPResponse interface of httplib
    Closes: SF bug #634679
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@ -100,7 +100,7 @@ _linkMatcher = r"""
 # ripped mainly from HTML::Tagset.pm
 LinkTags = (
    (['a'],       ['href']),
-    (['applet'],  ['archive', 'codebase', 'src']),
+    (['applet'],  ['archive', 'src']),
    (['area'],    ['href']),
    (['bgsound'], ['src']),
    (['blockquote'], ['cite']),
@ -117,7 +117,7 @@ LinkTags = (
    (['isindex'], ['action']),
    (['layer'],   ['background', 'src']),
    (['link'],    ['href']),
-    (['object'],  ['classid', 'codebase', 'data', 'archive', 'usemap']),
+    (['object'],  ['classid', 'data', 'archive', 'usemap']),
    (['q'],       ['cite']),
    (['script'],  ['src', 'for']),
    (['body', 'table', 'td', 'th', 'tr'], ['background']),
@ -131,11 +131,11 @@ _refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
 LinkPatterns = []
 for _tags,_attrs in LinkTags:
    _tag = '(%s)'%'|'.join(_tags)
-    _attr = '(%s)'%'|'.join(_attrs)
-    LinkPatterns.append({'pattern': re.compile(_linkMatcher % (_tag, _attr),
-                                               re.VERBOSE),
-                         'tags': _tags,
-                         'attrs': _attrs})
+    for _attr in _attrs:
+        LinkPatterns.append({'pattern': re.compile(_linkMatcher %
+            (_tag, _attr), re.VERBOSE),
+            'tags': _tags,
+            'attrs': _attrs})
 AnchorPattern = {
    'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE),
    'tags': ['a'],
@ -148,8 +148,6 @@ BasePattern = {
    'attrs': ['href'],
 }

-#CommentPattern = re.compile("<!--.*?--\s*>", re.DOTALL)
-# Workaround for Python 2.0 re module bug
 CommentPatternBegin = re.compile(r"<!--")
 CommentPatternEnd = re.compile(r"--\s*>")

@ -226,6 +224,8 @@ class UrlData:

    def buildUrl (self):
        if self.baseRef:
+            if ":" not in self.baseRef:
+                self.baseRef = urlparse.urljoin(self.parentName, self.baseRef)
            self.url = urlparse.urljoin(self.baseRef, self.urlName)
        elif self.parentName:
            self.url = urlparse.urljoin(self.parentName, self.urlName)
@ -385,7 +385,7 @@ class UrlData:
        if not (anchor!="" and self.isHtml() and self.valid):
            return
        self.getContent()
-        for cur_anchor,line,name in self.searchInForTag(AnchorPattern):
+        for cur_anchor,line,name,base in self.searchInForTag(AnchorPattern):
            if cur_anchor == anchor:
                return
        self.setWarning(linkcheck._("anchor #%s not found") % anchor)
@ -479,27 +479,40 @@ class UrlData:
        if len(bases)>=1:
            baseRef = bases[0][0]
            if len(bases)>1:
-                self.setWarning(linkcheck._("more than one base tag found"))
+                self.setWarning(linkcheck._("more than one <base> tag found, using only the first one"))

        # search for tags and add found tags to URL queue
        for pattern in LinkPatterns:
            urls = self.searchInForTag(pattern)
-            for url,line,name in urls:
+            for url,line,name,codebase in urls:
+                if codebase:
+                    base = codebase
+                else:
+                    base = baseRef
                self.config.appendUrl(GetUrlDataFrom(url,
-         self.recursionLevel+1, self.config, self.url, baseRef, line, name))
+                                      self.recursionLevel+1, self.config,
+                                      self.url, base, line, name))


    def searchInForTag (self, pattern):
-        debug(HURT_ME_PLENTY, "Searching for tags", `pattern['tags']`,
-	      "attributes", `pattern['attrs']`)
+        tags = pattern['tags']
+        attrs = pattern['attrs']
+        debug(HURT_ME_PLENTY, "Searching for <%s %s=value>"%(tags, attrs))
        urls = []
-        index = 0
-        if 'a' in pattern['tags'] and 'href' in pattern['attrs']:
+        if 'applet' in tags and ('archive' in attrs or 'src' in attrs):
+            codebasetag = 'applet'
+        elif 'object' in tags and \
+            ('classid' in attrs or 'data' in attrs or 'archive' in attrs):
+            codebasetag = 'object'
+        else:
+            codebasetag = None
+        if 'a' in tags and 'href' in attrs:
            tag = 'a'
-        elif 'img' in pattern['tags']:
+        elif 'img' in tags:
            tag = 'img'
        else:
            tag = ''
+        index = 0
        while 1:
            try:
                match = pattern['pattern'].search(self.getContent(), index)
@ -507,14 +520,27 @@ class UrlData:
                self.setError(linkcheck._("""Could not parse HTML content (%s).
 You may have a syntax error.
 LinkChecker is skipping the remaining content for the link type
-<%s %s>.""") % (msg, "|".join(pattern['tags']), "|".join(pattern['attrs'])))
+<%s %s>.""") % (msg, "|".join(tags), "|".join(attrs)))
                break
            if not match: break
            index = match.end()
-            if self.is_in_comment(match.start()): continue
+            start = match.start()
+            if self.is_in_comment(start): continue
            # strip quotes
            url = StringUtil.stripQuotes(match.group('value'))
-	    if 'meta' in pattern['tags']:
+            # look for applet and object codebase
+            codebase = None
+            if codebasetag:
+                cbr = re.compile(_linkMatcher%(codebasetag, "codebase"),
+                                 re.VERBOSE)
+                codebase = cbr.search(self.getContent()[start:])
+                if codebase and codebase.start()==0:
+                    codebase = StringUtil.stripQuotes(codebase.group('value'))
+                    codebase = StringUtil.unhtmlify(codebase)
+                else:
+                    codebase = None
+            # look for meta refresh
+	    elif 'meta' in pattern['tags']:
 	        metamatch = _refresh_re.match(url)
 		if metamatch:
                    url = metamatch.group("url")
@ -528,7 +554,7 @@ LinkChecker is skipping the remaining content for the link type
            name = self.searchInForName(tag, match.start(), match.end())
            debug(HURT_ME_PLENTY, "Found", `url`, "name", `name`,
                  "at line", lineno)
-            urls.append((url, lineno, name))
+            urls.append((url, lineno, name, codebase))
        return urls


@ -575,7 +601,9 @@ from NntpUrlData import NntpUrlData


 def get_absolute_url (urlName, baseRef, parentName):
-    """search for the absolute url"""
+    """Search for the absolute url to detect the link type. This does not
+       join any url fragments together! Returns the url in lower case to
+       simplify urltype matching."""
    if urlName and ":" in urlName:
        return urlName.lower()
    elif baseRef and ":" in baseRef:
--- a/test/html/mail.html
+++ b/test/html/mail.html
@ -4,7 +4,7 @@
 <!-- legal -->
 <a href=mailto:calvin@LocalHost?subject=Hallo&to=michi>1</a>
 <a href="mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla">2</a>
-<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com">3</a>
+<a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40wummel.company.com">3</a>
 <a href="mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>">4</a>
 <a href="mailto:">6</a>
 <a href="mailto:o'hara@cs.uni-sb.de">5</a>
--- a/test/output/test_base
+++ b/test/output/test_base
@ -1,8 +1,19 @@
 test_base
 url file:///home/calvin/projects/linkchecker/test/html/base1.html
 valid
+url file:///home/calvin/projects/linkchecker/test/html/base2.html
+valid
+url file:///home/calvin/projects/linkchecker/test/html/codebase.html
+valid
 url misc.html
 valid
 url misc.html
 cached
 valid
+url test.txt
+baseurl file:///home/calvin/projects/linkchecker/test/html/base/
+valid
+url test.txt
+cached
+baseurl file:///home/calvin/projects/linkchecker/test/html/base/
+valid
--- a/test/output/test_ftp
+++ b/test/output/test_ftp
@ -4,13 +4,13 @@ valid
 url ftp:/ftp.debian.org/pub
 error
 url ftp://ftp.debian.org/pub
-info 220 saens.debian.org FTP server (vsftpd)
+info 220 raff.debian.org FTP server (vsftpd)
 valid
 url ftp://ftp.debian.org//pub
-info 220 saens.debian.org FTP server (vsftpd)
+info 220 raff.debian.org FTP server (vsftpd)
 valid
 url ftp://ftp.debian.org////////pub
-info 220 saens.debian.org FTP server (vsftpd)
+info 220 raff.debian.org FTP server (vsftpd)
 valid
 url ftp:///ftp.debian.org/pub
 cached
--- a/test/output/test_mail
+++ b/test/output/test_mail
@ -8,9 +8,8 @@ valid
 url mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla
 name 2
 valid
-url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com
+url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40wummel.company.com
 name 3
-warning No MX mail host for company.com found
 valid
 url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>
 name 4
--- a/test/test_base.py
+++ b/test/test_base.py
@ -7,7 +7,7 @@ config["anchors"] = 1
 config["verbose"] = 1
 config.disableThreading()
 htmldir = "test/html"
-for file in ('base1.html',):
+for file in ('base1.html','base2.html', 'codebase.html'):
    url = os.path.join(htmldir, file)
    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0, config))
 linkcheck.checkUrls(config)