tests, linkname

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@475 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-17 02:51:07 +00:00 · 2002-06-09 15:32:14 +00:00 · 2002-06-09 15:32:14 +00:00 · 94a5fb8ea3
commit 94a5fb8ea3
parent b718fd6a18
9 changed files with 81 additions and 34 deletions
--- a/3
+++ b/3
@ -57,6 +57,7 @@ deb_unsigned: cleandeb

 files:	locale
 	env http_proxy="" $(PYTHON) $(PACKAGE) $(LCOPTS) -i$(HOST) http://$(HOST)/
+	for f in linkchecker-out.*; do gzip --best $$f; done

 VERSION:
 	echo $(VERSION) > VERSION
@ -67,7 +68,7 @@ VERSION-DEVEL:
 upload: distclean dist files VERSION
 	scp debian/changelog $(HTMLDIR)/changes.txt
 	scp README $(HTMLDIR)/readme.txt
-	scp linkchecker-out.* $(HTMLDIR)
+	scp linkchecker-out.*.gz $(HTMLDIR)
 	scp VERSION $(HTMLDIR)/raw/
 	#scp dist/* $(HTMLDIR)/
 	ncftpput upload.sourceforge.net /incoming dist/* && read -p "Make new SF file releases and then press Enter:"
--- a/debian/changelog
+++ b/debian/changelog
@ -1,6 +1,6 @@
 linkchecker (1.5.5) unstable; urgency=low

-  * linkcheck/linkname.py: fix linkname regular expressions (from 1.4.8)
+  * linkcheck/linkname.py: fix linkname regular expressions (from 1.4.9)
  * linkchecker: documentation typos
  * linkcheck/__init__.py: use getLinkPat function for all config
    entries, not just commandline
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@ -77,16 +77,16 @@ _linkMatcher = r"""
    \s*            # whitespace
    %s             # tag name
    \s+            # whitespace
-    ([^"'>]|"[^"]*"|'[^']*')*         # skip leading attributes
+    ([^"'>]|"[^"\n]*"|'[^'\n]*')*         # skip leading attributes
    %s             # attrib name
    \s*            # whitespace
    =              # equal sign
    \s*            # whitespace
    (?P<value>     # attribute value
-     "[^"]*" |     # in double quotes
-     '[^']*' |     # in single quotes
+     "[^"\n]*" |   # in double quotes
+     '[^'\n]*' |   # in single quotes
     [^\s>]+)      # unquoted
-    ([^"'>]|"[^"]*"|'[^']*')*          # skip trailing attributes
+    ([^"'>]|"[^"\n]*"|'[^'\n]*')*         # skip trailing attributes
    >              # close tag
    """

@ -123,22 +123,22 @@ LinkTags = (

 LinkPatterns = []
 for tags,attrs in LinkTags:
-    attr = '(%s)'%'|'.join(attrs)
    tag = '(%s)'%'|'.join(tags)
+    attr = '(%s)'%'|'.join(attrs)
    LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr),
-                                               re.VERBOSE|re.DOTALL),
-                         'tag': tag,
-                         'attr': attr})
+                                               re.VERBOSE),
+                         'tags': tags,
+                         'attrs': attrs})
 AnchorPattern = {
-    'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE|re.DOTALL),
-    'tag': 'a',
-    'attr': 'name',
+    'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE),
+    'tags': ['a'],
+    'attrs': ['name'],
 }

 BasePattern = {
    'pattern': re.compile(_linkMatcher % ("base", "href"), re.VERBOSE),
-    'tag': 'base',
-    'attr': 'href',
+    'tags': ['base'],
+    'attrs': ['href'],
 }

 #CommentPattern = re.compile("<!--.*?--\s*>", re.DOTALL)
@ -483,10 +483,16 @@ class UrlData:


    def searchInForTag (self, pattern):
-        debug(HURT_ME_PLENTY, "Searching for tag", `pattern['tag']`,
-	      "attribute", `pattern['attr']`)
+        debug(HURT_ME_PLENTY, "Searching for tags", `pattern['tags']`,
+	      "attributes", `pattern['attrs']`)
        urls = []
        index = 0
+        if 'a' in pattern['tags'] and 'href' in pattern['attrs']:
+            tag = 'a'
+        elif 'img' in pattern['tags']:
+            tag = 'img'
+        else:
+            tag = ''
        while 1:
            match = pattern['pattern'].search(self.getContent(), index)
            if not match: break
@ -498,19 +504,19 @@ class UrlData:
            url = StringUtil.unhtmlify(url)
            lineno= StringUtil.getLineNumber(self.getContent(), match.start())
            # extra feature: get optional name for this bookmark
-            name = self.searchInForName(pattern['tag'], pattern['attr'],
-	                                match.start(), match.end())
-            debug(HURT_ME_PLENTY, "Found", `url`, "at line", lineno)
+            name = self.searchInForName(tag, match.start(), match.end())
+            debug(HURT_ME_PLENTY, "Found", `url`, "name", `name`,
+                  "at line", lineno)
            urls.append((url, lineno, name))
        return urls


-    def searchInForName (self, tag, attr, start, end):
+    def searchInForName (self, tag, start, end):
        name=""
-        if tag=='img':
-            name = linkname.image_name(self.getContent()[start:end])
-        elif tag=='a' and attr=='href':
+        if tag=='a':
            name = linkname.href_name(self.getContent()[end:])
+        elif tag=='img':
+            name = linkname.image_name(self.getContent()[start:end])
        return name


--- a/linkcheck/linkname.py
+++ b/linkcheck/linkname.py
@ -18,7 +18,7 @@ import re, StringUtil

 imgtag_re = re.compile(r"""(?i)\s+alt\s*=\s*(?P<name>("[^"\n]*"|'[^'\n]*'|[^\s>]+))""")
 img_re = re.compile(r"""(?i)<\s*img\s+("[^"\n]*"|'[^'\n]*'|[^>]+)+>""")
-href_re = re.compile(r"""(?i)(?P<name>("[^"\n]*"|'[^'\n]*'|[^<]+|<(?!/a\s*>))*)</a\s*>""")
+endtag_re = re.compile(r"""(?i)</a\s*>""")

 def image_name(txt):
    name = ""
@ -34,15 +34,13 @@ def image_name(txt):

 def href_name(txt):
    name = ""
-    mo = href_re.search(txt)
-    if mo:
-        #print "DEBUG:", `mo.group(0)`
-        name = mo.group('name').strip()
-        if img_re.search(name):
-            return image_name(name)
-        name = StringUtil.remove_markup(name)
-        name = StringUtil.unhtmlify(name)
-    #print "NAME:", `name`
+    endtag = endtag_re.search(txt)
+    if not endtag: return name
+    name = txt[:endtag.start()]
+    if img_re.search(name):
+        return image_name(name)
+    name = StringUtil.remove_markup(name)
+    name = StringUtil.unhtmlify(name)
    return name

 _tests = (
@ -53,6 +51,10 @@ _tests = (
    "test<</a>",
    "test</</a>",
    "test</a</a>",
+    "test",
+    "\n",
+    "",
+    '"</a>"foo',
 )

 def _test ():
--- a/test/output/test_file
+++ b/test/output/test_file
@ -6,22 +6,30 @@ valid
 url file:///home/calvin/projects/linkchecker/test/html/file.asc
 valid
 url http.html
+name relative url
 valid
 url http.html#isnix
+name bad anchor
 warning anchor #isnix not found
 valid
 url http.html#iswas
+name good anchor
 valid
 url file:///etc/group
+name good file
 valid
 url file://etc/group
+name bad file
 error
 url file:/etc/group
 cached
+name good file
 valid
 url file:etc/group
+name bad file
 error
 url file:/etc/
+name good dir
 valid
 url file:///etc/group
 cached
--- a/test/output/test_http
+++ b/test/output/test_http
@ -2,44 +2,59 @@ test_http
 url file:///home/calvin/projects/linkchecker/test/html/http.html
 valid
 url http://www.garantiertnixgutt.bla
+name bad url
 warning Path is empty
 error
 url http://www.heise.de
+name ok
 warning Path is empty
 valid
 url http:/www.heise.de
+name one slash
 error
 url http:www.heise.de
+name no slash
 error
 url http://
+name no url
 warning Path is empty
 error
 url http:/
 cached
+name no url, one slash
 warning Path is empty
 error
 url http:
 cached
+name no url, no slash
 warning Path is empty
 error
 url http://www.blubb.de/stalter&sohn
+name unquoted ampersand
 error
 url http://slashdot.org/
+name unquoted
 valid
 url http://fsinfo.cs.uni-sb.de/~calvin/#isnix
+name invalid anchor
 warning anchor #isnix not found
 valid
 url HtTP://WWW.hEIsE.DE
 cached
+name should be cached
 warning Path is empty
 valid
 url HTTP://WWW.HEISE.DE
 cached
+name should be cached
 warning Path is empty
 valid
 url http://www.heise.de/?quoted=ü
+name html entities
 valid
 url illegalquote1
+name no beginning quote
 error
 url illegalquote2
+name no ending quote
 error
--- a/test/output/test_https
+++ b/test/output/test_https
@ -2,4 +2,5 @@ test_https
 url file:///home/calvin/projects/linkchecker/test/html/https.html
 valid
 url https://sourceforge.net/
+name https
 valid
--- a/test/output/test_mail
+++ b/test/output/test_mail
@ -2,31 +2,43 @@ test_mail
 url file:///home/calvin/projects/linkchecker/test/html/mail.html
 valid
 url mailto:calvin@LocalHost?subject=Hallo&to=michi
+name 1
 warning No MX mail host for LocalHost found
 valid
 url mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla
+name 2
 valid
 url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com
+name 3
 warning No MX mail host for company.com found
 valid
 url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>
+name 4
 valid
 url mailto:
+name 6
 warning No adresses found
 valid
 url mailto:o'hara@cs.uni-sb.de
+name 5
 valid
 url mailto:?to=calvin@studcs.uni-sb.de&subject=blubb&cc=calvin_cc@studcs.uni-sb.de&CC=calvin_CC@studcs.uni-sb.de
+name ...
 valid
 url mailto:news-admins@freshmeat.net?subject=Re:%20[fm%20#11093]%20(news-admins)%20Submission%20report%20-%20Pretty%20CoLoRs
+name ...
 valid
 url mailto:jan@jan-dittberner.de?subject=test
+name ...
 valid
 url mailto:a@d?subject=äöü
+name 5
 warning No MX mail host for d found
 valid
 url mailto:calvin@cs.uni-sb.de?subject=Halli hallo
+name _
 valid
 url mailto:Bastian Kleineidam <calvin@host1?foo=bar>
+name 3
 warning No MX mail host for host1 found
 valid
--- a/test/output/test_misc
+++ b/test/output/test_misc
@ -2,8 +2,10 @@ test_misc
 url file:///home/calvin/projects/linkchecker/test/html/misc.html
 valid
 url hutzli:nixgutt
+name bad scheme
 error
 url javascript:loadthis()
+name javascript url
 warning Javascript url ignored
 valid
 url telnet:localhost