tests, linkname

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@475 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2002-06-09 15:32:14 +00:00
parent b718fd6a18
commit 94a5fb8ea3
9 changed files with 81 additions and 34 deletions

View file

@ -57,6 +57,7 @@ deb_unsigned: cleandeb
files: locale
env http_proxy="" $(PYTHON) $(PACKAGE) $(LCOPTS) -i$(HOST) http://$(HOST)/
for f in linkchecker-out.*; do gzip --best $$f; done
VERSION:
echo $(VERSION) > VERSION
@ -67,7 +68,7 @@ VERSION-DEVEL:
upload: distclean dist files VERSION
scp debian/changelog $(HTMLDIR)/changes.txt
scp README $(HTMLDIR)/readme.txt
scp linkchecker-out.* $(HTMLDIR)
scp linkchecker-out.*.gz $(HTMLDIR)
scp VERSION $(HTMLDIR)/raw/
#scp dist/* $(HTMLDIR)/
ncftpput upload.sourceforge.net /incoming dist/* && read -p "Make new SF file releases and then press Enter:"

2
debian/changelog vendored
View file

@ -1,6 +1,6 @@
linkchecker (1.5.5) unstable; urgency=low
* linkcheck/linkname.py: fix linkname regular expressions (from 1.4.8)
* linkcheck/linkname.py: fix linkname regular expressions (from 1.4.9)
* linkchecker: documentation typos
* linkcheck/__init__.py: use getLinkPat function for all config
entries, not just commandline

View file

@ -77,16 +77,16 @@ _linkMatcher = r"""
\s* # whitespace
%s # tag name
\s+ # whitespace
([^"'>]|"[^"]*"|'[^']*')* # skip leading attributes
([^"'>]|"[^"\n]*"|'[^'\n]*')* # skip leading attributes
%s # attrib name
\s* # whitespace
= # equal sign
\s* # whitespace
(?P<value> # attribute value
"[^"]*" | # in double quotes
'[^']*' | # in single quotes
"[^"\n]*" | # in double quotes
'[^'\n]*' | # in single quotes
[^\s>]+) # unquoted
([^"'>]|"[^"]*"|'[^']*')* # skip trailing attributes
([^"'>]|"[^"\n]*"|'[^'\n]*')* # skip trailing attributes
> # close tag
"""
@ -123,22 +123,22 @@ LinkTags = (
LinkPatterns = []
for tags,attrs in LinkTags:
attr = '(%s)'%'|'.join(attrs)
tag = '(%s)'%'|'.join(tags)
attr = '(%s)'%'|'.join(attrs)
LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr),
re.VERBOSE|re.DOTALL),
'tag': tag,
'attr': attr})
re.VERBOSE),
'tags': tags,
'attrs': attrs})
AnchorPattern = {
'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE|re.DOTALL),
'tag': 'a',
'attr': 'name',
'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE),
'tags': ['a'],
'attrs': ['name'],
}
BasePattern = {
'pattern': re.compile(_linkMatcher % ("base", "href"), re.VERBOSE),
'tag': 'base',
'attr': 'href',
'tags': ['base'],
'attrs': ['href'],
}
#CommentPattern = re.compile("<!--.*?--\s*>", re.DOTALL)
@ -483,10 +483,16 @@ class UrlData:
def searchInForTag (self, pattern):
debug(HURT_ME_PLENTY, "Searching for tag", `pattern['tag']`,
"attribute", `pattern['attr']`)
debug(HURT_ME_PLENTY, "Searching for tags", `pattern['tags']`,
"attributes", `pattern['attrs']`)
urls = []
index = 0
if 'a' in pattern['tags'] and 'href' in pattern['attrs']:
tag = 'a'
elif 'img' in pattern['tags']:
tag = 'img'
else:
tag = ''
while 1:
match = pattern['pattern'].search(self.getContent(), index)
if not match: break
@ -498,19 +504,19 @@ class UrlData:
url = StringUtil.unhtmlify(url)
lineno= StringUtil.getLineNumber(self.getContent(), match.start())
# extra feature: get optional name for this bookmark
name = self.searchInForName(pattern['tag'], pattern['attr'],
match.start(), match.end())
debug(HURT_ME_PLENTY, "Found", `url`, "at line", lineno)
name = self.searchInForName(tag, match.start(), match.end())
debug(HURT_ME_PLENTY, "Found", `url`, "name", `name`,
"at line", lineno)
urls.append((url, lineno, name))
return urls
def searchInForName (self, tag, attr, start, end):
def searchInForName (self, tag, start, end):
name=""
if tag=='img':
name = linkname.image_name(self.getContent()[start:end])
elif tag=='a' and attr=='href':
if tag=='a':
name = linkname.href_name(self.getContent()[end:])
elif tag=='img':
name = linkname.image_name(self.getContent()[start:end])
return name

View file

@ -18,7 +18,7 @@ import re, StringUtil
imgtag_re = re.compile(r"""(?i)\s+alt\s*=\s*(?P<name>("[^"\n]*"|'[^'\n]*'|[^\s>]+))""")
img_re = re.compile(r"""(?i)<\s*img\s+("[^"\n]*"|'[^'\n]*'|[^>]+)+>""")
href_re = re.compile(r"""(?i)(?P<name>("[^"\n]*"|'[^'\n]*'|[^<]+|<(?!/a\s*>))*)</a\s*>""")
endtag_re = re.compile(r"""(?i)</a\s*>""")
def image_name(txt):
name = ""
@ -34,15 +34,13 @@ def image_name(txt):
def href_name(txt):
name = ""
mo = href_re.search(txt)
if mo:
#print "DEBUG:", `mo.group(0)`
name = mo.group('name').strip()
if img_re.search(name):
return image_name(name)
name = StringUtil.remove_markup(name)
name = StringUtil.unhtmlify(name)
#print "NAME:", `name`
endtag = endtag_re.search(txt)
if not endtag: return name
name = txt[:endtag.start()]
if img_re.search(name):
return image_name(name)
name = StringUtil.remove_markup(name)
name = StringUtil.unhtmlify(name)
return name
_tests = (
@ -53,6 +51,10 @@ _tests = (
"test<</a>",
"test</</a>",
"test</a</a>",
"test",
"\n",
"",
'"</a>"foo',
)
def _test ():

View file

@ -6,22 +6,30 @@ valid
url file:///home/calvin/projects/linkchecker/test/html/file.asc
valid
url http.html
name relative url
valid
url http.html#isnix
name bad anchor
warning anchor #isnix not found
valid
url http.html#iswas
name good anchor
valid
url file:///etc/group
name good file
valid
url file://etc/group
name bad file
error
url file:/etc/group
cached
name good file
valid
url file:etc/group
name bad file
error
url file:/etc/
name good dir
valid
url file:///etc/group
cached

View file

@ -2,44 +2,59 @@ test_http
url file:///home/calvin/projects/linkchecker/test/html/http.html
valid
url http://www.garantiertnixgutt.bla
name bad url
warning Path is empty
error
url http://www.heise.de
name ok
warning Path is empty
valid
url http:/www.heise.de
name one slash
error
url http:www.heise.de
name no slash
error
url http://
name no url
warning Path is empty
error
url http:/
cached
name no url, one slash
warning Path is empty
error
url http:
cached
name no url, no slash
warning Path is empty
error
url http://www.blubb.de/stalter&sohn
name unquoted ampersand
error
url http://slashdot.org/
name unquoted
valid
url http://fsinfo.cs.uni-sb.de/~calvin/#isnix
name invalid anchor
warning anchor #isnix not found
valid
url HtTP://WWW.hEIsE.DE
cached
name should be cached
warning Path is empty
valid
url HTTP://WWW.HEISE.DE
cached
name should be cached
warning Path is empty
valid
url http://www.heise.de/?quoted=ü
name html entities
valid
url illegalquote1
name no beginning quote
error
url illegalquote2
name no ending quote
error

View file

@ -2,4 +2,5 @@ test_https
url file:///home/calvin/projects/linkchecker/test/html/https.html
valid
url https://sourceforge.net/
name https
valid

View file

@ -2,31 +2,43 @@ test_mail
url file:///home/calvin/projects/linkchecker/test/html/mail.html
valid
url mailto:calvin@LocalHost?subject=Hallo&to=michi
name 1
warning No MX mail host for LocalHost found
valid
url mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla
name 2
valid
url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com
name 3
warning No MX mail host for company.com found
valid
url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>
name 4
valid
url mailto:
name 6
warning No adresses found
valid
url mailto:o'hara@cs.uni-sb.de
name 5
valid
url mailto:?to=calvin@studcs.uni-sb.de&subject=blubb&cc=calvin_cc@studcs.uni-sb.de&CC=calvin_CC@studcs.uni-sb.de
name ...
valid
url mailto:news-admins@freshmeat.net?subject=Re:%20[fm%20#11093]%20(news-admins)%20Submission%20report%20-%20Pretty%20CoLoRs
name ...
valid
url mailto:jan@jan-dittberner.de?subject=test
name ...
valid
url mailto:a@d?subject=äöü
name 5
warning No MX mail host for d found
valid
url mailto:calvin@cs.uni-sb.de?subject=Halli hallo
name _
valid
url mailto:Bastian Kleineidam <calvin@host1?foo=bar>
name 3
warning No MX mail host for host1 found
valid

View file

@ -2,8 +2,10 @@ test_misc
url file:///home/calvin/projects/linkchecker/test/html/misc.html
valid
url hutzli:nixgutt
name bad scheme
error
url javascript:loadthis()
name javascript url
warning Javascript url ignored
valid
url telnet:localhost