allow empty relative URLs

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2704 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-07-04 20:28:55 +00:00
parent 9eb96bc520
commit 2e207be127
17 changed files with 114 additions and 41 deletions

View file

@ -1,5 +1,12 @@
3.0 "" (released xx.xx.xxxx)
* Allow empty relative URLs. Note that a completely missing URL is
still an error (i.e. <a href=""> is valid, <a href> is an error).
Type: bugfix
Closes: SF bug #
Changed: linkcheck/linkparse.py, linkcheck/logger/*.py,
linkcheck/checker/urlbase.py
* Added checks for more <meta> URL entries, especially favicons.
Type: feature

3
TODO
View file

@ -1,8 +1,5 @@
Possible improvements people could work on:
- [BUGFIX] Accept empty (relative) URIs. Solution: adjust check_syntax()
Note: the empty string ('') seems to be already in the cache (?)
- [FEATURE] Assign numbers to warnings and let the user configure which to
ignore.

50
debian/control.gui vendored Normal file
View file

@ -0,0 +1,50 @@
Source: linkchecker
Section: web
Priority: optional
Maintainer: Bastian Kleineidam <calvin@debian.org>
Build-Depends: python2.4-dev, python, debhelper (>= 4.1.51), gettext, cdbs
Standards-Version: 3.6.1
Package: linkchecker
Architecture: any
Depends: python2.4
Conflicts: linkchecker-ssl
Suggests: apache | httpd, python2.4-psyco, python2.4-optcomplete, python2.4-profiler
Description: check websites and HTML documents for broken links
Provides a command line program and web interface to check links
of websites and HTML documents.
Users preferring a graphical interface can install the linkchecker-gui
package.
Features:
o recursive checking
o multithreaded
o output in colored or normal text, HTML, SQL, CSV or a sitemap
graph in DOT, GML or XML
o HTTP/1.1, HTTPS, FTP, mailto:, news:, nntp:, Gopher, Telnet and local
file links support
o restrict link checking with regular expression filters for URLs
o proxy support
o username/password authorization for HTTP, FTP and Telnet
o robots.txt exclusion protocol support
o Cookie support
o i18n support
Package: linkchecker-gui
Architecture: all
Depends: linkchecker, python2.4-glade2
Description: check websites and HTML documents for broken links (GUI version)
Provides a graphical interface "glinkchecker" to check links of websites
and HTML documents.
Features:
o recursive checking
o multithreaded
o output in colored or normal text, HTML, SQL, CSV or a sitemap
graph in DOT, GML or XML
o HTTP/1.1, HTTPS, FTP, mailto:, news:, nntp:, Gopher, Telnet and local
file links support
o restrict link checking with regular expression filters for URLs
o proxy support
o username/password authorization for HTTP, FTP and Telnet
o robots.txt exclusion protocol support
o Cookie support
o i18n support

View file

@ -206,7 +206,7 @@ def get_url_from (base_url, recursion_level, consumer,
Get url data from given base data.
@param base_url: base url from a link tag
@type base_url: string
@type base_url: string or None
@param recursion_level: current recursion level
@type recursion_level: number
@param consumer: consumer object
@ -222,7 +222,8 @@ def get_url_from (base_url, recursion_level, consumer,
@param name: link name
@type name: string
"""
base_url = linkcheck.strformat.unicode_safe(base_url)
if base_url is not None:
base_url = linkcheck.strformat.unicode_safe(base_url)
if parent_url is not None:
parent_url = linkcheck.strformat.unicode_safe(parent_url)
if base_ref is not None:

View file

@ -93,6 +93,8 @@ class FileUrl (urlbase.UrlBase):
"""
super(FileUrl, self).init(base_ref, base_url, parent_url,
recursion_level, consumer, line, column, name)
if self.base_url is None:
return
base_url = self.base_url
if not (parent_url or base_ref or base_url.startswith("file:")):
base_url = os.path.expanduser(base_url)

View file

@ -286,7 +286,8 @@ class UrlBase (object):
@rtype: bool
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
if not self.base_url:
if (self.base_url is None) or \
(not self.base_url and not self.parent_url):
self.set_result(_("URL is empty"), valid=False)
return
try:
@ -326,7 +327,11 @@ class UrlBase (object):
self.scheme)
self.url = urljoin(self.base_ref, base_url, self.scheme)
elif self.parent_url:
self.url = urljoin(self.parent_url, base_url, self.scheme)
# strip the parent url query and anchor
urlparts = list(urlparse.urlsplit(self.parent_url))
urlparts[3] = urlparts[4] = ""
parent_url = urlparse.urlunsplit(urlparts)
self.url = urljoin(parent_url, base_url, self.scheme)
else:
self.url = base_url
# note: urljoin can unnorm the url path, so norm it again
@ -704,7 +709,8 @@ class UrlBase (object):
Return serialized url check data as unicode string.
"""
sep = linkcheck.strformat.unicode_safe(os.linesep)
assert isinstance(self.base_url, unicode), self
if self.base_url is not None:
assert isinstance(self.base_url, unicode), self
if self.parent_url is not None:
assert isinstance(self.parent_url, unicode), self
if self.base_ref is not None:

View file

@ -61,7 +61,7 @@ class TestLogger (linkcheck.logger.Logger):
Append logger output to self.result.
"""
if self.has_part('url'):
url = u"url %s" % url_data.base_url
url = u"url %s" % (url_data.base_url or u"")
if url_data.cached:
url += u" (cached)"
self.result.append(url)

View file

@ -19,6 +19,10 @@ Just some HTTP links
<a href=""></a>
-->
<!-- empty form URL -->
<form action="" method="GET">
</form>
<!-- multiple links in one tag -->
<applet archive="http.html" src="http.html">
<!-- css urls -->

View file

@ -95,6 +95,11 @@ name html entities
warning Base URL is not properly normed. Normed URL is http://localhost:8001/?quoted=%%FC.
valid
url (cached)
cache key http://localhost:8001/linkcheck/ftests/data/http.html
real url http://localhost:8001/linkcheck/ftests/data/http.html
valid
url http.html (cached)
cache key http://localhost:8001/linkcheck/ftests/data/http.html
real url http://localhost:8001/linkcheck/ftests/data/http.html

View file

@ -207,7 +207,8 @@ class LinkFinder (TagFinder):
codebase = unquote(attrs.get_true('codebase', u''))
else:
codebase = u''
value = unquote(attrs.get_true(attr, u''))
# note: value can be None
value = unquote(attrs.get(attr))
# add link to url list
self.add_link(tag, attr, value, name, codebase)
linkcheck.log.debug(linkcheck.LOG_CHECK,
@ -242,7 +243,7 @@ class LinkFinder (TagFinder):
assert isinstance(attr, unicode), repr(attr)
assert isinstance(name, unicode), repr(name)
assert isinstance(base, unicode), repr(base)
assert isinstance(url, unicode), repr(url)
assert isinstance(url, unicode) or url is None, repr(url)
urls = []
# look for meta refresh
if tag == u'meta':
@ -261,7 +262,7 @@ class LinkFinder (TagFinder):
# no url found
return
for u in urls:
assert isinstance(u, unicode), repr(u)
assert isinstance(u, unicode) or u is None, repr(u)
linkcheck.log.debug(linkcheck.LOG_CHECK,
u"LinkParser add link %s %s %s %s %s", tag, attr, u, name, base)
self.urls.append((u, self.parser.last_lineno(),

View file

@ -99,12 +99,12 @@ class CSVLogger (linkcheck.logger.Logger):
if self.fd is None:
return
row = []
for s in [url_data.base_url, url_data.recursion_level,
url_data.parent_url or "", url_data.base_ref or "",
for s in [url_data.base_url or u"", url_data.recursion_level,
url_data.parent_url or u"", url_data.base_ref or u"",
url_data.result,
os.linesep.join(url_data.warning),
os.linesep.join(url_data.info),
url_data.valid, url_data.url or "",
url_data.valid, url_data.url or u"",
url_data.line, url_data.column,
url_data.name, url_data.dltime,
url_data.dlsize, url_data.checktime,

View file

@ -100,8 +100,8 @@ class GMLLogger (linkcheck.logger.Logger):
for node in self.nodes.values():
if self.nodes.has_key(node.parent_url):
self.writeln(u" edge [")
self.writeln(u' label "%s"' % node.base_url)
if self.has_part("parenturl"):
self.writeln(u' label "%s"' % (node.base_url or u""))
if self.has_part("parenturl") and node.parent_url:
self.writeln(u" source %d" % \
self.nodes[node.parent_url].id)
self.writeln(u" target %d" % node.id)

View file

@ -169,7 +169,7 @@ class HtmlLogger (linkcheck.logger.Logger):
self.writeln(u"<td bgcolor=\""+self.colorurl+u"\">"+
self.part("url")+u"</td>")
self.write(u"<td bgcolor=\""+self.colorurl+u"\">"+
cgi.escape(repr(url_data.base_url)[1:]))
cgi.escape(repr(url_data.base_url or u"")[1:]))
if url_data.cached:
self.write(_(" (cached)"))
self.writeln(u"</td></tr>")

View file

@ -116,15 +116,15 @@ class SQLLogger (linkcheck.logger.Logger):
"%(cached)d"
")%(separator)s" % \
{'table': self.dbname,
'base_url': sqlify(url_data.base_url),
'base_url': sqlify(url_data.base_url or u""),
'recursion_level': url_data.recursion_level,
'url_parent': sqlify((url_data.parent_url or "")),
'base_ref': sqlify((url_data.base_ref or "")),
'url_parent': sqlify((url_data.parent_url or u"")),
'base_ref': sqlify((url_data.base_ref or u"")),
'valid': intify(url_data.valid),
'result': sqlify(url_data.result),
'warning': sqlify(os.linesep.join(url_data.warning)),
'info': sqlify(os.linesep.join(url_data.info)),
'url': sqlify(linkcheck.url.url_quote(url_data.url or "")),
'url': sqlify(linkcheck.url.url_quote(url_data.url or u"")),
'line': url_data.line,
'column': url_data.column,
'name': sqlify(url_data.name),

View file

@ -141,7 +141,7 @@ class TextLogger (linkcheck.logger.Logger):
"""
self.writeln()
self.write(self.part('url') + self.spaces('url'))
txt = unicode(repr(url_data.base_url)[1:])
txt = unicode(repr(url_data.base_url or u"")[1:])
if url_data.cached:
txt += _(" (cached)")
self.writeln(txt, color=self.colorurl)

View file

@ -148,7 +148,7 @@ class XMLLogger (linkcheck.logger.Logger):
self.writeln(u' target="%d">' % node.id)
if self.has_part("url"):
self.writeln(u" <label>%s</label>" % \
xmlquote(node.base_url))
xmlquote(node.base_url or u""))
self.writeln(u" <data>")
if self.has_part("result"):
self.writeln(u" <valid>%d</valid>" % \

View file

@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: calvin@users.sourceforge.net\n"
"POT-Creation-Date: 2005-07-04 01:39+0200\n"
"POT-Creation-Date: 2005-07-04 22:27+0200\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
@ -59,15 +59,15 @@ msgstr ""
msgid "%s URL ignored."
msgstr ""
#: ../linkcheck/checker/fileurl.py:117
#: ../linkcheck/checker/fileurl.py:119
msgid "Added trailing slash to directory."
msgstr ""
#: ../linkcheck/checker/fileurl.py:127
#: ../linkcheck/checker/fileurl.py:129
msgid "directory"
msgstr ""
#: ../linkcheck/checker/fileurl.py:143
#: ../linkcheck/checker/fileurl.py:145
#, python-format
msgid ""
"The URL path %r is not the same as the system path %r. You should always use "
@ -109,16 +109,16 @@ msgstr ""
msgid "Python %s on %s"
msgstr ""
#: ../linkcheck/checker/urlbase.py:290
#: ../linkcheck/checker/urlbase.py:291
msgid "URL is empty"
msgstr ""
#: ../linkcheck/checker/urlbase.py:297
#: ../linkcheck/checker/urlbase.py:298
#, python-format
msgid "Effective URL %r."
msgstr ""
#: ../linkcheck/checker/urlbase.py:313
#: ../linkcheck/checker/urlbase.py:314
#, python-format
msgid ""
"URL %r has a unicode domain name which\n"
@ -126,51 +126,51 @@ msgid ""
" the URL %r instead."
msgstr ""
#: ../linkcheck/checker/urlbase.py:318
#: ../linkcheck/checker/urlbase.py:319
#, python-format
msgid "Base URL is not properly normed. Normed URL is %(url)s."
msgstr ""
#: ../linkcheck/checker/urlbase.py:354
#: ../linkcheck/checker/urlbase.py:359
#, python-format
msgid "URL has invalid port %r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:388
#: ../linkcheck/checker/urlbase.py:393
#, python-format
msgid "URL is located in %s."
msgstr ""
#: ../linkcheck/checker/urlbase.py:403 ../linkcheck/checker/httpurl.py:289
#: ../linkcheck/checker/urlbase.py:408 ../linkcheck/checker/httpurl.py:289
#: ../linkcheck/checker/ignoredurl.py:34
msgid "Outside of domain filter, checked only syntax."
msgstr ""
#: ../linkcheck/checker/urlbase.py:419
#: ../linkcheck/checker/urlbase.py:424
msgid "Hostname not found"
msgstr ""
#: ../linkcheck/checker/urlbase.py:422
#: ../linkcheck/checker/urlbase.py:427
#, python-format
msgid "Bad HTTP response %r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:453
#: ../linkcheck/checker/urlbase.py:458
#, python-format
msgid "could not parse content: %r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:537
#: ../linkcheck/checker/urlbase.py:542
#, python-format
msgid "Anchor #%s not found."
msgstr ""
#: ../linkcheck/checker/urlbase.py:589
#: ../linkcheck/checker/urlbase.py:594
#, python-format
msgid "Found %r in link contents."
msgstr ""
#: ../linkcheck/checker/urlbase.py:598
#: ../linkcheck/checker/urlbase.py:603
#, python-format
msgid "Content size %s is larger than %s."
msgstr ""