allow empty relative URLs

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2704 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-07-04 20:28:55 +00:00
parent 9eb96bc520
commit 2e207be127
17 changed files with 114 additions and 41 deletions

View file

@ -1,5 +1,12 @@
3.0 "" (released xx.xx.xxxx)
* Allow empty relative URLs. Note that a completely missing URL is
still an error (i.e. <a href=""> is valid, <a href> is an error).
Type: bugfix
Closes: SF bug #
Changed: linkcheck/linkparse.py, linkcheck/logger/*.py,
linkcheck/checker/urlbase.py
* Added checks for more <meta> URL entries, especially favicons.
Type: feature

3
TODO
View file

@ -1,8 +1,5 @@
Possible improvements people could work on:
- [BUGFIX] Accept empty (relative) URIs. Solution: adjust check_syntax()
Note: the empty string ('') seems to be already in the cache (?)
- [FEATURE] Assign numbers to warnings and let the user configure which to
ignore.

50
debian/control.gui vendored Normal file
View file

@ -0,0 +1,50 @@
Source: linkchecker
Section: web
Priority: optional
Maintainer: Bastian Kleineidam <calvin@debian.org>
Build-Depends: python2.4-dev, python, debhelper (>= 4.1.51), gettext, cdbs
Standards-Version: 3.6.1
Package: linkchecker
Architecture: any
Depends: python2.4
Conflicts: linkchecker-ssl
Suggests: apache | httpd, python2.4-psyco, python2.4-optcomplete, python2.4-profiler
Description: check websites and HTML documents for broken links
Provides a command line program and web interface to check links
of websites and HTML documents.
Users preferring a graphical interface can install the linkchecker-gui
package.
Features:
o recursive checking
o multithreaded
o output in colored or normal text, HTML, SQL, CSV or a sitemap
graph in DOT, GML or XML
o HTTP/1.1, HTTPS, FTP, mailto:, news:, nntp:, Gopher, Telnet and local
file links support
o restrict link checking with regular expression filters for URLs
o proxy support
o username/password authorization for HTTP, FTP and Telnet
o robots.txt exclusion protocol support
o Cookie support
o i18n support
Package: linkchecker-gui
Architecture: all
Depends: linkchecker, python2.4-glade2
Description: check websites and HTML documents for broken links (GUI version)
Provides a graphical interface "glinkchecker" to check links of websites
and HTML documents.
Features:
o recursive checking
o multithreaded
o output in colored or normal text, HTML, SQL, CSV or a sitemap
graph in DOT, GML or XML
o HTTP/1.1, HTTPS, FTP, mailto:, news:, nntp:, Gopher, Telnet and local
file links support
o restrict link checking with regular expression filters for URLs
o proxy support
o username/password authorization for HTTP, FTP and Telnet
o robots.txt exclusion protocol support
o Cookie support
o i18n support

View file

@ -206,7 +206,7 @@ def get_url_from (base_url, recursion_level, consumer,
Get url data from given base data.
@param base_url: base url from a link tag
@type base_url: string
@type base_url: string or None
@param recursion_level: current recursion level
@type recursion_level: number
@param consumer: consumer object
@ -222,7 +222,8 @@ def get_url_from (base_url, recursion_level, consumer,
@param name: link name
@type name: string
"""
base_url = linkcheck.strformat.unicode_safe(base_url)
if base_url is not None:
base_url = linkcheck.strformat.unicode_safe(base_url)
if parent_url is not None:
parent_url = linkcheck.strformat.unicode_safe(parent_url)
if base_ref is not None:

View file

@ -93,6 +93,8 @@ class FileUrl (urlbase.UrlBase):
"""
super(FileUrl, self).init(base_ref, base_url, parent_url,
recursion_level, consumer, line, column, name)
if self.base_url is None:
return
base_url = self.base_url
if not (parent_url or base_ref or base_url.startswith("file:")):
base_url = os.path.expanduser(base_url)

View file

@ -286,7 +286,8 @@ class UrlBase (object):
@rtype: bool
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
if not self.base_url:
if (self.base_url is None) or \
(not self.base_url and not self.parent_url):
self.set_result(_("URL is empty"), valid=False)
return
try:
@ -326,7 +327,11 @@ class UrlBase (object):
self.scheme)
self.url = urljoin(self.base_ref, base_url, self.scheme)
elif self.parent_url:
self.url = urljoin(self.parent_url, base_url, self.scheme)
# strip the parent url query and anchor
urlparts = list(urlparse.urlsplit(self.parent_url))
urlparts[3] = urlparts[4] = ""
parent_url = urlparse.urlunsplit(urlparts)
self.url = urljoin(parent_url, base_url, self.scheme)
else:
self.url = base_url
# note: urljoin can unnorm the url path, so norm it again
@ -704,7 +709,8 @@ class UrlBase (object):
Return serialized url check data as unicode string.
"""
sep = linkcheck.strformat.unicode_safe(os.linesep)
assert isinstance(self.base_url, unicode), self
if self.base_url is not None:
assert isinstance(self.base_url, unicode), self
if self.parent_url is not None:
assert isinstance(self.parent_url, unicode), self
if self.base_ref is not None:

View file

@ -61,7 +61,7 @@ class TestLogger (linkcheck.logger.Logger):
Append logger output to self.result.
"""
if self.has_part('url'):
url = u"url %s" % url_data.base_url
url = u"url %s" % (url_data.base_url or u"")
if url_data.cached:
url += u" (cached)"
self.result.append(url)

View file

@ -19,6 +19,10 @@ Just some HTTP links
<a href=""></a>
-->
<!-- empty form URL -->
<form action="" method="GET">
</form>
<!-- multiple links in one tag -->
<applet archive="http.html" src="http.html">
<!-- css urls -->

View file

@ -95,6 +95,11 @@ name html entities
warning Base URL is not properly normed. Normed URL is http://localhost:8001/?quoted=%%FC.
valid
url (cached)
cache key http://localhost:8001/linkcheck/ftests/data/http.html
real url http://localhost:8001/linkcheck/ftests/data/http.html
valid
url http.html (cached)
cache key http://localhost:8001/linkcheck/ftests/data/http.html
real url http://localhost:8001/linkcheck/ftests/data/http.html

View file

@ -207,7 +207,8 @@ class LinkFinder (TagFinder):
codebase = unquote(attrs.get_true('codebase', u''))
else:
codebase = u''
value = unquote(attrs.get_true(attr, u''))
# note: value can be None
value = unquote(attrs.get(attr))
# add link to url list
self.add_link(tag, attr, value, name, codebase)
linkcheck.log.debug(linkcheck.LOG_CHECK,
@ -242,7 +243,7 @@ class LinkFinder (TagFinder):
assert isinstance(attr, unicode), repr(attr)
assert isinstance(name, unicode), repr(name)
assert isinstance(base, unicode), repr(base)
assert isinstance(url, unicode), repr(url)
assert isinstance(url, unicode) or url is None, repr(url)
urls = []
# look for meta refresh
if tag == u'meta':
@ -261,7 +262,7 @@ class LinkFinder (TagFinder):
# no url found
return
for u in urls:
assert isinstance(u, unicode), repr(u)
assert isinstance(u, unicode) or u is None, repr(u)
linkcheck.log.debug(linkcheck.LOG_CHECK,
u"LinkParser add link %s %s %s %s %s", tag, attr, u, name, base)
self.urls.append((u, self.parser.last_lineno(),

View file

@ -99,12 +99,12 @@ class CSVLogger (linkcheck.logger.Logger):
if self.fd is None:
return
row = []
for s in [url_data.base_url, url_data.recursion_level,
url_data.parent_url or "", url_data.base_ref or "",
for s in [url_data.base_url or u"", url_data.recursion_level,
url_data.parent_url or u"", url_data.base_ref or u"",
url_data.result,
os.linesep.join(url_data.warning),
os.linesep.join(url_data.info),
url_data.valid, url_data.url or "",
url_data.valid, url_data.url or u"",
url_data.line, url_data.column,
url_data.name, url_data.dltime,
url_data.dlsize, url_data.checktime,

View file

@ -100,8 +100,8 @@ class GMLLogger (linkcheck.logger.Logger):
for node in self.nodes.values():
if self.nodes.has_key(node.parent_url):
self.writeln(u" edge [")
self.writeln(u' label "%s"' % node.base_url)
if self.has_part("parenturl"):
self.writeln(u' label "%s"' % (node.base_url or u""))
if self.has_part("parenturl") and node.parent_url:
self.writeln(u" source %d" % \
self.nodes[node.parent_url].id)
self.writeln(u" target %d" % node.id)

View file

@ -169,7 +169,7 @@ class HtmlLogger (linkcheck.logger.Logger):
self.writeln(u"<td bgcolor=\""+self.colorurl+u"\">"+
self.part("url")+u"</td>")
self.write(u"<td bgcolor=\""+self.colorurl+u"\">"+
cgi.escape(repr(url_data.base_url)[1:]))
cgi.escape(repr(url_data.base_url or u"")[1:]))
if url_data.cached:
self.write(_(" (cached)"))
self.writeln(u"</td></tr>")

View file

@ -116,15 +116,15 @@ class SQLLogger (linkcheck.logger.Logger):
"%(cached)d"
")%(separator)s" % \
{'table': self.dbname,
'base_url': sqlify(url_data.base_url),
'base_url': sqlify(url_data.base_url or u""),
'recursion_level': url_data.recursion_level,
'url_parent': sqlify((url_data.parent_url or "")),
'base_ref': sqlify((url_data.base_ref or "")),
'url_parent': sqlify((url_data.parent_url or u"")),
'base_ref': sqlify((url_data.base_ref or u"")),
'valid': intify(url_data.valid),
'result': sqlify(url_data.result),
'warning': sqlify(os.linesep.join(url_data.warning)),
'info': sqlify(os.linesep.join(url_data.info)),
'url': sqlify(linkcheck.url.url_quote(url_data.url or "")),
'url': sqlify(linkcheck.url.url_quote(url_data.url or u"")),
'line': url_data.line,
'column': url_data.column,
'name': sqlify(url_data.name),

View file

@ -141,7 +141,7 @@ class TextLogger (linkcheck.logger.Logger):
"""
self.writeln()
self.write(self.part('url') + self.spaces('url'))
txt = unicode(repr(url_data.base_url)[1:])
txt = unicode(repr(url_data.base_url or u"")[1:])
if url_data.cached:
txt += _(" (cached)")
self.writeln(txt, color=self.colorurl)

View file

@ -148,7 +148,7 @@ class XMLLogger (linkcheck.logger.Logger):
self.writeln(u' target="%d">' % node.id)
if self.has_part("url"):
self.writeln(u" <label>%s</label>" % \
xmlquote(node.base_url))
xmlquote(node.base_url or u""))
self.writeln(u" <data>")
if self.has_part("result"):
self.writeln(u" <valid>%d</valid>" % \

View file

@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: calvin@users.sourceforge.net\n"
"POT-Creation-Date: 2005-07-04 01:39+0200\n"
"POT-Creation-Date: 2005-07-04 22:27+0200\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
@ -59,15 +59,15 @@ msgstr ""
msgid "%s URL ignored."
msgstr ""
#: ../linkcheck/checker/fileurl.py:117
#: ../linkcheck/checker/fileurl.py:119
msgid "Added trailing slash to directory."
msgstr ""
#: ../linkcheck/checker/fileurl.py:127
#: ../linkcheck/checker/fileurl.py:129
msgid "directory"
msgstr ""
#: ../linkcheck/checker/fileurl.py:143
#: ../linkcheck/checker/fileurl.py:145
#, python-format
msgid ""
"The URL path %r is not the same as the system path %r. You should always use "
@ -109,16 +109,16 @@ msgstr ""
msgid "Python %s on %s"
msgstr ""
#: ../linkcheck/checker/urlbase.py:290
#: ../linkcheck/checker/urlbase.py:291
msgid "URL is empty"
msgstr ""
#: ../linkcheck/checker/urlbase.py:297
#: ../linkcheck/checker/urlbase.py:298
#, python-format
msgid "Effective URL %r."
msgstr ""
#: ../linkcheck/checker/urlbase.py:313
#: ../linkcheck/checker/urlbase.py:314
#, python-format
msgid ""
"URL %r has a unicode domain name which\n"
@ -126,51 +126,51 @@ msgid ""
" the URL %r instead."
msgstr ""
#: ../linkcheck/checker/urlbase.py:318
#: ../linkcheck/checker/urlbase.py:319
#, python-format
msgid "Base URL is not properly normed. Normed URL is %(url)s."
msgstr ""
#: ../linkcheck/checker/urlbase.py:354
#: ../linkcheck/checker/urlbase.py:359
#, python-format
msgid "URL has invalid port %r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:388
#: ../linkcheck/checker/urlbase.py:393
#, python-format
msgid "URL is located in %s."
msgstr ""
#: ../linkcheck/checker/urlbase.py:403 ../linkcheck/checker/httpurl.py:289
#: ../linkcheck/checker/urlbase.py:408 ../linkcheck/checker/httpurl.py:289
#: ../linkcheck/checker/ignoredurl.py:34
msgid "Outside of domain filter, checked only syntax."
msgstr ""
#: ../linkcheck/checker/urlbase.py:419
#: ../linkcheck/checker/urlbase.py:424
msgid "Hostname not found"
msgstr ""
#: ../linkcheck/checker/urlbase.py:422
#: ../linkcheck/checker/urlbase.py:427
#, python-format
msgid "Bad HTTP response %r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:453
#: ../linkcheck/checker/urlbase.py:458
#, python-format
msgid "could not parse content: %r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:537
#: ../linkcheck/checker/urlbase.py:542
#, python-format
msgid "Anchor #%s not found."
msgstr ""
#: ../linkcheck/checker/urlbase.py:589
#: ../linkcheck/checker/urlbase.py:594
#, python-format
msgid "Found %r in link contents."
msgstr ""
#: ../linkcheck/checker/urlbase.py:598
#: ../linkcheck/checker/urlbase.py:603
#, python-format
msgid "Content size %s is larger than %s."
msgstr ""