diff --git a/ChangeLog b/ChangeLog index 7c2492f9..b5094fb9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -36,6 +36,12 @@ Changed: linkcheck/linkparse.py, linkcheck/checker/urlbase.py Closes: SF bug #1831900 + * Try to detect unknown URL schemes from the command line, e.g. URLs + like "rtsp://foo". + Type: feature + Changed: linkchecker, linkcheck/lc_cgi.py, + linkcheck/checker/{__init__,urlbase,httpurl,unknownurl}.py + 4.7 "300" (released 17.6.2007) * Mention in the documentation that --anchors enables logging of diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index c9a7cada..3db4f3d3 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -151,7 +151,7 @@ def absolute_url (base_url, base_ref, parent_url): def get_url_from (base_url, recursion_level, aggregate, parent_url=None, base_ref=None, line=0, column=0, - name=u"", assume_local=False): + name=u""): """ Get url data from given base data. @@ -180,13 +180,13 @@ def get_url_from (base_url, recursion_level, aggregate, base_ref = unicode_safe(base_ref) name = unicode_safe(name) url = absolute_url(base_url, base_ref, parent_url).lower() - klass = get_urlclass_from(url, assume_local) + klass = get_urlclass_from(url) return klass(base_url, recursion_level, aggregate, parent_url=parent_url, base_ref=base_ref, line=line, column=column, name=name) -def get_urlclass_from (url, assume_local): +def get_urlclass_from (url): """Return checker class for given URL.""" if url.startswith("http:"): klass = linkcheck.checker.httpurl.HttpUrl @@ -206,12 +206,12 @@ def get_urlclass_from (url, assume_local): url.startswith("news:") or \ url.startswith("snews:"): klass = linkcheck.checker.nntpurl.NntpUrl - elif assume_local: - # assume local file - klass = linkcheck.checker.fileurl.FileUrl - else: + elif linkcheck.checker.unknownurl.is_unknown_url(url): # unknown url klass = linkcheck.checker.unknownurl.UnknownUrl + else: + # assume local file + klass = linkcheck.checker.fileurl.FileUrl 
return klass diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 95b0c1e7..dc98f1fd 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -379,8 +379,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): newobj = linkcheck.checker.get_url_from( redirected, self.recursion_level, self.aggregate, parent_url=self.parent_url, base_ref=self.base_ref, - line=self.line, column=self.column, name=self.name, - assume_local=False) + line=self.line, column=self.column, name=self.name) # append new object to queue self.aggregate.urlqueue.put(newobj) # pretend to be finished and logged diff --git a/linkcheck/checker/tests/__init__.py b/linkcheck/checker/tests/__init__.py index c0bfbac0..84c83bc0 100644 --- a/linkcheck/checker/tests/__init__.py +++ b/linkcheck/checker/tests/__init__.py @@ -155,6 +155,16 @@ class LinkCheckTest (unittest.TestCase): """ return linkcheck.url.url_norm(url)[0] + def get_attrs (self, **kwargs): + """Return current and data directory as dictionary. + You can augment the dict with keyword attributes.""" + d = { + 'curdir': linkcheck.checker.tests.get_file_url(os.getcwd()), + 'datadir': "linkcheck/checker/tests/data", + } + d.update(kwargs) + return d + def get_resultlines (self, filename): """ Return contents of file, as list of lines without line endings, @@ -170,7 +180,7 @@ class LinkCheckTest (unittest.TestCase): f.close() return resultlines - def file_test (self, filename, confargs=None, assume_local=True): + def file_test (self, filename, confargs=None): """ Check with expected result in .result. 
""" @@ -179,9 +189,9 @@ class LinkCheckTest (unittest.TestCase): confargs = {} logargs = {'expected': self.get_resultlines(filename)} aggregate = get_test_aggregate(confargs, logargs) - url_data = get_url_from(url, 0, aggregate, assume_local=assume_local) - if assume_local: - linkcheck.add_intern_pattern(url_data, aggregate.config) + url_data = get_url_from(url, 0, aggregate) + # XXX if assume_local + linkcheck.add_intern_pattern(url_data, aggregate.config) aggregate.urlqueue.put(url_data) linkcheck.director.check_urls(aggregate) diff = aggregate.config['logger'].diff @@ -192,7 +202,7 @@ class LinkCheckTest (unittest.TestCase): self.fail(l.encode("iso8859-1", "ignore")) def direct (self, url, resultlines, parts=None, recursionlevel=0, - confargs=None, assume_local=False): + confargs=None): """ Check url with expected result. """ @@ -205,9 +215,9 @@ class LinkCheckTest (unittest.TestCase): if parts is not None: logargs['parts'] = parts aggregate = get_test_aggregate(confargs, logargs) - url_data = get_url_from(url, 0, aggregate, assume_local=assume_local) - if assume_local: - linkcheck.add_intern_pattern(url_data, aggregate.config) + url_data = get_url_from(url, 0, aggregate) + # XXX if assume_local: + linkcheck.add_intern_pattern(url_data, aggregate.config) aggregate.urlqueue.put(url_data) linkcheck.director.check_urls(aggregate) diff = aggregate.config['logger'].diff diff --git a/linkcheck/checker/tests/test_error.py b/linkcheck/checker/tests/test_error.py index e94d9fa8..4f3505a0 100644 --- a/linkcheck/checker/tests/test_error.py +++ b/linkcheck/checker/tests/test_error.py @@ -34,11 +34,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest): Unrecognized scheme test. 
""" url = u"hutzli:" - nurl = self.norm(url) + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %s" % url, - u"cache key %s" % url, - u"real url %s" % nurl, + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, u"error", ] self.direct(url, resultlines) @@ -48,22 +49,22 @@ class TestError (linkcheck.checker.tests.LinkCheckTest): Leading whitespace test. """ url = u" http://www.heise.de/" - nurl = self.norm(url) + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %s" % url, - u"cache key %s" % nurl, - u"real url %s" % nurl, - u"warning Base URL is not properly normed. Normed URL is %s." % nurl, + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, u"error", ] self.direct(url, resultlines) url = u"\nhttp://www.heise.de/" - nurl = self.norm(url) + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %s" % url, - u"cache key %s" % nurl, - u"real url %s" % nurl, - u"warning Base URL is not properly normed. Normed URL is %s." % nurl, + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, u"error", ] self.direct(url, resultlines) @@ -96,12 +97,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest): def test_invalid1 (self): # invalid scheme chars url = u"äöü?:" - nurl = self.norm(url) + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %s" % url, - u"cache key %s" % nurl, - u"real url %s" % nurl, - u"warning Base URL is not properly normed. Normed URL is %s." 
% nurl, + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, u"error", ] self.direct(url, resultlines) @@ -109,12 +110,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest): def test_invalid2 (self): # missing scheme alltogether url = u"?äöü?" - nurl = self.norm(url) + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %s" % url, - u"cache key %s" % nurl, - u"real url %s" % nurl, - u"warning Base URL is not properly normed. Normed URL is %s." % nurl, + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, u"error", ] self.direct(url, resultlines) @@ -122,12 +123,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest): def test_invalid3 (self): # really fucked up url = u"@³²¼][½ ³@] ¬½" - nurl = self.norm(url) + attrs = self.get_attrs(url=url) + attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs) resultlines = [ - u"url %s" % url, - u"cache key %s" % nurl, - u"real url %s" % nurl, - u"warning Base URL is not properly normed. Normed URL is %s." % nurl, + u"url %(nurl)s" % attrs, + u"cache key %(nurl)s" % attrs, + u"real url %(nurl)s" % attrs, u"error", ] self.direct(url, resultlines) diff --git a/linkcheck/checker/tests/test_file.py b/linkcheck/checker/tests/test_file.py index ed1b7168..965f06d8 100644 --- a/linkcheck/checker/tests/test_file.py +++ b/linkcheck/checker/tests/test_file.py @@ -23,13 +23,6 @@ import os import linkcheck.checker.tests -def get_attrs (): - return { - 'curdir': linkcheck.checker.tests.get_file_url(os.getcwd()), - 'datadir': "linkcheck/checker/tests/data", - } - - class TestFile (linkcheck.checker.tests.LinkCheckTest): """ Test file:// link checking (and file content parsing). 
@@ -66,8 +59,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest): self.file_test("urllist.txt") def test_good_file (self): - attrs = get_attrs() - url = u"file://%(curdir)s/%(datadir)s/file.txt" % attrs + url = u"file://%(curdir)s/%(datadir)s/file.txt" % self.get_attrs() nurl = self.norm(url) resultlines = [ u"url %s" % url, @@ -83,8 +75,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest): # Cause: url get quoted %7C which gets lowercased to # %7c and this fails. return - attrs = get_attrs() - url = u"file:/%(curdir)s/%(datadir)s/file.txt" % attrs + url = u"file:/%(curdir)s/%(datadir)s/file.txt" % self.get_attrs() nurl = self.norm(url) resultlines = [ u"url %s" % url, @@ -96,7 +87,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest): def test_good_file_missing_dslash (self): # good file (missing double slash) - attrs = get_attrs() + attrs = self.get_attrs() url = u"file:%(curdir)s/%(datadir)s/file.txt" % attrs nurl = self.norm(url) resultlines = [ @@ -109,8 +100,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest): self.direct(url, resultlines) def test_good_dir (self): - attrs = get_attrs() - url = u"file://%(curdir)s/%(datadir)s/" % attrs + url = u"file://%(curdir)s/%(datadir)s/" % self.get_attrs() resultlines = [ u"url %s" % url, u"cache key %s" % url, diff --git a/linkcheck/checker/tests/test_http.py b/linkcheck/checker/tests/test_http.py index fde593fc..6262afda 100644 --- a/linkcheck/checker/tests/test_http.py +++ b/linkcheck/checker/tests/test_http.py @@ -36,7 +36,7 @@ class TestHttp (httptest.HttpServerTest): url = u"http://localhost:%d/linkcheck/checker/tests/data/" \ u"http.html" % self.port resultlines = self.get_resultlines("http.html") - self.direct(url, resultlines, recursionlevel=1, assume_local=True) + self.direct(url, resultlines, recursionlevel=1) self.redirect1_http_test() self.redirect2_http_test() self.robots_txt_test() @@ -69,7 +69,7 @@ class TestHttp (httptest.HttpServerTest): u"real url %s" % rurl, u"error", ] - 
self.direct(url, resultlines, recursionlevel=0, assume_local=True) + self.direct(url, resultlines, recursionlevel=0) def redirect1_http_test (self): url = u"http://localhost:%d/redirect1" % self.port @@ -82,7 +82,7 @@ class TestHttp (httptest.HttpServerTest): u"info Redirected to %s." % rurl, u"error", ] - self.direct(url, resultlines, recursionlevel=0, assume_local=True) + self.direct(url, resultlines, recursionlevel=0) def redirect2_http_test (self): url = u"http://localhost:%d/linkcheck/checker/tests/data/redirect.html" % \ @@ -101,7 +101,7 @@ class TestHttp (httptest.HttpServerTest): u"name Recursive Redirect", u"valid", ] - self.direct(url, resultlines, recursionlevel=99, assume_local=True) + self.direct(url, resultlines, recursionlevel=99) def robots_txt_test (self): url = u"http://localhost:%d/robots.txt" % self.port @@ -111,7 +111,7 @@ class TestHttp (httptest.HttpServerTest): u"real url %s" % url, u"valid", ] - self.direct(url, resultlines, recursionlevel=5, assume_local=True) + self.direct(url, resultlines, recursionlevel=5) def robots_txt2_test (self): url = u"http://localhost:%d/secret" % self.port @@ -122,7 +122,7 @@ class TestHttp (httptest.HttpServerTest): u"warning Access denied by robots.txt, checked only syntax.", u"valid", ] - self.direct(url, resultlines, recursionlevel=5, assume_local=True) + self.direct(url, resultlines, recursionlevel=5) def noproxyfor_test (self): """ @@ -141,7 +141,7 @@ class TestHttp (httptest.HttpServerTest): u"valid", ] self.direct(url, resultlines, recursionlevel=0, - confargs=confargs, assume_local=True) + confargs=confargs) del os.environ["http_proxy"] diff --git a/linkcheck/checker/unknownurl.py b/linkcheck/checker/unknownurl.py index f4722a26..00f69712 100644 --- a/linkcheck/checker/unknownurl.py +++ b/linkcheck/checker/unknownurl.py @@ -60,6 +60,8 @@ acap # application configuration access protocol ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE) +is_unknown_url = ignored_schemes_re.search + class 
UnknownUrl (urlbase.UrlBase): """ diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 2ef1f52d..7cc43113 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -672,8 +672,7 @@ class UrlBase (object): base_ref = linkcheck.url.url_norm(base_ref)[0] url_data = linkcheck.checker.get_url_from(url, self.recursion_level+1, self.aggregate, parent_url=self.url, - base_ref=base_ref, line=line, column=column, name=name, - assume_local=False) + base_ref=base_ref, line=line, column=column, name=name) self.aggregate.urlqueue.put(url_data) def parse_opera (self): @@ -694,8 +693,7 @@ class UrlBase (object): if url: url_data = linkcheck.checker.get_url_from(url, self.recursion_level+1, self.aggregate, - parent_url=self.url, line=lineno, name=name, - assume_local=False) + parent_url=self.url, line=lineno, name=name) self.aggregate.urlqueue.put(url_data) name = "" @@ -714,8 +712,7 @@ class UrlBase (object): continue url_data = linkcheck.checker.get_url_from(line, self.recursion_level+1, self.aggregate, - parent_url=self.url, line=lineno, - assume_local=False) + parent_url=self.url, line=lineno) self.aggregate.urlqueue.put(url_data) def parse_css (self): @@ -734,8 +731,7 @@ class UrlBase (object): url = linkcheck.strformat.unquote(mo.group("url").strip()) url_data = linkcheck.checker.get_url_from(url, self.recursion_level+1, self.aggregate, - parent_url=self.url, line=lineno, column=column, - assume_local=False) + parent_url=self.url, line=lineno, column=column) self.aggregate.urlqueue.put(url_data) def serialized (self): diff --git a/linkcheck/lc_cgi.py b/linkcheck/lc_cgi.py index 64795ae9..aa0514a5 100644 --- a/linkcheck/lc_cgi.py +++ b/linkcheck/lc_cgi.py @@ -95,7 +95,7 @@ def checklink (out=sys.stdout, form=None, env=os.environ): aggregate = linkcheck.director.get_aggregate(config) get_url_from = linkcheck.checker.get_url_from url = form["url"].value - url_data = get_url_from(url, 0, aggregate, assume_local=False) + url_data = 
get_url_from(url, 0, aggregate) try: linkcheck.add_intern_pattern(url_data, config) except UnicodeError: diff --git a/linkchecker b/linkchecker index a4b2a991..15213747 100755 --- a/linkchecker +++ b/linkchecker @@ -722,7 +722,7 @@ for url in args: elif url.lower().startswith("ftp."): # syntactic sugar url = "ftp://%s" % url - url_data = get_url_from(url, 0, aggregate, assume_local=True) + url_data = get_url_from(url, 0, aggregate) try: linkcheck.add_intern_pattern(url_data, config) except UnicodeError: