try to detect unknown URL schemes instead of manually setting the assume_local flag

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3609 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2007-11-28 18:46:50 +00:00
parent a50784042f
commit fcde8bd4d6
11 changed files with 81 additions and 77 deletions

View file

@ -36,6 +36,12 @@
Changed: linkcheck/linkparse.py, linkcheck/checker/urlbase.py
Closes: SF bug #1831900
* Try to detect unknown URL schemes from the command line, e.g. URLs
like "rtsp://foo".
Type: feature
Changed: linkchecker, linkcheck/lc_cgi.py,
linkcheck/checker/{__init__,urlbase,httpurl,unknownurl}.py
4.7 "300" (released 17.6.2007)
* Mention in the documentation that --anchors enables logging of

View file

@ -151,7 +151,7 @@ def absolute_url (base_url, base_ref, parent_url):
def get_url_from (base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=0, column=0,
name=u"", assume_local=False):
name=u""):
"""
Get url data from given base data.
@ -180,13 +180,13 @@ def get_url_from (base_url, recursion_level, aggregate,
base_ref = unicode_safe(base_ref)
name = unicode_safe(name)
url = absolute_url(base_url, base_ref, parent_url).lower()
klass = get_urlclass_from(url, assume_local)
klass = get_urlclass_from(url)
return klass(base_url, recursion_level, aggregate,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
def get_urlclass_from (url, assume_local):
def get_urlclass_from (url):
"""Return checker class for given URL."""
if url.startswith("http:"):
klass = linkcheck.checker.httpurl.HttpUrl
@ -206,12 +206,12 @@ def get_urlclass_from (url, assume_local):
url.startswith("news:") or \
url.startswith("snews:"):
klass = linkcheck.checker.nntpurl.NntpUrl
elif assume_local:
# assume local file
klass = linkcheck.checker.fileurl.FileUrl
else:
elif linkcheck.checker.unknownurl.is_unknown_url(url):
# unknown url
klass = linkcheck.checker.unknownurl.UnknownUrl
else:
# assume local file
klass = linkcheck.checker.fileurl.FileUrl
return klass

View file

@ -379,8 +379,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
newobj = linkcheck.checker.get_url_from(
redirected, self.recursion_level, self.aggregate,
parent_url=self.parent_url, base_ref=self.base_ref,
line=self.line, column=self.column, name=self.name,
assume_local=False)
line=self.line, column=self.column, name=self.name)
# append new object to queue
self.aggregate.urlqueue.put(newobj)
# pretend to be finished and logged

View file

@ -155,6 +155,16 @@ class LinkCheckTest (unittest.TestCase):
"""
return linkcheck.url.url_norm(url)[0]
def get_attrs (self, **kwargs):
"""Return current and data directory as dictionary.
You can augment the dict with keyword attributes."""
d = {
'curdir': linkcheck.checker.tests.get_file_url(os.getcwd()),
'datadir': "linkcheck/checker/tests/data",
}
d.update(kwargs)
return d
def get_resultlines (self, filename):
"""
Return contents of file, as list of lines without line endings,
@ -170,7 +180,7 @@ class LinkCheckTest (unittest.TestCase):
f.close()
return resultlines
def file_test (self, filename, confargs=None, assume_local=True):
def file_test (self, filename, confargs=None):
"""
Check <filename> with expected result in <filename>.result.
"""
@ -179,9 +189,9 @@ class LinkCheckTest (unittest.TestCase):
confargs = {}
logargs = {'expected': self.get_resultlines(filename)}
aggregate = get_test_aggregate(confargs, logargs)
url_data = get_url_from(url, 0, aggregate, assume_local=assume_local)
if assume_local:
linkcheck.add_intern_pattern(url_data, aggregate.config)
url_data = get_url_from(url, 0, aggregate)
# XXX if assume_local
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
diff = aggregate.config['logger'].diff
@ -192,7 +202,7 @@ class LinkCheckTest (unittest.TestCase):
self.fail(l.encode("iso8859-1", "ignore"))
def direct (self, url, resultlines, parts=None, recursionlevel=0,
confargs=None, assume_local=False):
confargs=None):
"""
Check url with expected result.
"""
@ -205,9 +215,9 @@ class LinkCheckTest (unittest.TestCase):
if parts is not None:
logargs['parts'] = parts
aggregate = get_test_aggregate(confargs, logargs)
url_data = get_url_from(url, 0, aggregate, assume_local=assume_local)
if assume_local:
linkcheck.add_intern_pattern(url_data, aggregate.config)
url_data = get_url_from(url, 0, aggregate)
# XXX if assume_local:
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
diff = aggregate.config['logger'].diff

View file

@ -34,11 +34,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
Unrecognized scheme test.
"""
url = u"hutzli:"
nurl = self.norm(url)
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %s" % url,
u"cache key %s" % url,
u"real url %s" % nurl,
u"url %(nurl)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"error",
]
self.direct(url, resultlines)
@ -48,22 +49,22 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
Leading whitespace test.
"""
url = u" http://www.heise.de/"
nurl = self.norm(url)
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %s" % url,
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
u"url %(nurl)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"error",
]
self.direct(url, resultlines)
url = u"\nhttp://www.heise.de/"
nurl = self.norm(url)
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %s" % url,
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
u"url %(nurl)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"error",
]
self.direct(url, resultlines)
@ -96,12 +97,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
def test_invalid1 (self):
# invalid scheme chars
url = u"äöü?:"
nurl = self.norm(url)
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %s" % url,
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
u"url %(nurl)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"error",
]
self.direct(url, resultlines)
@ -109,12 +110,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
def test_invalid2 (self):
# missing scheme altogether
url = u"?äöü?"
nurl = self.norm(url)
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %s" % url,
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
u"url %(nurl)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"error",
]
self.direct(url, resultlines)
@ -122,12 +123,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
def test_invalid3 (self):
# severely malformed URL
url = u"@³²¼][½ ³@] ¬½"
nurl = self.norm(url)
attrs = self.get_attrs(url=url)
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
resultlines = [
u"url %s" % url,
u"cache key %s" % nurl,
u"real url %s" % nurl,
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
u"url %(nurl)s" % attrs,
u"cache key %(nurl)s" % attrs,
u"real url %(nurl)s" % attrs,
u"error",
]
self.direct(url, resultlines)

View file

@ -23,13 +23,6 @@ import os
import linkcheck.checker.tests
def get_attrs ():
return {
'curdir': linkcheck.checker.tests.get_file_url(os.getcwd()),
'datadir': "linkcheck/checker/tests/data",
}
class TestFile (linkcheck.checker.tests.LinkCheckTest):
"""
Test file:// link checking (and file content parsing).
@ -66,8 +59,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest):
self.file_test("urllist.txt")
def test_good_file (self):
attrs = get_attrs()
url = u"file://%(curdir)s/%(datadir)s/file.txt" % attrs
url = u"file://%(curdir)s/%(datadir)s/file.txt" % self.get_attrs()
nurl = self.norm(url)
resultlines = [
u"url %s" % url,
@ -83,8 +75,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest):
# Cause: url get quoted %7C which gets lowercased to
# %7c and this fails.
return
attrs = get_attrs()
url = u"file:/%(curdir)s/%(datadir)s/file.txt" % attrs
url = u"file:/%(curdir)s/%(datadir)s/file.txt" % self.get_attrs()
nurl = self.norm(url)
resultlines = [
u"url %s" % url,
@ -96,7 +87,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest):
def test_good_file_missing_dslash (self):
# good file (missing double slash)
attrs = get_attrs()
attrs = self.get_attrs()
url = u"file:%(curdir)s/%(datadir)s/file.txt" % attrs
nurl = self.norm(url)
resultlines = [
@ -109,8 +100,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest):
self.direct(url, resultlines)
def test_good_dir (self):
attrs = get_attrs()
url = u"file://%(curdir)s/%(datadir)s/" % attrs
url = u"file://%(curdir)s/%(datadir)s/" % self.get_attrs()
resultlines = [
u"url %s" % url,
u"cache key %s" % url,

View file

@ -36,7 +36,7 @@ class TestHttp (httptest.HttpServerTest):
url = u"http://localhost:%d/linkcheck/checker/tests/data/" \
u"http.html" % self.port
resultlines = self.get_resultlines("http.html")
self.direct(url, resultlines, recursionlevel=1, assume_local=True)
self.direct(url, resultlines, recursionlevel=1)
self.redirect1_http_test()
self.redirect2_http_test()
self.robots_txt_test()
@ -69,7 +69,7 @@ class TestHttp (httptest.HttpServerTest):
u"real url %s" % rurl,
u"error",
]
self.direct(url, resultlines, recursionlevel=0, assume_local=True)
self.direct(url, resultlines, recursionlevel=0)
def redirect1_http_test (self):
url = u"http://localhost:%d/redirect1" % self.port
@ -82,7 +82,7 @@ class TestHttp (httptest.HttpServerTest):
u"info Redirected to %s." % rurl,
u"error",
]
self.direct(url, resultlines, recursionlevel=0, assume_local=True)
self.direct(url, resultlines, recursionlevel=0)
def redirect2_http_test (self):
url = u"http://localhost:%d/linkcheck/checker/tests/data/redirect.html" % \
@ -101,7 +101,7 @@ class TestHttp (httptest.HttpServerTest):
u"name Recursive Redirect",
u"valid",
]
self.direct(url, resultlines, recursionlevel=99, assume_local=True)
self.direct(url, resultlines, recursionlevel=99)
def robots_txt_test (self):
url = u"http://localhost:%d/robots.txt" % self.port
@ -111,7 +111,7 @@ class TestHttp (httptest.HttpServerTest):
u"real url %s" % url,
u"valid",
]
self.direct(url, resultlines, recursionlevel=5, assume_local=True)
self.direct(url, resultlines, recursionlevel=5)
def robots_txt2_test (self):
url = u"http://localhost:%d/secret" % self.port
@ -122,7 +122,7 @@ class TestHttp (httptest.HttpServerTest):
u"warning Access denied by robots.txt, checked only syntax.",
u"valid",
]
self.direct(url, resultlines, recursionlevel=5, assume_local=True)
self.direct(url, resultlines, recursionlevel=5)
def noproxyfor_test (self):
"""
@ -141,7 +141,7 @@ class TestHttp (httptest.HttpServerTest):
u"valid",
]
self.direct(url, resultlines, recursionlevel=0,
confargs=confargs, assume_local=True)
confargs=confargs)
del os.environ["http_proxy"]

View file

@ -60,6 +60,8 @@ acap # application configuration access protocol
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
is_unknown_url = ignored_schemes_re.search
class UnknownUrl (urlbase.UrlBase):
"""

View file

@ -672,8 +672,7 @@ class UrlBase (object):
base_ref = linkcheck.url.url_norm(base_ref)[0]
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.aggregate, parent_url=self.url,
base_ref=base_ref, line=line, column=column, name=name,
assume_local=False)
base_ref=base_ref, line=line, column=column, name=name)
self.aggregate.urlqueue.put(url_data)
def parse_opera (self):
@ -694,8 +693,7 @@ class UrlBase (object):
if url:
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, name=name,
assume_local=False)
parent_url=self.url, line=lineno, name=name)
self.aggregate.urlqueue.put(url_data)
name = ""
@ -714,8 +712,7 @@ class UrlBase (object):
continue
url_data = linkcheck.checker.get_url_from(line,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno,
assume_local=False)
parent_url=self.url, line=lineno)
self.aggregate.urlqueue.put(url_data)
def parse_css (self):
@ -734,8 +731,7 @@ class UrlBase (object):
url = linkcheck.strformat.unquote(mo.group("url").strip())
url_data = linkcheck.checker.get_url_from(url,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, column=column,
assume_local=False)
parent_url=self.url, line=lineno, column=column)
self.aggregate.urlqueue.put(url_data)
def serialized (self):

View file

@ -95,7 +95,7 @@ def checklink (out=sys.stdout, form=None, env=os.environ):
aggregate = linkcheck.director.get_aggregate(config)
get_url_from = linkcheck.checker.get_url_from
url = form["url"].value
url_data = get_url_from(url, 0, aggregate, assume_local=False)
url_data = get_url_from(url, 0, aggregate)
try:
linkcheck.add_intern_pattern(url_data, config)
except UnicodeError:

View file

@ -722,7 +722,7 @@ for url in args:
elif url.lower().startswith("ftp."):
# syntactic sugar
url = "ftp://%s" % url
url_data = get_url_from(url, 0, aggregate, assume_local=True)
url_data = get_url_from(url, 0, aggregate)
try:
linkcheck.add_intern_pattern(url_data, config)
except UnicodeError: