mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-24 01:40:23 +00:00
try to detect unknown URL schemes instead of manually setting the assume_local flag
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3609 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
a50784042f
commit
fcde8bd4d6
11 changed files with 81 additions and 77 deletions
|
|
@ -36,6 +36,12 @@
|
|||
Changed: linkcheck/linkparse.py, linkcheck/checker/urlbase.py
|
||||
Closes: SF bug #1831900
|
||||
|
||||
* Try to detect unknown URL schemes from the command line, e.g. URLs
|
||||
like "rtsp://foo".
|
||||
Type: feature
|
||||
Changed: linkchecker, linkcheck/lc_cgi.py,
|
||||
linkcheck/checker/{__init__,urlbase,httpurl,unknownurl}.py
|
||||
|
||||
4.7 "300" (released 17.6.2007)
|
||||
|
||||
* Mention in the documentation that --anchors enables logging of
|
||||
|
|
|
|||
|
|
@ -151,7 +151,7 @@ def absolute_url (base_url, base_ref, parent_url):
|
|||
|
||||
def get_url_from (base_url, recursion_level, aggregate,
|
||||
parent_url=None, base_ref=None, line=0, column=0,
|
||||
name=u"", assume_local=False):
|
||||
name=u""):
|
||||
"""
|
||||
Get url data from given base data.
|
||||
|
||||
|
|
@ -180,13 +180,13 @@ def get_url_from (base_url, recursion_level, aggregate,
|
|||
base_ref = unicode_safe(base_ref)
|
||||
name = unicode_safe(name)
|
||||
url = absolute_url(base_url, base_ref, parent_url).lower()
|
||||
klass = get_urlclass_from(url, assume_local)
|
||||
klass = get_urlclass_from(url)
|
||||
return klass(base_url, recursion_level, aggregate,
|
||||
parent_url=parent_url, base_ref=base_ref,
|
||||
line=line, column=column, name=name)
|
||||
|
||||
|
||||
def get_urlclass_from (url, assume_local):
|
||||
def get_urlclass_from (url):
|
||||
"""Return checker class for given URL."""
|
||||
if url.startswith("http:"):
|
||||
klass = linkcheck.checker.httpurl.HttpUrl
|
||||
|
|
@ -206,12 +206,12 @@ def get_urlclass_from (url, assume_local):
|
|||
url.startswith("news:") or \
|
||||
url.startswith("snews:"):
|
||||
klass = linkcheck.checker.nntpurl.NntpUrl
|
||||
elif assume_local:
|
||||
# assume local file
|
||||
klass = linkcheck.checker.fileurl.FileUrl
|
||||
else:
|
||||
elif linkcheck.checker.unknownurl.is_unknown_url(url):
|
||||
# unknown url
|
||||
klass = linkcheck.checker.unknownurl.UnknownUrl
|
||||
else:
|
||||
# assume local file
|
||||
klass = linkcheck.checker.fileurl.FileUrl
|
||||
return klass
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -379,8 +379,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
newobj = linkcheck.checker.get_url_from(
|
||||
redirected, self.recursion_level, self.aggregate,
|
||||
parent_url=self.parent_url, base_ref=self.base_ref,
|
||||
line=self.line, column=self.column, name=self.name,
|
||||
assume_local=False)
|
||||
line=self.line, column=self.column, name=self.name)
|
||||
# append new object to queue
|
||||
self.aggregate.urlqueue.put(newobj)
|
||||
# pretend to be finished and logged
|
||||
|
|
|
|||
|
|
@ -155,6 +155,16 @@ class LinkCheckTest (unittest.TestCase):
|
|||
"""
|
||||
return linkcheck.url.url_norm(url)[0]
|
||||
|
||||
def get_attrs (self, **kwargs):
|
||||
"""Return current and data directory as dictionary.
|
||||
You can augment the dict with keyword attributes."""
|
||||
d = {
|
||||
'curdir': linkcheck.checker.tests.get_file_url(os.getcwd()),
|
||||
'datadir': "linkcheck/checker/tests/data",
|
||||
}
|
||||
d.update(kwargs)
|
||||
return d
|
||||
|
||||
def get_resultlines (self, filename):
|
||||
"""
|
||||
Return contents of file, as list of lines without line endings,
|
||||
|
|
@ -170,7 +180,7 @@ class LinkCheckTest (unittest.TestCase):
|
|||
f.close()
|
||||
return resultlines
|
||||
|
||||
def file_test (self, filename, confargs=None, assume_local=True):
|
||||
def file_test (self, filename, confargs=None):
|
||||
"""
|
||||
Check <filename> with expected result in <filename>.result.
|
||||
"""
|
||||
|
|
@ -179,9 +189,9 @@ class LinkCheckTest (unittest.TestCase):
|
|||
confargs = {}
|
||||
logargs = {'expected': self.get_resultlines(filename)}
|
||||
aggregate = get_test_aggregate(confargs, logargs)
|
||||
url_data = get_url_from(url, 0, aggregate, assume_local=assume_local)
|
||||
if assume_local:
|
||||
linkcheck.add_intern_pattern(url_data, aggregate.config)
|
||||
url_data = get_url_from(url, 0, aggregate)
|
||||
# XXX if assume_local
|
||||
linkcheck.add_intern_pattern(url_data, aggregate.config)
|
||||
aggregate.urlqueue.put(url_data)
|
||||
linkcheck.director.check_urls(aggregate)
|
||||
diff = aggregate.config['logger'].diff
|
||||
|
|
@ -192,7 +202,7 @@ class LinkCheckTest (unittest.TestCase):
|
|||
self.fail(l.encode("iso8859-1", "ignore"))
|
||||
|
||||
def direct (self, url, resultlines, parts=None, recursionlevel=0,
|
||||
confargs=None, assume_local=False):
|
||||
confargs=None):
|
||||
"""
|
||||
Check url with expected result.
|
||||
"""
|
||||
|
|
@ -205,9 +215,9 @@ class LinkCheckTest (unittest.TestCase):
|
|||
if parts is not None:
|
||||
logargs['parts'] = parts
|
||||
aggregate = get_test_aggregate(confargs, logargs)
|
||||
url_data = get_url_from(url, 0, aggregate, assume_local=assume_local)
|
||||
if assume_local:
|
||||
linkcheck.add_intern_pattern(url_data, aggregate.config)
|
||||
url_data = get_url_from(url, 0, aggregate)
|
||||
# XXX if assume_local:
|
||||
linkcheck.add_intern_pattern(url_data, aggregate.config)
|
||||
aggregate.urlqueue.put(url_data)
|
||||
linkcheck.director.check_urls(aggregate)
|
||||
diff = aggregate.config['logger'].diff
|
||||
|
|
|
|||
|
|
@ -34,11 +34,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
|
|||
Unrecognized scheme test.
|
||||
"""
|
||||
url = u"hutzli:"
|
||||
nurl = self.norm(url)
|
||||
attrs = self.get_attrs(url=url)
|
||||
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % url,
|
||||
u"real url %s" % nurl,
|
||||
u"url %(nurl)s" % attrs,
|
||||
u"cache key %(nurl)s" % attrs,
|
||||
u"real url %(nurl)s" % attrs,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
|
|
@ -48,22 +49,22 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
|
|||
Leading whitespace test.
|
||||
"""
|
||||
url = u" http://www.heise.de/"
|
||||
nurl = self.norm(url)
|
||||
attrs = self.get_attrs(url=url)
|
||||
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
|
||||
u"url %(nurl)s" % attrs,
|
||||
u"cache key %(nurl)s" % attrs,
|
||||
u"real url %(nurl)s" % attrs,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
url = u"\nhttp://www.heise.de/"
|
||||
nurl = self.norm(url)
|
||||
attrs = self.get_attrs(url=url)
|
||||
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
|
||||
u"url %(nurl)s" % attrs,
|
||||
u"cache key %(nurl)s" % attrs,
|
||||
u"real url %(nurl)s" % attrs,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
|
|
@ -96,12 +97,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
|
|||
def test_invalid1 (self):
|
||||
# invalid scheme chars
|
||||
url = u"äöü?:"
|
||||
nurl = self.norm(url)
|
||||
attrs = self.get_attrs(url=url)
|
||||
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
|
||||
u"url %(nurl)s" % attrs,
|
||||
u"cache key %(nurl)s" % attrs,
|
||||
u"real url %(nurl)s" % attrs,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
|
|
@ -109,12 +110,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
|
|||
def test_invalid2 (self):
|
||||
# missing scheme altogether
|
||||
url = u"?äöü?"
|
||||
nurl = self.norm(url)
|
||||
attrs = self.get_attrs(url=url)
|
||||
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
|
||||
u"url %(nurl)s" % attrs,
|
||||
u"cache key %(nurl)s" % attrs,
|
||||
u"real url %(nurl)s" % attrs,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
|
|
@ -122,12 +123,12 @@ class TestError (linkcheck.checker.tests.LinkCheckTest):
|
|||
def test_invalid3 (self):
|
||||
# really fucked up
|
||||
url = u"@³²¼][½ ³@] ¬½"
|
||||
nurl = self.norm(url)
|
||||
attrs = self.get_attrs(url=url)
|
||||
attrs['nurl'] = self.norm("file://%(curdir)s/%(url)s" % attrs)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % nurl,
|
||||
u"real url %s" % nurl,
|
||||
u"warning Base URL is not properly normed. Normed URL is %s." % nurl,
|
||||
u"url %(nurl)s" % attrs,
|
||||
u"cache key %(nurl)s" % attrs,
|
||||
u"real url %(nurl)s" % attrs,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines)
|
||||
|
|
|
|||
|
|
@ -23,13 +23,6 @@ import os
|
|||
|
||||
import linkcheck.checker.tests
|
||||
|
||||
def get_attrs ():
|
||||
return {
|
||||
'curdir': linkcheck.checker.tests.get_file_url(os.getcwd()),
|
||||
'datadir': "linkcheck/checker/tests/data",
|
||||
}
|
||||
|
||||
|
||||
class TestFile (linkcheck.checker.tests.LinkCheckTest):
|
||||
"""
|
||||
Test file:// link checking (and file content parsing).
|
||||
|
|
@ -66,8 +59,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest):
|
|||
self.file_test("urllist.txt")
|
||||
|
||||
def test_good_file (self):
|
||||
attrs = get_attrs()
|
||||
url = u"file://%(curdir)s/%(datadir)s/file.txt" % attrs
|
||||
url = u"file://%(curdir)s/%(datadir)s/file.txt" % self.get_attrs()
|
||||
nurl = self.norm(url)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
|
|
@ -83,8 +75,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest):
|
|||
# Cause: url get quoted %7C which gets lowercased to
|
||||
# %7c and this fails.
|
||||
return
|
||||
attrs = get_attrs()
|
||||
url = u"file:/%(curdir)s/%(datadir)s/file.txt" % attrs
|
||||
url = u"file:/%(curdir)s/%(datadir)s/file.txt" % self.get_attrs()
|
||||
nurl = self.norm(url)
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
|
|
@ -96,7 +87,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest):
|
|||
|
||||
def test_good_file_missing_dslash (self):
|
||||
# good file (missing double slash)
|
||||
attrs = get_attrs()
|
||||
attrs = self.get_attrs()
|
||||
url = u"file:%(curdir)s/%(datadir)s/file.txt" % attrs
|
||||
nurl = self.norm(url)
|
||||
resultlines = [
|
||||
|
|
@ -109,8 +100,7 @@ class TestFile (linkcheck.checker.tests.LinkCheckTest):
|
|||
self.direct(url, resultlines)
|
||||
|
||||
def test_good_dir (self):
|
||||
attrs = get_attrs()
|
||||
url = u"file://%(curdir)s/%(datadir)s/" % attrs
|
||||
url = u"file://%(curdir)s/%(datadir)s/" % self.get_attrs()
|
||||
resultlines = [
|
||||
u"url %s" % url,
|
||||
u"cache key %s" % url,
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ class TestHttp (httptest.HttpServerTest):
|
|||
url = u"http://localhost:%d/linkcheck/checker/tests/data/" \
|
||||
u"http.html" % self.port
|
||||
resultlines = self.get_resultlines("http.html")
|
||||
self.direct(url, resultlines, recursionlevel=1, assume_local=True)
|
||||
self.direct(url, resultlines, recursionlevel=1)
|
||||
self.redirect1_http_test()
|
||||
self.redirect2_http_test()
|
||||
self.robots_txt_test()
|
||||
|
|
@ -69,7 +69,7 @@ class TestHttp (httptest.HttpServerTest):
|
|||
u"real url %s" % rurl,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=0, assume_local=True)
|
||||
self.direct(url, resultlines, recursionlevel=0)
|
||||
|
||||
def redirect1_http_test (self):
|
||||
url = u"http://localhost:%d/redirect1" % self.port
|
||||
|
|
@ -82,7 +82,7 @@ class TestHttp (httptest.HttpServerTest):
|
|||
u"info Redirected to %s." % rurl,
|
||||
u"error",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=0, assume_local=True)
|
||||
self.direct(url, resultlines, recursionlevel=0)
|
||||
|
||||
def redirect2_http_test (self):
|
||||
url = u"http://localhost:%d/linkcheck/checker/tests/data/redirect.html" % \
|
||||
|
|
@ -101,7 +101,7 @@ class TestHttp (httptest.HttpServerTest):
|
|||
u"name Recursive Redirect",
|
||||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=99, assume_local=True)
|
||||
self.direct(url, resultlines, recursionlevel=99)
|
||||
|
||||
def robots_txt_test (self):
|
||||
url = u"http://localhost:%d/robots.txt" % self.port
|
||||
|
|
@ -111,7 +111,7 @@ class TestHttp (httptest.HttpServerTest):
|
|||
u"real url %s" % url,
|
||||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=5, assume_local=True)
|
||||
self.direct(url, resultlines, recursionlevel=5)
|
||||
|
||||
def robots_txt2_test (self):
|
||||
url = u"http://localhost:%d/secret" % self.port
|
||||
|
|
@ -122,7 +122,7 @@ class TestHttp (httptest.HttpServerTest):
|
|||
u"warning Access denied by robots.txt, checked only syntax.",
|
||||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=5, assume_local=True)
|
||||
self.direct(url, resultlines, recursionlevel=5)
|
||||
|
||||
def noproxyfor_test (self):
|
||||
"""
|
||||
|
|
@ -141,7 +141,7 @@ class TestHttp (httptest.HttpServerTest):
|
|||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines, recursionlevel=0,
|
||||
confargs=confargs, assume_local=True)
|
||||
confargs=confargs)
|
||||
del os.environ["http_proxy"]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -60,6 +60,8 @@ acap # application configuration access protocol
|
|||
|
||||
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
|
||||
|
||||
is_unknown_url = ignored_schemes_re.search
|
||||
|
||||
|
||||
class UnknownUrl (urlbase.UrlBase):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -672,8 +672,7 @@ class UrlBase (object):
|
|||
base_ref = linkcheck.url.url_norm(base_ref)[0]
|
||||
url_data = linkcheck.checker.get_url_from(url,
|
||||
self.recursion_level+1, self.aggregate, parent_url=self.url,
|
||||
base_ref=base_ref, line=line, column=column, name=name,
|
||||
assume_local=False)
|
||||
base_ref=base_ref, line=line, column=column, name=name)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
|
||||
def parse_opera (self):
|
||||
|
|
@ -694,8 +693,7 @@ class UrlBase (object):
|
|||
if url:
|
||||
url_data = linkcheck.checker.get_url_from(url,
|
||||
self.recursion_level+1, self.aggregate,
|
||||
parent_url=self.url, line=lineno, name=name,
|
||||
assume_local=False)
|
||||
parent_url=self.url, line=lineno, name=name)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
name = ""
|
||||
|
||||
|
|
@ -714,8 +712,7 @@ class UrlBase (object):
|
|||
continue
|
||||
url_data = linkcheck.checker.get_url_from(line,
|
||||
self.recursion_level+1, self.aggregate,
|
||||
parent_url=self.url, line=lineno,
|
||||
assume_local=False)
|
||||
parent_url=self.url, line=lineno)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
|
||||
def parse_css (self):
|
||||
|
|
@ -734,8 +731,7 @@ class UrlBase (object):
|
|||
url = linkcheck.strformat.unquote(mo.group("url").strip())
|
||||
url_data = linkcheck.checker.get_url_from(url,
|
||||
self.recursion_level+1, self.aggregate,
|
||||
parent_url=self.url, line=lineno, column=column,
|
||||
assume_local=False)
|
||||
parent_url=self.url, line=lineno, column=column)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
|
||||
def serialized (self):
|
||||
|
|
|
|||
|
|
@ -95,7 +95,7 @@ def checklink (out=sys.stdout, form=None, env=os.environ):
|
|||
aggregate = linkcheck.director.get_aggregate(config)
|
||||
get_url_from = linkcheck.checker.get_url_from
|
||||
url = form["url"].value
|
||||
url_data = get_url_from(url, 0, aggregate, assume_local=False)
|
||||
url_data = get_url_from(url, 0, aggregate)
|
||||
try:
|
||||
linkcheck.add_intern_pattern(url_data, config)
|
||||
except UnicodeError:
|
||||
|
|
|
|||
|
|
@ -722,7 +722,7 @@ for url in args:
|
|||
elif url.lower().startswith("ftp."):
|
||||
# syntactic sugar
|
||||
url = "ftp://%s" % url
|
||||
url_data = get_url_from(url, 0, aggregate, assume_local=True)
|
||||
url_data = get_url_from(url, 0, aggregate)
|
||||
try:
|
||||
linkcheck.add_intern_pattern(url_data, config)
|
||||
except UnicodeError:
|
||||
|
|
|
|||
Loading…
Reference in a new issue