diff --git a/TODO b/TODO
index f7c02b9d..3e06114d 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,4 @@
+Don't assume .html on local files: guess the MIME type, parse URIs
Check why threaded app wont exit resp. is stalled
Another Profiling roundup
Named constants for ANSI Color codes
diff --git a/debian/changelog b/debian/changelog
index 582f769e..7631863d 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,12 @@
+linkchecker (1.3.23) unstable; urgency=low
+
+ * linkcheck/linkname.py: work around a regex matching bug with
+ re.DOTALL. This could result in href="" names not being found correctly.
+ * linkcheck/linkname.py: immediately return on
tags inside .
+ * linkchecker: interpolate %s in help text
+
+ -- Bastian Kleineidam Wed, 13 Mar 2002 21:31:57 +0100
+
linkchecker (1.3.22) unstable; urgency=low
* last release before 1.4.0
diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py
index 29eb23af..0bbba12e 100644
--- a/linkcheck/FileUrlData.py
+++ b/linkcheck/FileUrlData.py
@@ -21,12 +21,69 @@ from UrlData import UrlData, ExcList
# OSError is thrown on Windows when a file is not found
ExcList.append(OSError)
-html_re = re.compile(r'(?i)\.s?html?$')
-html_content_re = re.compile(r'(?i).*')
-opera_re = re.compile(r'^(?i)opera.adr$')
-opera_content_re = re.compile(r'(?i)Opera Hotlist')
+# file extensions we can parse recursively
+extensions = {
+ "html": r'(?i)\.s?html?$',
+ "opera": r'^(?i)opera.adr$', # opera bookmark file
+ "text": r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$',
+}
+for key in extensions.keys():
+ extensions[key] = re.compile(extensions[key])
-class FileUrlData(UrlData):
+# if file extension was fruitless, look at the content
+contents = {
+ "html": r'(?i).*',
+ "opera" : r'Opera Hotlist',
+ "text" : r'[\w\s]+',
+}
+for key in contents.keys():
+ contents[key] = re.compile(contents[key])
+
+_schemes = r"""(
+acap # application configuration access protocol
+|afs # Andrew File System global file names
+|cid # content identifier
+|data # data
+|dav # dav
+|fax # fax
+|imap # internet message access protocol
+|ldap # Lightweight Directory Access Protocol
+|mailserver # Access to data available from mail servers
+|mid # message identifier
+|modem # modem
+|nfs # network file system protocol
+|opaquelocktoken # opaquelocktoken
+|pop # Post Office Protocol v3
+|prospero # Prospero Directory Service
+|rtsp # real time streaming protocol
+|service # service location
+|sip # session initiation protocol
+|tel # telephone
+|tip # Transaction Internet Protocol
+|tn3270 # Interactive 3270 emulation sessions
+|vemmi # versatile multimedia interface
+|wais # Wide Area Information Servers
+|z39\.50r # Z39.50 Retrieval
+|z39\.50s # Z39.50 Session
+|chrome # Mozilla specific
+|find # Mozilla specific
+|clsid # Microsoft specific
+|javascript # JavaScript
+|isbn # ISBN (int. book numbers)
+|https? # HTTP/HTTPS
+|ftp # FTP
+|file # local file
+|telnet # telnet
+|mailto # mailto
+|gopher # gopher
+|s?news # news
+|nntp # news
+)"""
+_url = r"(?i)%s:[-a-zA-Z0-9$_.+!*'/(),;]+" % _schemes
+_url_re = re.compile(_url, re.VERBOSE)
+
+
+class FileUrlData (UrlData):
"Url link with file scheme"
def __init__(self,
@@ -50,42 +107,51 @@ class FileUrlData(UrlData):
self.urlName = os.getcwd()+"/"+self.urlName
if winre.search(self.urlName):
self.adjustWinPath()
- self.urlName = self.urlName.replace("\\", "/")
- self.urlName = "file://"+self.urlName
+ self.urlName = "file://"+self.urlName.replace("\\", "/")
- def buildUrl(self):
+ def buildUrl (self):
UrlData.buildUrl(self)
# cut off parameter, query and fragment
self.url = urlparse.urlunparse(self.urlTuple[:3] + ('','',''))
- def adjustWinPath(self):
+ def adjustWinPath (self):
"c:\\windows ==> /c|\\windows"
self.urlName = "/"+self.urlName[0]+"|"+self.urlName[2:]
- def isHtml(self):
- if html_re.search(self.url) or opera_re.search(self.url):
- return 1
+ def isHtml (self):
+ # guess by extension
+ for ro in extensions.values():
+ if ro.search(self.url):
+ return 1
# try to read content (can fail, so catch error)
try:
- return html_content_re.search(self.getContent()) or \
- opera_content_re.search(self.getContent())
+ for ro in contents.values():
+ if ro.search(self.getContent()):
+ return 1
except IOError:
pass
return None
- def parseUrl(self, config):
- if html_re.search(self.url) or \
- html_content_re.search(self.getContent()):
- UrlData.parseUrl(self, config)
- return
+ def parseUrl (self, config):
+ for key,ro in extensions.items():
+ if ro.search(self.url):
+ return getattr(self, "parse_"+key)(config)
+ for key,ro in contents.items():
+ if ro.search(self.getContent()):
+ return getattr(self, "parse_"+key)(config)
+
+ def parse_html (self, config):
+ UrlData.parseUrl(self, config)
+
+ def parse_opera (self, config):
# parse an opera bookmark file
name = ""
lineno = 0
- for line in self.getContent().split("\n"):
+ for line in self.getContent().splitlines():
lineno += 1
line = line.strip()
if line.startswith("NAME="):
@@ -93,7 +159,21 @@ class FileUrlData(UrlData):
elif line.startswith("URL="):
url = line[4:]
if url:
- from UrlData import GetUrlDataFrom
- config.appendUrl(GetUrlDataFrom(url,
+ config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url,
self.recursionLevel+1, self.url, None, lineno, name))
name = ""
+
+ def parse_text (self, config):
+ lineno = 0
+ for line in self.getContent().splitlines():
+ lineno += 1
+ i = 0
+ while 1:
+ mo = _url_re.search(line, i)
+ if not mo: break
+ config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(mo.group(),
+ self.recursionLevel+1, self.url, None, lineno, ""))
+ i = mo.end()
+
+ return
+
diff --git a/linkcheck/IgnoredUrlData.py b/linkcheck/IgnoredUrlData.py
index 2dcb97fa..ef7f9cc5 100644
--- a/linkcheck/IgnoredUrlData.py
+++ b/linkcheck/IgnoredUrlData.py
@@ -18,7 +18,7 @@
import re, linkcheck
from UrlData import UrlData
-ignored_schemes_re = re.compile(r"""^(
+ignored_schemes = r"""^(
acap # application configuration access protocol
|afs # Andrew File System global file names
|cid # content identifier
@@ -49,7 +49,9 @@ acap # application configuration access protocol
|clsid # Microsoft specific
|javascript # JavaScript
|isbn # ISBN (int. book numbers)
-):""", re.VERBOSE)
+):"""
+
+ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
class IgnoredUrlData(UrlData):
diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py
index 5820f43c..7804134c 100644
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -171,8 +171,8 @@ class UrlData:
self.html_comments = []
self.has_content = 0
url = get_absolute_url(self.urlName, self.baseRef, self.parentName)
- self.scheme = url.split(":", 1)[0] or "unknown"
-
+ # assume file link if no scheme is found
+ self.scheme = url.split(":", 1)[0] or "file"
def setError(self, s):
self.valid=0
@@ -191,14 +191,12 @@ class UrlData:
else:
self.warningString = s
-
def setInfo(self, s):
if self.infoString:
self.infoString += "\n"+s
else:
self.infoString = s
-
def copyFrom(self, urlData):
self.errorString = urlData.errorString
self.validString = urlData.validString
diff --git a/linkchecker b/linkchecker
index 5aac4fb3..6bcfe952 100755
--- a/linkchecker
+++ b/linkchecker
@@ -64,7 +64,8 @@ For single-letter option arguments the space is not a necessity. So
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked.
-o type, --output=type
- Specify output type as %s. Default type is text.
+ Specify output type as %s.
+ Default type is text.
-p pwd, --password=pwd
Try password pwd for HTML and FTP authorization.
Default password is 'joe@'. See also -u.
@@ -105,8 +106,8 @@ For single-letter option arguments the space is not a necessity. So
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'.
- This option implies -w.\n") % linkcheck.Config.LoggerKeys
-""")
+ This option implies -w.
+""") % linkcheck.Config.LoggerKeys
Notes = linkcheck._("""NOTES
o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL
diff --git a/test/output/test_file b/test/output/test_file
index 733f7852..ece6d675 100644
--- a/test/output/test_file
+++ b/test/output/test_file
@@ -1,6 +1,10 @@
test_file
url file:///home/calvin/projects/linkchecker/test/html/file.html
valid
+url file:///home/calvin/projects/linkchecker/test/html/file.txt
+valid
+url file:///home/calvin/projects/linkchecker/test/html/file.asc
+valid
url http.html
name relative url
valid
@@ -27,3 +31,9 @@ error
url file:/etc/
name good dir
valid
+url file:///etc/group
+cached
+valid
+url file:///etc/group
+cached
+valid
diff --git a/test/output/test_http b/test/output/test_http
index 81ff5370..6bf1229f 100644
--- a/test/output/test_http
+++ b/test/output/test_http
@@ -4,7 +4,6 @@ valid
url http://www.garantiertnixgutt.bla
name bad url
warning Missing '/' at end of URL
-Server does not support HEAD request (got 500 status), falling back to GET
error
url http://www.heise.de
name ok
diff --git a/test/output/test_mail b/test/output/test_mail
index f4c26728..419dce22 100644
--- a/test/output/test_mail
+++ b/test/output/test_mail
@@ -3,13 +3,15 @@ url file:///home/calvin/projects/linkchecker/test/html/mail.html
valid
url mailto:calvin@LocalHost?subject=Hallo&to=michi
name 1
-error
+warning No MX mail host for LocalHost found
+valid
url mailto:Dude , Killer ?subject=bla
name 2
valid
url mailto:Bastian Kleineidam ?bcc=jsmith%40company.com
name 3
-error
+warning No MX mail host for company.com found
+valid
url mailto:Bastian Kleineidam
name 4
valid
@@ -20,9 +22,9 @@ valid
url mailto:o'hara@cs.uni-sb.de
name 5
valid
-url mailto:?to=calvin@studcs.uni-sb.de?subject=blubb
+url mailto:?to=calvin@studcs.uni-sb.de&subject=blubb&cc=calvin_cc@studcs.uni-sb.de&CC=calvin_CC@studcs.uni-sb.de
name ...
-error
+valid
url mailto:news-admins@freshmeat.net?subject=Re:%20[fm%20#11093]%20(news-admins)%20Submission%20report%20-%20Pretty%20CoLoRs
name ...
valid
@@ -31,10 +33,12 @@ name ...
valid
url mailto:a@d?subject=äöü
name 5
-error
+warning No MX mail host for d found
+valid
url mailto:calvin@cs.uni-sb.de?subject=Halli hallo
name _
valid
url mailto:Bastian Kleineidam
name 3
-error
+warning No MX mail host for host1 found
+valid
diff --git a/test/test_file.py b/test/test_file.py
index 0ff66e60..9fa25365 100644
--- a/test/test_file.py
+++ b/test/test_file.py
@@ -8,7 +8,7 @@ config["anchors"] = 1
config["verbose"] = 1
config.disableThreading()
htmldir = "test/html"
-for file in ('file.html',):
+for file in ('file.html',"file.txt","file.asc"):
url = os.path.join(htmldir, file)
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
linkcheck.checkUrls(config)