support text files

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@380 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-11 16:13:11 +00:00 · 2002-03-14 21:18:52 +00:00 · 2002-03-14 21:18:52 +00:00 · 3365ea48ab
commit 3365ea48ab
parent 15989c088c
10 changed files with 143 additions and 39 deletions
--- a/1
+++ b/1
@ -1,3 +1,4 @@
 Don't assume .html on local files: guess mime, parse URIs
 Check why threaded app won't exit resp. is stalled
 Another Profiling roundup
 Named constants for ANSI Color codes
--- a/debian/changelog
+++ b/debian/changelog
@ -1,3 +1,12 @@
 linkchecker (1.3.23) unstable; urgency=low
  * linkcheck/linkname.py: workaround for a bug in regex matching with
    re.DOTALL. This could result in href="" names not being found correctly.
  * linkcheck/linkname.py: immediately return on <img> tags inside <a>.
  * linkchecker: interpolate %s in help text
 -- Bastian Kleineidam <calvin@debian.org>  Wed, 13 Mar 2002 21:31:57 +0100
 linkchecker (1.3.22) unstable; urgency=low
  * last release before 1.4.0
--- a/linkcheck/FileUrlData.py
+++ b/linkcheck/FileUrlData.py
@ -21,12 +21,69 @@ from UrlData import UrlData, ExcList
 # OSError is thrown on Windows when a file is not found
 ExcList.append(OSError)
-html_re = re.compile(r'(?i)\.s?html?$')
+# file extensions we can parse recursively
-html_content_re = re.compile(r'(?i)<html>.*</html>')
+extensions = {
-opera_re = re.compile(r'^(?i)opera.adr$')
+    "html": r'(?i)\.s?html?$',
-opera_content_re = re.compile(r'(?i)Opera Hotlist')
+    "opera": r'^(?i)opera.adr$', # opera bookmark file
    "text": r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$',
 }
 for key in extensions.keys():
    extensions[key] = re.compile(extensions[key])
-class FileUrlData(UrlData):
+# if file extension was fruitless, look at the content
 contents = {
    "html": r'(?i)<html>.*</html>',
    "opera" : r'Opera Hotlist',
    "text" : r'[\w\s]+',
 }
 for key in contents.keys():
    contents[key] = re.compile(contents[key])
 _schemes = r"""(
 acap        # application configuration access protocol
 |afs        # Andrew File System global file names
 |cid        # content identifier
 |data       # data
 |dav        # dav
 |fax        # fax
 |imap       # internet message access protocol
 |ldap       # Lightweight Directory Access Protocol
 |mailserver # Access to data available from mail servers
 |mid        # message identifier
 |modem      # modem
 |nfs        # network file system protocol
 |opaquelocktoken # opaquelocktoken
 |pop        # Post Office Protocol v3
 |prospero   # Prospero Directory Service
 |rtsp       # real time streaming protocol
 |service    # service location
 |sip        # session initiation protocol
 |tel        # telephone
 |tip        # Transaction Internet Protocol
 |tn3270     # Interactive 3270 emulation sessions
 |vemmi      # versatile multimedia interface
 |wais       # Wide Area Information Servers
 |z39\.50r   # Z39.50 Retrieval
 |z39\.50s   # Z39.50 Session
 |chrome     # Mozilla specific
 |find       # Mozilla specific
 |clsid      # Microsoft specific
 |javascript # JavaScript
 |isbn       # ISBN (int. book numbers)
 |https?     # HTTP/HTTPS
 |ftp        # FTP
 |file       # local file
 |telnet     # telnet
 |mailto     # mailto
 |gopher     # gopher
 |s?news     # news
 |nntp       # news
 )"""
 _url = r"(?i)%s:[-a-zA-Z0-9$_.+!*'/(),;]+" % _schemes
 _url_re = re.compile(_url, re.VERBOSE)
 class FileUrlData (UrlData):
    "Url link with file scheme"
    def __init__(self,
@ -50,42 +107,51 @@ class FileUrlData(UrlData):
                    self.urlName = os.getcwd()+"/"+self.urlName
                    if winre.search(self.urlName):
                        self.adjustWinPath()
-            self.urlName = self.urlName.replace("\\", "/")
+            self.urlName = "file://"+self.urlName.replace("\\", "/")
            self.urlName = "file://"+self.urlName
-    def buildUrl(self):
+    def buildUrl (self):
        UrlData.buildUrl(self)
        # cut off parameter, query and fragment
        self.url = urlparse.urlunparse(self.urlTuple[:3] + ('','',''))
-    def adjustWinPath(self):
+    def adjustWinPath (self):
        "c:\\windows ==> /c|\\windows"
        self.urlName = "/"+self.urlName[0]+"|"+self.urlName[2:]
-    def isHtml(self):
+    def isHtml (self):
-        if html_re.search(self.url) or opera_re.search(self.url):
+        # guess by extension
-            return 1
+        for ro in extensions.values():
            if ro.search(self.url):
                return 1
        # try to read content (can fail, so catch error)
        try:
-            return html_content_re.search(self.getContent()) or \
+            for ro in contents.values():
-                   opera_content_re.search(self.getContent())
+                if ro.search(self.getContent()):
                    return 1
        except IOError:
            pass
        return None
-    def parseUrl(self, config):
+    def parseUrl (self, config):
-        if html_re.search(self.url) or \
+        for key,ro in extensions.items():
-           html_content_re.search(self.getContent()):
+            if ro.search(self.url):
-            UrlData.parseUrl(self, config)
+                return getattr(self, "parse_"+key)(config)
-            return
+        for key,ro in contents.items():
            if ro.search(self.getContent()):
                return getattr(self, "parse_"+key)(config)
    def parse_html (self, config):
        UrlData.parseUrl(self, config)
    def parse_opera (self, config):
        # parse an opera bookmark file
        name = ""
        lineno = 0
-        for line in self.getContent().split("\n"):
+        for line in self.getContent().splitlines():
            lineno += 1
            line = line.strip()
            if line.startswith("NAME="):
@ -93,7 +159,21 @@ class FileUrlData(UrlData):
            elif line.startswith("URL="):
                url = line[4:]
                if url:
-                    from UrlData import GetUrlDataFrom
+                    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url,
                    config.appendUrl(GetUrlDataFrom(url,
                        self.recursionLevel+1, self.url, None, lineno, name))
                name = ""
    def parse_text (self, config):
        lineno = 0
        for line in self.getContent().splitlines():
            lineno += 1
            i = 0
            while 1:
                mo = _url_re.search(line, i)
                if not mo: break
                config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(mo.group(),
                        self.recursionLevel+1, self.url, None, lineno, ""))
                i = mo.end()
        return
--- a/linkcheck/IgnoredUrlData.py
+++ b/linkcheck/IgnoredUrlData.py
@ -18,7 +18,7 @@
 import re, linkcheck
 from UrlData import UrlData
-ignored_schemes_re = re.compile(r"""^(
+ignored_schemes = r"""^(
 acap        # application configuration access protocol
 |afs        # Andrew File System global file names
 |cid        # content identifier
@ -49,7 +49,9 @@ acap        # application configuration access protocol
 |clsid      # Microsoft specific
 |javascript # JavaScript
 |isbn       # ISBN (int. book numbers)
-):""", re.VERBOSE)
+):"""
 ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
 class IgnoredUrlData(UrlData):
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@ -171,8 +171,8 @@ class UrlData:
        self.html_comments = []
        self.has_content = 0
        url = get_absolute_url(self.urlName, self.baseRef, self.parentName)
-        self.scheme = url.split(":", 1)[0] or "unknown"
+        # assume file link if no scheme is found
-
+        self.scheme = url.split(":", 1)[0] or "file"
    def setError(self, s):
        self.valid=0
@ -191,14 +191,12 @@ class UrlData:
        else:
            self.warningString = s
    def setInfo(self, s):
        if self.infoString:
            self.infoString += "\n"+s
        else:
            self.infoString = s
    def copyFrom(self, urlData):
        self.errorString = urlData.errorString
        self.validString = urlData.validString
--- a/7
+++ b/7
@ -64,7 +64,8 @@ For single-letter option arguments the space is not a necessity. So
        environment variable NNTP_SERVER. If no host is given,
        only the syntax of the link is checked.
 -o type, --output=type
-        Specify output type as %s. Default type is text.
+        Specify output type as %s.
        Default type is text.
 -p pwd, --password=pwd
        Try password pwd for HTML and FTP authorization.
        Default password is 'joe@'. See also -u.
@ -105,8 +106,8 @@ For single-letter option arguments the space is not a necessity. So
        Use this to check for pages that contain some form of error
        message, for example 'This page has moved' or 'Oracle
        Application Server error'.
-        This option implies -w.\n") % linkcheck.Config.LoggerKeys
+        This option implies -w.
-""")
+""") % linkcheck.Config.LoggerKeys
 Notes = linkcheck._("""NOTES
 o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL
--- a/test/output/test_file
+++ b/test/output/test_file
@ -1,6 +1,10 @@
 test_file
 url file:///home/calvin/projects/linkchecker/test/html/file.html
 valid
 url file:///home/calvin/projects/linkchecker/test/html/file.txt
 valid
 url file:///home/calvin/projects/linkchecker/test/html/file.asc
 valid
 url http.html
 name relative url
 valid
@ -27,3 +31,9 @@ error
 url file:/etc/
 name good dir
 valid
 url file:///etc/group
 cached
 valid
 url file:///etc/group
 cached
 valid
--- a/test/output/test_http
+++ b/test/output/test_http
@ -4,7 +4,6 @@ valid
 url http://www.garantiertnixgutt.bla
 name bad url
 warning Missing '/' at end of URL
 Server does not support HEAD request (got 500 status), falling back to GET
 error
 url http://www.heise.de
 name ok
--- a/test/output/test_mail
+++ b/test/output/test_mail
@ -3,13 +3,15 @@ url file:///home/calvin/projects/linkchecker/test/html/mail.html
 valid
 url mailto:calvin@LocalHost?subject=Hallo&to=michi
 name 1
-error
+warning No MX mail host for LocalHost found
 valid
 url mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla
 name 2
 valid
 url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com
 name 3
-error
+warning No MX mail host for company.com found
 valid
 url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>
 name 4
 valid
@ -20,9 +22,9 @@ valid
 url mailto:o'hara@cs.uni-sb.de
 name 5
 valid
-url mailto:?to=calvin@studcs.uni-sb.de?subject=blubb
+url mailto:?to=calvin@studcs.uni-sb.de&subject=blubb&cc=calvin_cc@studcs.uni-sb.de&CC=calvin_CC@studcs.uni-sb.de
 name ...
-error
+valid
 url mailto:news-admins@freshmeat.net?subject=Re:%20[fm%20#11093]%20(news-admins)%20Submission%20report%20-%20Pretty%20CoLoRs
 name ...
 valid
@ -31,10 +33,12 @@ name ...
 valid
 url mailto:a@d?subject=äöü
 name 5
-error
+warning No MX mail host for d found
 valid
 url mailto:calvin@cs.uni-sb.de?subject=Halli hallo
 name _
 valid
 url mailto:Bastian Kleineidam <calvin@host1?foo=bar>
 name 3
-error
+warning No MX mail host for host1 found
 valid
--- a/test/test_file.py
+++ b/test/test_file.py
@ -8,7 +8,7 @@ config["anchors"] = 1
 config["verbose"] = 1
 config.disableThreading()
 htmldir = "test/html"
-for file in ('file.html',):
+for file in ('file.html',"file.txt","file.asc"):
    url = os.path.join(htmldir, file)
    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
 linkcheck.checkUrls(config)