support text files

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@380 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-08 14:44:46 +00:00 · 2002-03-14 21:18:52 +00:00 · 2002-03-14 21:18:52 +00:00 · 3365ea48ab
commit 3365ea48ab
parent 15989c088c
10 changed files with 143 additions and 39 deletions
--- a/1
+++ b/1
@ -1,3 +1,4 @@
+Don't assume .html on local files: guess mime, parse URIs
 Check why threaded app wont exit resp. is stalled
 Another Profiling roundup
 Named constants for ANSI Color codes
--- a/debian/changelog
+++ b/debian/changelog
@ -1,3 +1,12 @@
+linkchecker (1.3.23) unstable; urgency=low
+
+  * linkcheck/linkname.py: workaround for a regex matching bug with
+    re.DOTALL. This could result in href="" names not being found correctly.
+  * linkcheck/linkname.py: immediately return on <img> tags inside <a>.
+  * linkchecker: interpolate %s in help text
+
+ -- Bastian Kleineidam <calvin@debian.org>  Wed, 13 Mar 2002 21:31:57 +0100
+
 linkchecker (1.3.22) unstable; urgency=low

  * last release before 1.4.0
--- a/linkcheck/FileUrlData.py
+++ b/linkcheck/FileUrlData.py
@ -21,12 +21,69 @@ from UrlData import UrlData, ExcList
 # OSError is thrown on Windows when a file is not found
 ExcList.append(OSError)

-html_re = re.compile(r'(?i)\.s?html?$')
-html_content_re = re.compile(r'(?i)<html>.*</html>')
-opera_re = re.compile(r'^(?i)opera.adr$')
-opera_content_re = re.compile(r'(?i)Opera Hotlist')
+# file extensions we can parse recursively
+extensions = {
+    "html": r'(?i)\.s?html?$',
+    "opera": r'^(?i)opera.adr$', # opera bookmark file
+    "text": r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$',
+}
+for key in extensions.keys():
+    extensions[key] = re.compile(extensions[key])

-class FileUrlData(UrlData):
+# if file extension was fruitless, look at the content
+contents = {
+    "html": r'(?i)<html>.*</html>',
+    "opera" : r'Opera Hotlist',
+    "text" : r'[\w\s]+',
+}
+for key in contents.keys():
+    contents[key] = re.compile(contents[key])
+
+_schemes = r"""(
+acap        # application configuration access protocol
+|afs        # Andrew File System global file names
+|cid        # content identifier
+|data       # data
+|dav        # dav
+|fax        # fax
+|imap       # internet message access protocol
+|ldap       # Lightweight Directory Access Protocol
+|mailserver # Access to data available from mail servers
+|mid        # message identifier
+|modem      # modem
+|nfs        # network file system protocol
+|opaquelocktoken # opaquelocktoken
+|pop        # Post Office Protocol v3
+|prospero   # Prospero Directory Service
+|rtsp       # real time streaming protocol
+|service    # service location
+|sip        # session initiation protocol
+|tel        # telephone
+|tip        # Transaction Internet Protocol
+|tn3270     # Interactive 3270 emulation sessions
+|vemmi      # versatile multimedia interface
+|wais       # Wide Area Information Servers
+|z39\.50r   # Z39.50 Retrieval
+|z39\.50s   # Z39.50 Session
+|chrome     # Mozilla specific
+|find       # Mozilla specific
+|clsid      # Microsoft specific
+|javascript # JavaScript
+|isbn       # ISBN (int. book numbers)
+|https?     # HTTP/HTTPS
+|ftp        # FTP
+|file       # local file
+|telnet     # telnet
+|mailto     # mailto
+|gopher     # gopher
+|s?news     # news
+|nntp       # news
+)"""
+_url = r"(?i)%s:[-a-zA-Z0-9$_.+!*'/(),;]+" % _schemes
+_url_re = re.compile(_url, re.VERBOSE)
+
+
+class FileUrlData (UrlData):
    "Url link with file scheme"

    def __init__(self,
@ -50,42 +107,51 @@ class FileUrlData(UrlData):
                    self.urlName = os.getcwd()+"/"+self.urlName
                    if winre.search(self.urlName):
                        self.adjustWinPath()
-            self.urlName = self.urlName.replace("\\", "/")
-            self.urlName = "file://"+self.urlName
+            self.urlName = "file://"+self.urlName.replace("\\", "/")


-    def buildUrl(self):
+    def buildUrl (self):
        UrlData.buildUrl(self)
        # cut off parameter, query and fragment
        self.url = urlparse.urlunparse(self.urlTuple[:3] + ('','',''))


-    def adjustWinPath(self):
+    def adjustWinPath (self):
        "c:\\windows ==> /c|\\windows"
        self.urlName = "/"+self.urlName[0]+"|"+self.urlName[2:]


-    def isHtml(self):
-        if html_re.search(self.url) or opera_re.search(self.url):
-            return 1
+    def isHtml (self):
+        # guess by extension
+        for ro in extensions.values():
+            if ro.search(self.url):
+                return 1
        # try to read content (can fail, so catch error)
        try:
-            return html_content_re.search(self.getContent()) or \
-                   opera_content_re.search(self.getContent())
+            for ro in contents.values():
+                if ro.search(self.getContent()):
+                    return 1
        except IOError:
            pass
        return None


-    def parseUrl(self, config):
-        if html_re.search(self.url) or \
-           html_content_re.search(self.getContent()):
-            UrlData.parseUrl(self, config)
-            return
+    def parseUrl (self, config):
+        for key,ro in extensions.items():
+            if ro.search(self.url):
+                return getattr(self, "parse_"+key)(config)
+        for key,ro in contents.items():
+            if ro.search(self.getContent()):
+                return getattr(self, "parse_"+key)(config)
+
+    def parse_html (self, config):
+        UrlData.parseUrl(self, config)
+
+    def parse_opera (self, config):
        # parse an opera bookmark file
        name = ""
        lineno = 0
-        for line in self.getContent().split("\n"):
+        for line in self.getContent().splitlines():
            lineno += 1
            line = line.strip()
            if line.startswith("NAME="):
@ -93,7 +159,21 @@ class FileUrlData(UrlData):
            elif line.startswith("URL="):
                url = line[4:]
                if url:
-                    from UrlData import GetUrlDataFrom
-                    config.appendUrl(GetUrlDataFrom(url,
+                    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url,
                        self.recursionLevel+1, self.url, None, lineno, name))
                name = ""
+
+    def parse_text (self, config):
+        lineno = 0
+        for line in self.getContent().splitlines():
+            lineno += 1
+            i = 0
+            while 1:
+                mo = _url_re.search(line, i)
+                if not mo: break
+                config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(mo.group(),
+                        self.recursionLevel+1, self.url, None, lineno, ""))
+                i = mo.end()
+
+        return
+
--- a/linkcheck/IgnoredUrlData.py
+++ b/linkcheck/IgnoredUrlData.py
@ -18,7 +18,7 @@
 import re, linkcheck
 from UrlData import UrlData

-ignored_schemes_re = re.compile(r"""^(
+ignored_schemes = r"""^(
 acap        # application configuration access protocol
 |afs        # Andrew File System global file names
 |cid        # content identifier
@ -49,7 +49,9 @@ acap        # application configuration access protocol
 |clsid      # Microsoft specific
 |javascript # JavaScript
 |isbn       # ISBN (int. book numbers)
-):""", re.VERBOSE)
+):"""
+
+ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)


 class IgnoredUrlData(UrlData):
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@ -171,8 +171,8 @@ class UrlData:
        self.html_comments = []
        self.has_content = 0
        url = get_absolute_url(self.urlName, self.baseRef, self.parentName)
-        self.scheme = url.split(":", 1)[0] or "unknown"
-
+        # assume file link if no scheme is found
+        self.scheme = url.split(":", 1)[0] or "file"

    def setError(self, s):
        self.valid=0
@ -191,14 +191,12 @@ class UrlData:
        else:
            self.warningString = s

-
    def setInfo(self, s):
        if self.infoString:
            self.infoString += "\n"+s
        else:
            self.infoString = s

-
    def copyFrom(self, urlData):
        self.errorString = urlData.errorString
        self.validString = urlData.validString
--- a/7
+++ b/7
@ -64,7 +64,8 @@ For single-letter option arguments the space is not a necessity. So
        environment variable NNTP_SERVER. If no host is given,
        only the syntax of the link is checked.
 -o type, --output=type
-        Specify output type as %s. Default type is text.
+        Specify output type as %s.
+        Default type is text.
 -p pwd, --password=pwd
        Try password pwd for HTML and FTP authorization.
        Default password is 'joe@'. See also -u.
@ -105,8 +106,8 @@ For single-letter option arguments the space is not a necessity. So
        Use this to check for pages that contain some form of error
        message, for example 'This page has moved' or 'Oracle
        Application Server error'.
-        This option implies -w.\n") % linkcheck.Config.LoggerKeys
-""")
+        This option implies -w.
+""") % linkcheck.Config.LoggerKeys

 Notes = linkcheck._("""NOTES
 o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL
--- a/test/output/test_file
+++ b/test/output/test_file
@ -1,6 +1,10 @@
 test_file
 url file:///home/calvin/projects/linkchecker/test/html/file.html
 valid
+url file:///home/calvin/projects/linkchecker/test/html/file.txt
+valid
+url file:///home/calvin/projects/linkchecker/test/html/file.asc
+valid
 url http.html
 name relative url
 valid
@ -27,3 +31,9 @@ error
 url file:/etc/
 name good dir
 valid
+url file:///etc/group
+cached
+valid
+url file:///etc/group
+cached
+valid
--- a/test/output/test_http
+++ b/test/output/test_http
@ -4,7 +4,6 @@ valid
 url http://www.garantiertnixgutt.bla
 name bad url
 warning Missing '/' at end of URL
-Server does not support HEAD request (got 500 status), falling back to GET
 error
 url http://www.heise.de
 name ok
--- a/test/output/test_mail
+++ b/test/output/test_mail
@ -3,13 +3,15 @@ url file:///home/calvin/projects/linkchecker/test/html/mail.html
 valid
 url mailto:calvin@LocalHost?subject=Hallo&to=michi
 name 1
-error
+warning No MX mail host for LocalHost found
+valid
 url mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla
 name 2
 valid
 url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com
 name 3
-error
+warning No MX mail host for company.com found
+valid
 url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>
 name 4
 valid
@ -20,9 +22,9 @@ valid
 url mailto:o'hara@cs.uni-sb.de
 name 5
 valid
-url mailto:?to=calvin@studcs.uni-sb.de?subject=blubb
+url mailto:?to=calvin@studcs.uni-sb.de&subject=blubb&cc=calvin_cc@studcs.uni-sb.de&CC=calvin_CC@studcs.uni-sb.de
 name ...
-error
+valid
 url mailto:news-admins@freshmeat.net?subject=Re:%20[fm%20#11093]%20(news-admins)%20Submission%20report%20-%20Pretty%20CoLoRs
 name ...
 valid
@ -31,10 +33,12 @@ name ...
 valid
 url mailto:a@d?subject=äöü
 name 5
-error
+warning No MX mail host for d found
+valid
 url mailto:calvin@cs.uni-sb.de?subject=Halli hallo
 name _
 valid
 url mailto:Bastian Kleineidam <calvin@host1?foo=bar>
 name 3
-error
+warning No MX mail host for host1 found
+valid
--- a/test/test_file.py
+++ b/test/test_file.py
@ -8,7 +8,7 @@ config["anchors"] = 1
 config["verbose"] = 1
 config.disableThreading()
 htmldir = "test/html"
-for file in ('file.html',):
+for file in ('file.html',"file.txt","file.asc"):
    url = os.path.join(htmldir, file)
    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
 linkcheck.checkUrls(config)