support text files

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@380 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2002-03-14 21:18:52 +00:00
parent 15989c088c
commit 3365ea48ab
10 changed files with 143 additions and 39 deletions

1
TODO
View file

@ -1,3 +1,4 @@
Don't assume .html on local files: guess the MIME type, parse URIs
Check why threaded app wont exit resp. is stalled Check why threaded app wont exit resp. is stalled
Another Profiling roundup Another Profiling roundup
Named constants for ANSI Color codes Named constants for ANSI Color codes

9
debian/changelog vendored
View file

@ -1,3 +1,12 @@
linkchecker (1.3.23) unstable; urgency=low
* linkcheck/linkname.py: workaround for a regex matching bug with
re.DOTALL. This could result in href="" names not being found correctly.
* linkcheck/linkname.py: immediately return on <img> tags inside <a>.
* linkchecker: interpolate %s in help text
-- Bastian Kleineidam <calvin@debian.org> Wed, 13 Mar 2002 21:31:57 +0100
linkchecker (1.3.22) unstable; urgency=low linkchecker (1.3.22) unstable; urgency=low
* last release before 1.4.0 * last release before 1.4.0

View file

@ -21,12 +21,69 @@ from UrlData import UrlData, ExcList
# OSError is thrown on Windows when a file is not found # OSError is thrown on Windows when a file is not found
ExcList.append(OSError) ExcList.append(OSError)
html_re = re.compile(r'(?i)\.s?html?$') # file extensions we can parse recursively
html_content_re = re.compile(r'(?i)<html>.*</html>') extensions = {
opera_re = re.compile(r'^(?i)opera.adr$') "html": r'(?i)\.s?html?$',
opera_content_re = re.compile(r'(?i)Opera Hotlist') "opera": r'^(?i)opera.adr$', # opera bookmark file
"text": r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$',
}
for key in extensions.keys():
extensions[key] = re.compile(extensions[key])
class FileUrlData(UrlData): # if file extension was fruitless, look at the content
# Content-sniffing patterns, keyed by the same names as the parse_<key>
# handlers. They are consulted when the file extension gave no answer.
contents = {
    # (?s) lets "." span newlines: real HTML documents are multi-line, so
    # without it the pattern only matched one-line documents. The opening
    # tag may also carry attributes (e.g. <html lang="en">).
    "html": r'(?is)<html(\s[^>]*)?>.*</html>',
    # marker searched for to recognize an Opera bookmark file
    "opera": r'Opera Hotlist',
    # any run of word or whitespace characters counts as text
    "text": r'[\w\s]+',
}
# Replace every pattern string with its compiled regex object. Iterate over
# a snapshot so the dict is not walked while its values are reassigned.
for key, pattern in list(contents.items()):
    contents[key] = re.compile(pattern)
_schemes = r"""(
acap # application configuration access protocol
|afs # Andrew File System global file names
|cid # content identifier
|data # data
|dav # dav
|fax # fax
|imap # internet message access protocol
|ldap # Lightweight Directory Access Protocol
|mailserver # Access to data available from mail servers
|mid # message identifier
|modem # modem
|nfs # network file system protocol
|opaquelocktoken # opaquelocktoken
|pop # Post Office Protocol v3
|prospero # Prospero Directory Service
|rtsp # real time streaming protocol
|service # service location
|sip # session initiation protocol
|tel # telephone
|tip # Transaction Internet Protocol
|tn3270 # Interactive 3270 emulation sessions
|vemmi # versatile multimedia interface
|wais # Wide Area Information Servers
|z39\.50r # Z39.50 Retrieval
|z39\.50s # Z39.50 Session
|chrome # Mozilla specific
|find # Mozilla specific
|clsid # Microsoft specific
|javascript # JavaScript
|isbn # ISBN (int. book numbers)
|https? # HTTP/HTTPS
|ftp # FTP
|file # local file
|telnet # telnet
|mailto # mailto
|gopher # gopher
|s?news # news
|nntp # news
)"""
_url = r"(?i)%s:[-a-zA-Z0-9$_.+!*'/(),;]+" % _schemes
_url_re = re.compile(_url, re.VERBOSE)
class FileUrlData (UrlData):
"Url link with file scheme" "Url link with file scheme"
def __init__(self, def __init__(self,
@ -50,42 +107,51 @@ class FileUrlData(UrlData):
self.urlName = os.getcwd()+"/"+self.urlName self.urlName = os.getcwd()+"/"+self.urlName
if winre.search(self.urlName): if winre.search(self.urlName):
self.adjustWinPath() self.adjustWinPath()
self.urlName = self.urlName.replace("\\", "/") self.urlName = "file://"+self.urlName.replace("\\", "/")
self.urlName = "file://"+self.urlName
def buildUrl(self): def buildUrl (self):
UrlData.buildUrl(self) UrlData.buildUrl(self)
# cut off parameter, query and fragment # cut off parameter, query and fragment
self.url = urlparse.urlunparse(self.urlTuple[:3] + ('','','')) self.url = urlparse.urlunparse(self.urlTuple[:3] + ('','',''))
def adjustWinPath(self): def adjustWinPath (self):
"c:\\windows ==> /c|\\windows" "c:\\windows ==> /c|\\windows"
self.urlName = "/"+self.urlName[0]+"|"+self.urlName[2:] self.urlName = "/"+self.urlName[0]+"|"+self.urlName[2:]
def isHtml(self): def isHtml (self):
if html_re.search(self.url) or opera_re.search(self.url): # guess by extension
return 1 for ro in extensions.values():
if ro.search(self.url):
return 1
# try to read content (can fail, so catch error) # try to read content (can fail, so catch error)
try: try:
return html_content_re.search(self.getContent()) or \ for ro in contents.values():
opera_content_re.search(self.getContent()) if ro.search(self.getContent()):
return 1
except IOError: except IOError:
pass pass
return None return None
def parseUrl(self, config): def parseUrl (self, config):
if html_re.search(self.url) or \ for key,ro in extensions.items():
html_content_re.search(self.getContent()): if ro.search(self.url):
UrlData.parseUrl(self, config) return getattr(self, "parse_"+key)(config)
return for key,ro in contents.items():
if ro.search(self.getContent()):
return getattr(self, "parse_"+key)(config)
def parse_html (self, config):
UrlData.parseUrl(self, config)
def parse_opera (self, config):
# parse an opera bookmark file # parse an opera bookmark file
name = "" name = ""
lineno = 0 lineno = 0
for line in self.getContent().split("\n"): for line in self.getContent().splitlines():
lineno += 1 lineno += 1
line = line.strip() line = line.strip()
if line.startswith("NAME="): if line.startswith("NAME="):
@ -93,7 +159,21 @@ class FileUrlData(UrlData):
elif line.startswith("URL="): elif line.startswith("URL="):
url = line[4:] url = line[4:]
if url: if url:
from UrlData import GetUrlDataFrom config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url,
config.appendUrl(GetUrlDataFrom(url,
self.recursionLevel+1, self.url, None, lineno, name)) self.recursionLevel+1, self.url, None, lineno, name))
name = "" name = ""
def parse_text (self, config):
    """Queue every URL found in this file's plain-text content.

    The content is scanned line by line with _url_re. Each match is
    appended to config as a new url data object one recursion level
    deeper, with this file's url as parent, the 1-based line number
    of the match and an empty link name.
    """
    lineno = 0
    for text in self.getContent().splitlines():
        lineno += 1
        # finditer yields the same successive non-overlapping matches
        # as repeatedly searching from the end of the previous match
        for match in _url_re.finditer(text):
            config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(
                match.group(), self.recursionLevel+1, self.url,
                None, lineno, ""))

View file

@ -18,7 +18,7 @@
import re, linkcheck import re, linkcheck
from UrlData import UrlData from UrlData import UrlData
ignored_schemes_re = re.compile(r"""^( ignored_schemes = r"""^(
acap # application configuration access protocol acap # application configuration access protocol
|afs # Andrew File System global file names |afs # Andrew File System global file names
|cid # content identifier |cid # content identifier
@ -49,7 +49,9 @@ acap # application configuration access protocol
|clsid # Microsoft specific |clsid # Microsoft specific
|javascript # JavaScript |javascript # JavaScript
|isbn # ISBN (int. book numbers) |isbn # ISBN (int. book numbers)
):""", re.VERBOSE) ):"""
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
class IgnoredUrlData(UrlData): class IgnoredUrlData(UrlData):

View file

@ -171,8 +171,8 @@ class UrlData:
self.html_comments = [] self.html_comments = []
self.has_content = 0 self.has_content = 0
url = get_absolute_url(self.urlName, self.baseRef, self.parentName) url = get_absolute_url(self.urlName, self.baseRef, self.parentName)
self.scheme = url.split(":", 1)[0] or "unknown" # assume file link if no scheme is found
self.scheme = url.split(":", 1)[0] or "file"
def setError(self, s): def setError(self, s):
self.valid=0 self.valid=0
@ -191,14 +191,12 @@ class UrlData:
else: else:
self.warningString = s self.warningString = s
def setInfo(self, s): def setInfo(self, s):
if self.infoString: if self.infoString:
self.infoString += "\n"+s self.infoString += "\n"+s
else: else:
self.infoString = s self.infoString = s
def copyFrom(self, urlData): def copyFrom(self, urlData):
self.errorString = urlData.errorString self.errorString = urlData.errorString
self.validString = urlData.validString self.validString = urlData.validString

View file

@ -64,7 +64,8 @@ For single-letter option arguments the space is not a necessity. So
environment variable NNTP_SERVER. If no host is given, environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked. only the syntax of the link is checked.
-o type, --output=type -o type, --output=type
Specify output type as %s. Default type is text. Specify output type as %s.
Default type is text.
-p pwd, --password=pwd -p pwd, --password=pwd
Try password pwd for HTML and FTP authorization. Try password pwd for HTML and FTP authorization.
Default password is 'joe@'. See also -u. Default password is 'joe@'. See also -u.
@ -105,8 +106,8 @@ For single-letter option arguments the space is not a necessity. So
Use this to check for pages that contain some form of error Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle message, for example 'This page has moved' or 'Oracle
Application Server error'. Application Server error'.
This option implies -w.\n") % linkcheck.Config.LoggerKeys This option implies -w.
""") """) % linkcheck.Config.LoggerKeys
Notes = linkcheck._("""NOTES Notes = linkcheck._("""NOTES
o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL

View file

@ -1,6 +1,10 @@
test_file test_file
url file:///home/calvin/projects/linkchecker/test/html/file.html url file:///home/calvin/projects/linkchecker/test/html/file.html
valid valid
url file:///home/calvin/projects/linkchecker/test/html/file.txt
valid
url file:///home/calvin/projects/linkchecker/test/html/file.asc
valid
url http.html url http.html
name relative url name relative url
valid valid
@ -27,3 +31,9 @@ error
url file:/etc/ url file:/etc/
name good dir name good dir
valid valid
url file:///etc/group
cached
valid
url file:///etc/group
cached
valid

View file

@ -4,7 +4,6 @@ valid
url http://www.garantiertnixgutt.bla url http://www.garantiertnixgutt.bla
name bad url name bad url
warning Missing '/' at end of URL warning Missing '/' at end of URL
Server does not support HEAD request (got 500 status), falling back to GET
error error
url http://www.heise.de url http://www.heise.de
name ok name ok

View file

@ -3,13 +3,15 @@ url file:///home/calvin/projects/linkchecker/test/html/mail.html
valid valid
url mailto:calvin@LocalHost?subject=Hallo&to=michi url mailto:calvin@LocalHost?subject=Hallo&to=michi
name 1 name 1
error warning No MX mail host for LocalHost found
valid
url mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla url mailto:Dude <calvin@studcs.uni-sb.de> , Killer <calvin@cs.uni-sb.de>?subject=bla
name 2 name 2
valid valid
url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>?bcc=jsmith%40company.com
name 3 name 3
error warning No MX mail host for company.com found
valid
url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de> url mailto:Bastian Kleineidam <calvin@studcs.uni-sb.de>
name 4 name 4
valid valid
@ -20,9 +22,9 @@ valid
url mailto:o'hara@cs.uni-sb.de url mailto:o'hara@cs.uni-sb.de
name 5 name 5
valid valid
url mailto:?to=calvin@studcs.uni-sb.de?subject=blubb url mailto:?to=calvin@studcs.uni-sb.de&subject=blubb&cc=calvin_cc@studcs.uni-sb.de&CC=calvin_CC@studcs.uni-sb.de
name ... name ...
error valid
url mailto:news-admins@freshmeat.net?subject=Re:%20[fm%20#11093]%20(news-admins)%20Submission%20report%20-%20Pretty%20CoLoRs url mailto:news-admins@freshmeat.net?subject=Re:%20[fm%20#11093]%20(news-admins)%20Submission%20report%20-%20Pretty%20CoLoRs
name ... name ...
valid valid
@ -31,10 +33,12 @@ name ...
valid valid
url mailto:a@d?subject=äöü url mailto:a@d?subject=äöü
name 5 name 5
error warning No MX mail host for d found
valid
url mailto:calvin@cs.uni-sb.de?subject=Halli hallo url mailto:calvin@cs.uni-sb.de?subject=Halli hallo
name _ name _
valid valid
url mailto:Bastian Kleineidam <calvin@host1?foo=bar> url mailto:Bastian Kleineidam <calvin@host1?foo=bar>
name 3 name 3
error warning No MX mail host for host1 found
valid

View file

@ -8,7 +8,7 @@ config["anchors"] = 1
config["verbose"] = 1 config["verbose"] = 1
config.disableThreading() config.disableThreading()
htmldir = "test/html" htmldir = "test/html"
for file in ('file.html',): for file in ('file.html',"file.txt","file.asc"):
url = os.path.join(htmldir, file) url = os.path.join(htmldir, file)
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0)) config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
linkcheck.checkUrls(config) linkcheck.checkUrls(config)