diff --git a/TODO b/TODO index f7c02b9d..3e06114d 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,4 @@ +Dont assume .html on local files: guess mime, parse URIs Check why threaded app wont exit resp. is stalled Another Profiling roundup Named constants for ANSI Color codes diff --git a/debian/changelog b/debian/changelog index 582f769e..7631863d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,12 @@ +linkchecker (1.3.23) unstable; urgency=low + + * linkcheck/linkname.py: workaround for a bug regex matching with + re.DOTALL. This could result in href="" names not found correctly. + * linkcheck/linkname.py: immediately return on tags inside . + * linkchecker: interpolate %s in help text + + -- Bastian Kleineidam Wed, 13 Mar 2002 21:31:57 +0100 + linkchecker (1.3.22) unstable; urgency=low * last release before 1.4.0 diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py index 29eb23af..0bbba12e 100644 --- a/linkcheck/FileUrlData.py +++ b/linkcheck/FileUrlData.py @@ -21,12 +21,69 @@ from UrlData import UrlData, ExcList # OSError is thrown on Windows when a file is not found ExcList.append(OSError) -html_re = re.compile(r'(?i)\.s?html?$') -html_content_re = re.compile(r'(?i).*') -opera_re = re.compile(r'^(?i)opera.adr$') -opera_content_re = re.compile(r'(?i)Opera Hotlist') +# file extensions we can parse recursively +extensions = { + "html": r'(?i)\.s?html?$', + "opera": r'^(?i)opera.adr$', # opera bookmark file + "text": r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$', +} +for key in extensions.keys(): + extensions[key] = re.compile(extensions[key]) -class FileUrlData(UrlData): +# if file extension was fruitless, look at the content +contents = { + "html": r'(?i).*', + "opera" : r'Opera Hotlist', + "text" : r'[\w\s]+', +} +for key in contents.keys(): + contents[key] = re.compile(contents[key]) + +_schemes = r"""( +acap # application configuration access protocol +|afs # Andrew File System global file names +|cid # content identifier +|data # data +|dav # dav +|fax # fax +|imap # internet message access protocol +|ldap # Lightweight Directory Access Protocol +|mailserver # Access to data available from mail servers +|mid # message identifier +|modem # modem +|nfs # network file system protocol +|opaquelocktoken # opaquelocktoken +|pop # Post Office Protocol v3 +|prospero # Prospero Directory Service +|rtsp # real time streaming protocol +|service # service location +|sip # session initiation protocol +|tel # telephone +|tip # Transaction Internet Protocol +|tn3270 # Interactive 3270 emulation sessions +|vemmi # versatile multimedia interface +|wais # Wide Area Information Servers +|z39\.50r # Z39.50 Retrieval +|z39\.50s # Z39.50 Session +|chrome # Mozilla specific +|find # Mozilla specific +|clsid # Microsoft specific +|javascript # JavaScript +|isbn # ISBN (int. book numbers) +|https? # HTTP/HTTPS +|ftp # FTP +|file # local file +|telnet # telnet +|mailto # mailto +|gopher # gopher +|s?news # news +|nntp # news +)""" +_url = r"(?i)%s:[-a-zA-Z0-9$_.+!*'/(),;]+" % _schemes +_url_re = re.compile(_url, re.VERBOSE) + + +class FileUrlData (UrlData): "Url link with file scheme" def __init__(self, @@ -50,42 +107,51 @@ class FileUrlData(UrlData): self.urlName = os.getcwd()+"/"+self.urlName if winre.search(self.urlName): self.adjustWinPath() - self.urlName = self.urlName.replace("\\", "/") - self.urlName = "file://"+self.urlName + self.urlName = "file://"+self.urlName.replace("\\", "/") - def buildUrl(self): + def buildUrl (self): UrlData.buildUrl(self) # cut off parameter, query and fragment self.url = urlparse.urlunparse(self.urlTuple[:3] + ('','','')) - def adjustWinPath(self): + def adjustWinPath (self): "c:\\windows ==> /c|\\windows" self.urlName = "/"+self.urlName[0]+"|"+self.urlName[2:] - def isHtml(self): - if html_re.search(self.url) or opera_re.search(self.url): - return 1 + def isHtml (self): + # guess by extension + for ro in extensions.values(): + if ro.search(self.url): + return 1 # try to read content (can fail, so catch error) try: - return html_content_re.search(self.getContent()) or \ - opera_content_re.search(self.getContent()) + for ro in contents.values(): + if ro.search(self.getContent()): + return 1 except IOError: pass return None - def parseUrl(self, config): - if html_re.search(self.url) or \ - html_content_re.search(self.getContent()): - UrlData.parseUrl(self, config) - return + def parseUrl (self, config): + for key,ro in extensions.items(): + if ro.search(self.url): + return getattr(self, "parse_"+key)(config) + for key,ro in contents.items(): + if ro.search(self.getContent()): + return getattr(self, "parse_"+key)(config) + + def parse_html (self, config): + UrlData.parseUrl(self, config) + + def parse_opera (self, config): # parse an opera bookmark file name = "" lineno = 0 - for line in self.getContent().split("\n"): + for line in self.getContent().splitlines(): lineno += 1 line = line.strip() if line.startswith("NAME="): @@ -93,7 +159,21 @@ class FileUrlData(UrlData): elif line.startswith("URL="): url = line[4:] if url: - from UrlData import GetUrlDataFrom - config.appendUrl(GetUrlDataFrom(url, + config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, self.recursionLevel+1, self.url, None, lineno, name)) name = "" + + def parse_text (self, config): + lineno = 0 + for line in self.getContent().splitlines(): + lineno += 1 + i = 0 + while 1: + mo = _url_re.search(line, i) + if not mo: break + config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(mo.group(), + self.recursionLevel+1, self.url, None, lineno, "")) + i = mo.end() + + return + diff --git a/linkcheck/IgnoredUrlData.py b/linkcheck/IgnoredUrlData.py index 2dcb97fa..ef7f9cc5 100644 --- a/linkcheck/IgnoredUrlData.py +++ b/linkcheck/IgnoredUrlData.py @@ -18,7 +18,7 @@ import re, linkcheck from UrlData import UrlData -ignored_schemes_re = re.compile(r"""^( +ignored_schemes = r"""^( acap # application configuration access protocol |afs # Andrew File System global file names |cid # content identifier @@ -49,7 +49,9 @@ acap # application configuration access protocol |clsid # Microsoft specific |javascript # JavaScript |isbn # ISBN (int. book numbers) -):""", re.VERBOSE) +):""" + +ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE) class IgnoredUrlData(UrlData): diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index 5820f43c..7804134c 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -171,8 +171,8 @@ class UrlData: self.html_comments = [] self.has_content = 0 url = get_absolute_url(self.urlName, self.baseRef, self.parentName) - self.scheme = url.split(":", 1)[0] or "unknown" - + # assume file link if no scheme is found + self.scheme = url.split(":", 1)[0] or "file" def setError(self, s): self.valid=0 @@ -191,14 +191,12 @@ class UrlData: else: self.warningString = s - def setInfo(self, s): if self.infoString: self.infoString += "\n"+s else: self.infoString = s - def copyFrom(self, urlData): self.errorString = urlData.errorString self.validString = urlData.validString diff --git a/linkchecker b/linkchecker index 5aac4fb3..6bcfe952 100755 --- a/linkchecker +++ b/linkchecker @@ -64,7 +64,8 @@ For single-letter option arguments the space is not a necessity. So environment variable NNTP_SERVER. If no host is given, only the syntax of the link is checked. -o type, --output=type - Specify output type as %s. Default type is text. + Specify output type as %s. + Default type is text. -p pwd, --password=pwd Try password pwd for HTML and FTP authorization. Default password is 'joe@'. See also -u. @@ -105,8 +106,8 @@ For single-letter option arguments the space is not a necessity. So Use this to check for pages that contain some form of error message, for example 'This page has moved' or 'Oracle Application Server error'. - This option implies -w.\n") % linkcheck.Config.LoggerKeys -""") + This option implies -w. +""") % linkcheck.Config.LoggerKeys Notes = linkcheck._("""NOTES o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL diff --git a/test/output/test_file b/test/output/test_file index 733f7852..ece6d675 100644 --- a/test/output/test_file +++ b/test/output/test_file @@ -1,6 +1,10 @@ test_file url file:///home/calvin/projects/linkchecker/test/html/file.html valid +url file:///home/calvin/projects/linkchecker/test/html/file.txt +valid +url file:///home/calvin/projects/linkchecker/test/html/file.asc +valid url http.html name relative url valid @@ -27,3 +31,9 @@ error url file:/etc/ name good dir valid +url file:///etc/group +cached +valid +url file:///etc/group +cached +valid diff --git a/test/output/test_http b/test/output/test_http index 81ff5370..6bf1229f 100644 --- a/test/output/test_http +++ b/test/output/test_http @@ -4,7 +4,6 @@ valid url http://www.garantiertnixgutt.bla name bad url warning Missing '/' at end of URL -Server does not support HEAD request (got 500 status), falling back to GET error url http://www.heise.de name ok diff --git a/test/output/test_mail b/test/output/test_mail index f4c26728..419dce22 100644 --- a/test/output/test_mail +++ b/test/output/test_mail @@ -3,13 +3,15 @@ url file:///home/calvin/projects/linkchecker/test/html/mail.html valid url mailto:calvin@LocalHost?subject=Hallo&to=michi name 1 -error +warning No MX mail host for LocalHost found +valid url mailto:Dude , Killer ?subject=bla name 2 valid url mailto:Bastian Kleineidam ?bcc=jsmith%40company.com name 3 -error +warning No MX mail host for company.com found +valid url mailto:Bastian Kleineidam name 4 valid @@ -20,9 +22,9 @@ valid url mailto:o'hara@cs.uni-sb.de name 5 valid -url mailto:?to=calvin@studcs.uni-sb.de?subject=blubb +url mailto:?to=calvin@studcs.uni-sb.de&subject=blubb&cc=calvin_cc@studcs.uni-sb.de&CC=calvin_CC@studcs.uni-sb.de name ... -error +valid url mailto:news-admins@freshmeat.net?subject=Re:%20[fm%20#11093]%20(news-admins)%20Submission%20report%20-%20Pretty%20CoLoRs name ... valid @@ -31,10 +33,12 @@ name ... valid url mailto:a@d?subject=äöü name 5 -error +warning No MX mail host for d found +valid url mailto:calvin@cs.uni-sb.de?subject=Halli hallo name _ valid url mailto:Bastian Kleineidam name 3 -error +warning No MX mail host for host1 found +valid diff --git a/test/test_file.py b/test/test_file.py index 0ff66e60..9fa25365 100644 --- a/test/test_file.py +++ b/test/test_file.py @@ -8,7 +8,7 @@ config["anchors"] = 1 config["verbose"] = 1 config.disableThreading() htmldir = "test/html" -for file in ('file.html',): +for file in ('file.html',"file.txt","file.asc"): url = os.path.join(htmldir, file) config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0)) linkcheck.checkUrls(config)