See changelog

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@82 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-08 14:44:46 +00:00 · 2000-05-16 15:31:19 +00:00 · 2000-05-16 15:31:19 +00:00 · 71b07ee8d8
commit 71b07ee8d8
parent 9b311e1e7a
12 changed files with 91 additions and 183 deletions
--- a/2
+++ b/2
@@ -6,7 +6,7 @@ You need Python >= 1.5.2
 You get Python from http://www.python.org

 Optionally packages:
-Distutils >= 0.8.1 from http://www.python.org/sigs/distutils-sig/ 
+Distutils >= 0.9 from http://www.python.org/sigs/distutils-sig/ 
 OpenSSL from http://www.openssl.org
 You will need Perl for Win32 (available from
 http://www.activestate.com/ActivePerl) if you want to install OpenSSL 
--- a/20
+++ b/20
@@ -1,9 +1,9 @@
 VERSION=$(shell ./setup.py --version)
-HOST=treasure.calvinsplayground.de
-PROXY=
+#HOST=treasure.calvinsplayground.de
+#PROXY=
 #PROXY=-P$(HOST):5050
-#HOST=fsinfo.cs.uni-sb.de
-#PROXY=-Pwww-proxy.uni-sb.de:3128
+HOST=fsinfo.cs.uni-sb.de
+PROXY=-Pwww-proxy.uni-sb.de:3128
 PACKAGE = linkchecker
 DEBPACKAGE = $(PACKAGE)_$(VERSION)_i386.deb
 ALLPACKAGES = ../$(DEBPACKAGE)
@@ -13,22 +13,12 @@ TAR = tar
 ZIP = zip

 all:
+	@echo "run ./setup.py --help to see how to install"

 clean:
 	./setup.py clean --all
 	rm -rf $(ALLPACKAGES) $(PACKAGE)-out.*

-install:
-	# ha! the root option finally made it into distutils
-	./setup.py install --root=$(DESTDIR)
-	# german translation mit Rezepten von Zlatko :)
-	msgfmt -o linkcheck.mo linkcheck/linkcheck.po
-	install -c -m 644 linkcheck.mo $(DESTDIR)/usr/share/locale/de/LC_MESSAGES/
-	# remove following line if Distutils have script support
-	#install -c -m 755 linkchecker $(DESTDIR)/usr/bin/
-	install -c -m 644 linkcheckerrc $(DESTDIR)/etc/
-	install -c -m 644 DNS/README $(DESTDIR)/usr/share/doc/$(PACKAGE)/README.dns
-
 dist:
 	./setup.py sdist
 	fakeroot debian/rules binary
--- a/debian/changelog
+++ b/debian/changelog
@@ -13,8 +13,10 @@ linkchecker (1.2.3) unstable; urgency=low
  * i18n support and german translation of the logger outputs
  * use http_proxy environment variable if present
  * be more RFC822 and RFC2368 compliant when scanning mail syntax
+  * fix for incorrect line number in logger output (reported by Michael
+    Schmitz)

- -- Bastian Kleineidam <calvin@users.sourceforge.net>  Tue,  9 May 2000 00:15:12 +0200
+ -- Bastian Kleineidam <calvin@users.sourceforge.net>  Tue, 16 May 2000 14:53:23 +0200

 linkchecker (1.2.2) unstable; urgency=low

--- a/debian/rules
+++ b/debian/rules
@@ -33,7 +33,15 @@ install: build
 	dh_clean -k
 	dh_installdirs
 	# Add here commands to install the package into debian/tmp.
-	$(MAKE) install DESTDIR=`pwd`/debian/tmp
+	# ha! the root option finally made it into distutils
+	./setup.py install --root=`pwd`/debian/tmp
+	# german translation mit Rezepten von Zlatko :)
+	msgfmt -o linkcheck.mo linkcheck/linkcheck.po
+	install -c -m 644 linkcheck.mo debian/tmp/usr/share/locale/de/LC_MESSAGES/
+	# remove following line if Distutils have script support
+	#install -c -m 755 linkchecker debian/tmp/usr/bin/
+	install -c -m 644 linkcheckerrc debian/tmp/etc/
+	install -c -m 644 DNS/README debian/tmp/usr/share/doc/$(PACKAGE)/README.dns


 # Build architecture-independent files here.
--- a/linkcheck/HttpUrlData.py
+++ b/linkcheck/HttpUrlData.py
@@ -162,8 +162,9 @@ class HttpUrlData(UrlData):
            t = time.time()
            status, statusText, self.mime = self._getHttpRequest("GET")
            self.urlConnection = self.urlConnection.getfile()
-            self.data = StringUtil.stripHtmlComments(self.urlConnection.read())
+            self.data = self.urlConnection.read()
            self.downloadtime = time.time() - t
+            self._init_html_comments()
            #Config.debug(Config.DebugDelim+self.data+Config.DebugDelim)
        
    def isHtml(self):
--- a/linkcheck/MailtoUrlData.py
+++ b/linkcheck/MailtoUrlData.py
@@ -22,17 +22,17 @@ from UrlData import LinkCheckerException

 # regular expression strings for partially RFC822 compliant adress scanning
 # XXX far from complete mail adress scanning; enhance only when needed!
-word = r"[\w\-%']+"
-words = r"[\w\-%'\s]+"
-dotwords = "("+word+r"(?:\."+word+")*)"
-adress = dotwords+"@"+dotwords
-route_adress = words+"<"+adress+">"
-mailbox = "("+adress+"|"+route_adress+")"
-mailboxes = mailbox+r"?(,+"+mailbox+")*"
+word = r"[-\w%']+"
+words = r"[-\w%'\s]+"
+dotwords = r"(%s(?:\.%s)*)" % (word,word)
+adress = "%s@%s" % (dotwords, dotwords)
+route_adress = "%s<%s>" % (words, adress)
+mailbox = "(%s|%s)" % (adress, route_adress)
+mailboxes = "%s?(,%s)*" % (mailbox, mailbox)

 # regular expression strings for RFC2368 compliant mailto: scanning
 header = word+"="+word
-headers = "?"+header+"(&"+header+")*"
+headers = r"(?:\?%s(&%s)*)?" % (header, header)
 mailto = "^mailto:"+mailboxes+headers

 # compiled
--- a/linkcheck/OutputReader.py
+++ b/linkcheck/OutputReader.py
@@ -1,95 +0,0 @@
-"""
-    Copyright (C) 2000  Bastian Kleineidam
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-"""
-import string,re
-import UrlData
-
-class ParseException(Exception):
-    pass
-
-
-ws = re.compile("\s+")
-regex_realUrl = re.compile("^Real URL.+")
-regex_result = re.compile("^Result.+")
-regex_base = re.compile("^Base.+")
-regex_info = re.compile("^Info.+")
-regex_warning = re.compile("^Warning.+") 
-regex_parentUrl = re.compile("^Parent URL.+")
-regex_valid = re.compile("^Valid.*")
-
-
-class OutputReader:
-
-    def resetState(self):
-        self.urlName = None
-        self.parentName = None
-        self.baseRef = None
-        self.info = None
-        self.warning = None
-        self.result = None
-        self.linenumber = 0
-        self.state = 0
-
-    def parse(self, file):
-        line = file.readline()
-        url = None
-        urls = []
-        self.resetState()
-
-        while line:
-            if ws.match(line):
-                if self.state>=2: 
-                    #append url
-                    urldata = UrlData.GetUrlDataFrom(self.urlName, 0, 
-                    self.parentName, self.baseRef, self.linenumber)
-                    if self.info:
-                        urldata.setInfo(self.info)
-                    if self.warning:
-                        urldata.setWarning(self.info)
-                    if OutputReader.regex_valid.match(self.result):
-                        urldata.valid=1
-                        urldata.validString = self.result
-                    else:
-                        urldata.valid=0
-                        urldata.errorString = self.result
-                    urls.append(urldata)
-                elif self.state:
-                    raise ParseException, "No Real URL and Result keyword found"
-                self.resetState()
-                
-            elif regex_realUrl.match(line):
-                self.state = self.state+1
-                self.urlName = string.strip(line[8:])
-            elif regex_result.match(line):
-                self.state = self.state+1
-                self.result = string.strip(line[6:])
-            elif regex_info.match(line):
-                self.info = string.strip(line[4:])
-            elif regex_base.match(line):
-                self.baseRef = string.strip(line[4:])
-            elif regex_warning.match(line):
-                self.warning = string.strip(line[7:])
-            elif regex_parentUrl.match(line):
-                self.parentName = string.strip(line[10:])
-                if ',' in self.parentName:
-                    self.parentName,self.linenumber = string.split(self.parentName,",",1)
-            else:
-                pass
-                
-            line = file.readline()
-        return urls
-
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -24,7 +24,7 @@ LinkTags = [("a",     "href"),
            ("body",  "background"),
            ("frame", "src"),
            ("link",  "href"),
-            # <meta http-equiv="refresh" content="5; url=...">
+            # <meta http-equiv="refresh" content="x; url=...">
            ("meta",  "url"),  
            ("area",  "href")]

@@ -59,6 +59,7 @@ class UrlData:
        self.urlConnection = None
        self.extern = 1
        self.data = None
+        self.html_comments = []
        
        
    def setError(self, s):
@@ -216,7 +217,7 @@ class UrlData:
        if not (anchor!="" and self.isHtml() and self.valid):
            return
        self.getContent()
-        for cur_anchor,line in self.searchInForTag(self.data, ("a", "name")):
+        for cur_anchor,line in self.searchInForTag("a", "name"):
            if cur_anchor == anchor:
                return
        self.setWarning("anchor #"+anchor+" not found")
@@ -245,17 +246,35 @@ class UrlData:


    def getContent(self):
-        """Precondition: urlConnection is an opened URL.
-        """
+        """Precondition: urlConnection is an opened URL."""
        if not self.data:
            t = time.time()
-            self.data = StringUtil.stripHtmlComments(self.urlConnection.read())
+            self.data = self.urlConnection.read()
            self.downloadtime = time.time() - t
+            self._init_html_comments()
+        return self.data
+
+
+    def _init_html_comments(self):
+        # if we find a URL inside HTML comments we ignore it
+        # so build a list of intervals which are HTML comments
+        pattern = re.compile("<!--.*?-->")
+        index = 0
+        while 1:
+            match = pattern.search(self.data, index)
+            if not match: break
+            index = match.end()
+            self.html_comments.append(match.span())
+
+    def _isInComment(self, index):
+        for low,high in self.html_comments:
+            if low < index and index < high:
+                return 1
+        return 0


    def checkContent(self, warningregex):
-        self.getContent()
-        match = warningregex.search(self.data)
+        match = warningregex.search(self.getContent())
        if match:
            self.setWarning("Found '"+match.group()+"' in link contents")

@@ -263,10 +282,8 @@ class UrlData:
    def parseUrl(self, config):
        Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\
                         str(self)+"\n"+Config.DebugDelim)
-        self.getContent()
-
        # search for a possible base reference
-        bases = self.searchInForTag(self.data, ("base", "href"))
+        bases = self.searchInForTag("base", "href")
        baseRef = None
        if len(bases)>=1:
            baseRef = bases[0][0]
@@ -274,34 +291,30 @@ class UrlData:
                self.setWarning("more than one base tag found")
            
        # search for tags and add found tags to URL queue
-        for tag in LinkTags:
-            urls = self.searchInForTag(self.data, tag)
-            Config.debug("DEBUG: "+str(tag)+" urls="+str(urls)+"\n")
-            for _url,line in urls:
-                config.appendUrl(GetUrlDataFrom(_url,
+        for start,end in LinkTags:
+            urls = self.searchInForTag(start,end)
+            Config.debug("DEBUG: tag=%s %s, urls=%s\n" % (start,end,urls))
+            for url,line in urls:
+                config.appendUrl(GetUrlDataFrom(url,
                        self.recursionLevel+1, self.url, baseRef, line))


-    def searchInForTag(self, data, tag):
-        _urls = []
-        _prefix="<\s*"+tag[0]+"\s+[^>]*?"+tag[1]+"\s*=\s*"
-        _suffix="[^>]*>"
-        _patterns = [re.compile(_prefix+"\"([^\"]+)\""+_suffix, re.I),
-                     re.compile(_prefix+"([^\s>]+)"   +_suffix, re.I)]
-        cutofflines = 0
-        for _pattern in _patterns:
-            while 1:
-                _match = _pattern.search(data)
-                if not _match: break
-                # need to strip optional ending quotes for the <meta url=> tag
-                linenumberbegin = StringUtil.getLineNumber(data, _match.start(0))
-                linenumberend = StringUtil.getLineNumber(data, _match.end(0))
-                cutofflines = cutofflines + linenumberend - linenumberbegin
-                _urls.append((string.strip(StringUtil.rstripQuotes(_match.group(1))),
-                     linenumberbegin + cutofflines))
-                data = data[:_match.start(0)] + data[_match.end(0):]
-        
-        return _urls
+    def searchInForTag(self, tag_start, tag_end):
+        urls = []
+        prefix=r"<\s*"+tag_start+r"\s+[^>]*?"+tag_end+r"\s*=\s*"
+        suffix="[^>]*>"
+        pattern = re.compile(prefix+"([^\"\s>]+|\"[^\"]+\")"+suffix, re.I)
+        index = 0
+        while 1:
+            match = pattern.search(self.getContent(), index)
+            if not match: break
+            index = match.end()
+            if self._isInComment(match.start()): continue
+            # need to strip optional ending quotes for the meta tag
+            urls.append((string.strip(StringUtil.stripQuotes(match.group(1))), 
+                          StringUtil.getLineNumber(self.getContent(), 
+                                                   match.start())))
+        return urls


    def __str__(self):
@@ -313,9 +326,9 @@ class UrlData:


    def _getUserPassword(self, config):
-        for rx, _user, _password in config["authentication"]:
+        for rx, user, password in config["authentication"]:
            if rx.match(self.url):
-                return _user, _password
+                return user, password


 from FileUrlData import FileUrlData
@@ -341,23 +354,23 @@ def GetUrlDataFrom(urlName,
    elif parentName and ":" in parentName:
        name = string.lower(parentName)
    # test scheme
-    if re.compile("^http:").search(name):
+    if re.search("^http:", name):
        return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.compile("^ftp:").search(name):
+    if re.search("^ftp:", name):
        return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.compile("^file:").search(name):
+    if re.search("^file:", name):
        return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.compile("^telnet:").search(name):
+    if re.search("^telnet:", name):
        return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.compile("^mailto:").search(name):
+    if re.search("^mailto:", name):
        return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.compile("^gopher:").search(name):
+    if re.search("^gopher:", name):
        return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.compile("^javascript:").search(name):
+    if re.search("^javascript:", name):
        return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.compile("^https:").search(name):
+    if re.search("^https:", name):
        return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
-    if re.compile("^news:").search(name):
+    if re.search("^news:", name):
        return NntpUrlData(urlName, recursionLevel, parentName, baseRef, line)
    # assume local file
    return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
--- a/linkcheck/init.py
+++ b/linkcheck/init.py
@@ -30,7 +30,7 @@ try:
 except ImportError:
    def gettext(msg):
        return msg
-import Config,UrlData,OutputReader,sys,lc_cgi
+import Config,UrlData,sys,lc_cgi

 def checkUrls(config = Config.Configuration()):
    """ checkUrls gets a complete configuration object as parameter where all
--- a/linkchecker.bat
+++ b/linkchecker.bat
@@ -1,4 +1,4 @@
 @echo off

 rem Limited to 9 parameters? Is there a $* for Windows?
-python "c:\progra~1\linkchecker-1.2.3\linkchecker" %1 %2 %3 %4 %5 %6 %7 %8 %9
+python "@install_scripts@\linkchecker" %1 %2 %3 %4 %5 %6 %7 %8 %9
--- a/setup.py
+++ b/setup.py
@@ -22,13 +22,11 @@ from Template import Template
 import sys,os,string

 # Autodetect the existence of an SSL library (this is pretty shitty)
-# Autodetect Windows platforms to include the linkchecker.bat script
 class LCDistribution(Distribution):
    default_include_dirs = ['/usr/include/openssl',
                            '/usr/local/include/openssl']
    def run_commands (self):
        self.check_ssl()
-        self.check_windows()
        for cmd in self.commands:
            self.run_command (cmd)

@@ -46,16 +44,6 @@ class LCDistribution(Distribution):
                          "disabling SSL compilation.\n"
 			  "Use the -I option for the build_ext command.")

-    def check_windows(self):
-        if sys.platform=='win32':
-            inst = self.find_command_obj("install")
-            inst.ensure_ready()
-            t = Template("linkchecker.bat.tmpl")
-            f = open("linkchecker.bat","w")
-	    f.write(t.fill_in({"path_to_linkchecker": inst.install_scripts}))
-            f.close()
-            self.scripts.append('linkchecker.bat')
-
    def has_ssl(self):
        incls = self.find_command_obj("build_ext").include_dirs
        incls = (incls and string.split(incls, os.pathsep)) or []
@@ -89,5 +77,5 @@ o robots.txt exclusion protocol support
 """,
       distclass = LCDistribution,
       packages = ['','DNS','linkcheck'],
-       scripts = ['linkchecker'],
+       scripts = ['linkchecker', 'linkchecker.bat'],
 )
--- a/test/base2.html
+++ b/test/base2.html
@@ -1,3 +1,4 @@
+<!--comment-->
 <base href="file:/etc/">
 <a href="passwd">