diff --git a/INSTALL b/INSTALL index dce29b89..9e49b3f2 100644 --- a/INSTALL +++ b/INSTALL @@ -6,7 +6,7 @@ You need Python >= 1.5.2 You get Python from http://www.python.org Optionally packages: -Distutils >= 0.8.1 from http://www.python.org/sigs/distutils-sig/ +Distutils >= 0.9 from http://www.python.org/sigs/distutils-sig/ OpenSSL from http://www.openssl.org You will need Perl for Win32 (available from http://www.activestate.com/ActivePerl) if you want to install OpenSSL diff --git a/Makefile b/Makefile index a33b930c..4b63989f 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,9 @@ VERSION=$(shell ./setup.py --version) -HOST=treasure.calvinsplayground.de -PROXY= +#HOST=treasure.calvinsplayground.de +#PROXY= #PROXY=-P$(HOST):5050 -#HOST=fsinfo.cs.uni-sb.de -#PROXY=-Pwww-proxy.uni-sb.de:3128 +HOST=fsinfo.cs.uni-sb.de +PROXY=-Pwww-proxy.uni-sb.de:3128 PACKAGE = linkchecker DEBPACKAGE = $(PACKAGE)_$(VERSION)_i386.deb ALLPACKAGES = ../$(DEBPACKAGE) @@ -13,22 +13,12 @@ TAR = tar ZIP = zip all: + @echo "run ./setup.py --help to see how to install" clean: ./setup.py clean --all rm -rf $(ALLPACKAGES) $(PACKAGE)-out.* -install: - # ha! 
the root option finally made it into distutils - ./setup.py install --root=$(DESTDIR) - # german translation mit Rezepten von Zlatko :) - msgfmt -o linkcheck.mo linkcheck/linkcheck.po - install -c -m 644 linkcheck.mo $(DESTDIR)/usr/share/locale/de/LC_MESSAGES/ - # remove following line if Distutils have script support - #install -c -m 755 linkchecker $(DESTDIR)/usr/bin/ - install -c -m 644 linkcheckerrc $(DESTDIR)/etc/ - install -c -m 644 DNS/README $(DESTDIR)/usr/share/doc/$(PACKAGE)/README.dns - dist: ./setup.py sdist fakeroot debian/rules binary diff --git a/debian/changelog b/debian/changelog index fb1d0057..f0aa4508 100644 --- a/debian/changelog +++ b/debian/changelog @@ -13,8 +13,10 @@ linkchecker (1.2.3) unstable; urgency=low * i18n support and german translation of the logger outputs * use http_proxy environment variable if present * be more RFC822 and RFC2368 compliant when scanning mail syntax + * fix for incorrect line number in logger output (reported by Michael + Schmitz) - -- Bastian Kleineidam Tue, 9 May 2000 00:15:12 +0200 + -- Bastian Kleineidam Tue, 16 May 2000 14:53:23 +0200 linkchecker (1.2.2) unstable; urgency=low diff --git a/debian/rules b/debian/rules index 6ff3a3a5..d32acf4b 100755 --- a/debian/rules +++ b/debian/rules @@ -33,7 +33,15 @@ install: build dh_clean -k dh_installdirs # Add here commands to install the package into debian/tmp. - $(MAKE) install DESTDIR=`pwd`/debian/tmp + # ha! the root option finally made it into distutils + ./setup.py install --root=`pwd`/debian/tmp + # german translation mit Rezepten von Zlatko :) + msgfmt -o linkcheck.mo linkcheck/linkcheck.po + install -c -m 644 linkcheck.mo debian/tmp/usr/share/locale/de/LC_MESSAGES/ + # remove following line if Distutils have script support + #install -c -m 755 linkchecker debian/tmp/usr/bin/ + install -c -m 644 linkcheckerrc debian/tmp/etc/ + install -c -m 644 DNS/README debian/tmp/usr/share/doc/$(PACKAGE)/README.dns # Build architecture-independent files here. 
diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py index 8845153c..12159f6e 100644 --- a/linkcheck/HttpUrlData.py +++ b/linkcheck/HttpUrlData.py @@ -162,8 +162,9 @@ class HttpUrlData(UrlData): t = time.time() status, statusText, self.mime = self._getHttpRequest("GET") self.urlConnection = self.urlConnection.getfile() - self.data = StringUtil.stripHtmlComments(self.urlConnection.read()) + self.data = self.urlConnection.read() self.downloadtime = time.time() - t + self._init_html_comments() #Config.debug(Config.DebugDelim+self.data+Config.DebugDelim) def isHtml(self): diff --git a/linkcheck/MailtoUrlData.py b/linkcheck/MailtoUrlData.py index 800c325c..8bee114a 100644 --- a/linkcheck/MailtoUrlData.py +++ b/linkcheck/MailtoUrlData.py @@ -22,17 +22,17 @@ from UrlData import LinkCheckerException # regular expression strings for partially RFC822 compliant adress scanning # XXX far from complete mail adress scanning; enhance only when needed! -word = r"[\w\-%']+" -words = r"[\w\-%'\s]+" -dotwords = "("+word+r"(?:\."+word+")*) -adress = dotwords+"@"+dotwords -route_adress = words+"<"+adress+">" -mailbox = "("+adress+"|"+route_adress+")" -mailboxes = mailbox+r"?(,+"+mailbox+")*" +word = r"[-\w%']+" +words = r"[-\w%'\s]+" +dotwords = r"(%s(?:\.%s)*)" % (word,word) +adress = "%s@%s" % (dotwords, dotwords) +route_adress = "%s<%s>" % (words, adress) +mailbox = "(%s|%s)" % (adress, route_adress) +mailboxes = "%s?(,%s)*" % (mailbox, mailbox) # regular expression strings for RFC2368 compliant mailto: scanning header = word+"="+word -headers = "?"+header+"(&"+header+")* +headers = r"(?:\?%s(&%s)*)?" 
% (header, header) mailto = "^mailto:"+mailboxes+headers # compiled diff --git a/linkcheck/OutputReader.py b/linkcheck/OutputReader.py deleted file mode 100644 index d176b466..00000000 --- a/linkcheck/OutputReader.py +++ /dev/null @@ -1,95 +0,0 @@ -""" - Copyright (C) 2000 Bastian Kleineidam - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -""" -import string,re -import UrlData - -class ParseException(Exception): - pass - - -ws = re.compile("\s+") -regex_realUrl = re.compile("^Real URL.+") -regex_result = re.compile("^Result.+") -regex_base = re.compile("^Base.+") -regex_info = re.compile("^Info.+") -regex_warning = re.compile("^Warning.+") -regex_parentUrl = re.compile("^Parent URL.+") -regex_valid = re.compile("^Valid.*") - - -class OutputReader: - - def resetState(self): - self.urlName = None - self.parentName = None - self.baseRef = None - self.info = None - self.warning = None - self.result = None - self.linenumber = 0 - self.state = 0 - - def parse(self, file): - line = file.readline() - url = None - urls = [] - self.resetState() - - while line: - if ws.match(line): - if self.state>=2: - #append url - urldata = UrlData.GetUrlDataFrom(self.urlName, 0, - self.parentName, self.baseRef, self.linenumber) - if self.info: - urldata.setInfo(self.info) - if self.warning: - urldata.setWarning(self.info) - if 
OutputReader.regex_valid.match(self.result): - urldata.valid=1 - urldata.validString = self.result - else: - urldata.valid=0 - urldata.errorString = self.result - urls.append(urldata) - elif self.state: - raise ParseException, "No Real URL and Result keyword found" - self.resetState() - - elif regex_realUrl.match(line): - self.state = self.state+1 - self.urlName = string.strip(line[8:]) - elif regex_result.match(line): - self.state = self.state+1 - self.result = string.strip(line[6:]) - elif regex_info.match(line): - self.info = string.strip(line[4:]) - elif regex_base.match(line): - self.baseRef = string.strip(line[4:]) - elif regex_warning.match(line): - self.warning = string.strip(line[7:]) - elif regex_parentUrl.match(line): - self.parentName = string.strip(line[10:]) - if ',' in self.parentName: - self.parentName,self.linenumber = string.split(self.parentName,",",1) - else: - pass - - line = file.readline() - return urls - diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index 867ccfb7..330515b4 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -24,7 +24,7 @@ LinkTags = [("a", "href"), ("body", "background"), ("frame", "src"), ("link", "href"), - # + # ("meta", "url"), ("area", "href")] @@ -59,6 +59,7 @@ class UrlData: self.urlConnection = None self.extern = 1 self.data = None + self.html_comments = [] def setError(self, s): @@ -216,7 +217,7 @@ class UrlData: if not (anchor!="" and self.isHtml() and self.valid): return self.getContent() - for cur_anchor,line in self.searchInForTag(self.data, ("a", "name")): + for cur_anchor,line in self.searchInForTag("a", "name"): if cur_anchor == anchor: return self.setWarning("anchor #"+anchor+" not found") @@ -245,17 +246,35 @@ class UrlData: def getContent(self): - """Precondition: urlConnection is an opened URL. 
- """ + """Precondition: urlConnection is an opened URL.""" if not self.data: t = time.time() - self.data = StringUtil.stripHtmlComments(self.urlConnection.read()) + self.data = self.urlConnection.read() self.downloadtime = time.time() - t + self._init_html_comments() + return self.data + + + def _init_html_comments(self): + # if we find an URL inside HTML comments we ignore it + # so build a list of intervalls which are HTML comments + pattern = re.compile("<!--.*?-->") + index = 0 + while 1: + match = pattern.search(self.data, index) + if not match: break + index = match.end() + self.html_comments.append(match.span()) + + def _isInComment(self, index): + for low,high in self.html_comments: + if low < index and index < high: + return 1 + return 0 def checkContent(self, warningregex): - self.getContent() - match = warningregex.search(self.data) + match = warningregex.search(self.getContent()) if match: self.setWarning("Found '"+match.group()+"' in link contents") @@ -263,10 +282,8 @@ class UrlData: def parseUrl(self, config): Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\ str(self)+"\n"+Config.DebugDelim) - self.getContent() - # search for a possible base reference - bases = self.searchInForTag(self.data, ("base", "href")) + bases = self.searchInForTag("base", "href") baseRef = None if len(bases)>=1: baseRef = bases[0][0] @@ -274,34 +291,30 @@ class UrlData: self.setWarning("more than one base tag found") # search for tags and add found tags to URL queue - for tag in LinkTags: - urls = self.searchInForTag(self.data, tag) - Config.debug("DEBUG: "+str(tag)+" urls="+str(urls)+"\n") - for _url,line in urls: - config.appendUrl(GetUrlDataFrom(_url, + for start,end in LinkTags: + urls = self.searchInForTag(start,end) + Config.debug("DEBUG: tag=%s %s, urls=%s\n" % (start,end,urls)) + for url,line in urls: + config.appendUrl(GetUrlDataFrom(url, self.recursionLevel+1, self.url, baseRef, line)) - def searchInForTag(self, data, tag): - _urls = [] - 
_prefix="<\s*"+tag[0]+"\s+[^>]*?"+tag[1]+"\s*=\s*" - _suffix="[^>]*>" - _patterns = [re.compile(_prefix+"\"([^\"]+)\""+_suffix, re.I), - re.compile(_prefix+"([^\s>]+)" +_suffix, re.I)] - cutofflines = 0 - for _pattern in _patterns: - while 1: - _match = _pattern.search(data) - if not _match: break - # need to strip optional ending quotes for the tag - linenumberbegin = StringUtil.getLineNumber(data, _match.start(0)) - linenumberend = StringUtil.getLineNumber(data, _match.end(0)) - cutofflines = cutofflines + linenumberend - linenumberbegin - _urls.append((string.strip(StringUtil.rstripQuotes(_match.group(1))), - linenumberbegin + cutofflines)) - data = data[:_match.start(0)] + data[_match.end(0):] - - return _urls + def searchInForTag(self, tag_start, tag_end): + urls = [] + prefix=r"<\s*"+tag_start+r"\s+[^>]*?"+tag_end+r"\s*=\s*" + suffix="[^>]*>" + pattern = re.compile(prefix+"([^\"\s>]+|\"[^\"]+\")"+suffix, re.I) + index = 0 + while 1: + match = pattern.search(self.getContent(), index) + if not match: break + index = match.end() + if self._isInComment(match.start()): continue + # need to strip optional ending quotes for the meta tag + urls.append((string.strip(StringUtil.stripQuotes(match.group(1))), + StringUtil.getLineNumber(self.getContent(), + match.start()))) + return urls def __str__(self): @@ -313,9 +326,9 @@ class UrlData: def _getUserPassword(self, config): - for rx, _user, _password in config["authentication"]: + for rx, user, password in config["authentication"]: if rx.match(self.url): - return _user, _password + return user, password from FileUrlData import FileUrlData @@ -341,23 +354,23 @@ def GetUrlDataFrom(urlName, elif parentName and ":" in parentName: name = string.lower(parentName) # test scheme - if re.compile("^http:").search(name): + if re.search("^http:", name): return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line) - if re.compile("^ftp:").search(name): + if re.search("^ftp:", name): return FtpUrlData(urlName, 
recursionLevel, parentName, baseRef, line) - if re.compile("^file:").search(name): + if re.search("^file:", name): return FileUrlData(urlName, recursionLevel, parentName, baseRef, line) - if re.compile("^telnet:").search(name): + if re.search("^telnet:", name): return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line) - if re.compile("^mailto:").search(name): + if re.search("^mailto:", name): return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line) - if re.compile("^gopher:").search(name): + if re.search("^gopher:", name): return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line) - if re.compile("^javascript:").search(name): + if re.search("^javascript:", name): return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line) - if re.compile("^https:").search(name): + if re.search("^https:", name): return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line) - if re.compile("^news:").search(name): + if re.search("^news:", name): return NntpUrlData(urlName, recursionLevel, parentName, baseRef, line) # assume local file return FileUrlData(urlName, recursionLevel, parentName, baseRef, line) diff --git a/linkcheck/__init__.py b/linkcheck/__init__.py index e1b1aa44..e4be09f6 100644 --- a/linkcheck/__init__.py +++ b/linkcheck/__init__.py @@ -30,7 +30,7 @@ try: except ImportError: def gettext(msg): return msg -import Config,UrlData,OutputReader,sys,lc_cgi +import Config,UrlData,sys,lc_cgi def checkUrls(config = Config.Configuration()): """ checkUrls gets a complete configuration object as parameter where all diff --git a/linkchecker.bat b/linkchecker.bat index 3e7bca0e..06e0d7a3 100644 --- a/linkchecker.bat +++ b/linkchecker.bat @@ -1,4 +1,4 @@ @echo off rem Limited to 9 parameters? Is there a $* for Windows? 
-python "c:\progra~1\linkchecker-1.2.3\linkchecker" %1 %2 %3 %4 %5 %6 %7 %8 %9 +python "@install_scripts@\linkchecker" %1 %2 %3 %4 %5 %6 %7 %8 %9 diff --git a/setup.py b/setup.py index 6b57dd64..fd9125c9 100755 --- a/setup.py +++ b/setup.py @@ -22,13 +22,11 @@ from Template import Template import sys,os,string # Autodetect the existence of an SSL library (this is pretty shitty) -# Autodetect Windows platforms to include the linkchecker.bat script class LCDistribution(Distribution): default_include_dirs = ['/usr/include/openssl', '/usr/local/include/openssl'] def run_commands (self): self.check_ssl() - self.check_windows() for cmd in self.commands: self.run_command (cmd) @@ -46,16 +44,6 @@ class LCDistribution(Distribution): "disabling SSL compilation.\n" "Use the -I option for the build_ext command.") - def check_windows(self): - if sys.platform=='win32': - inst = self.find_command_obj("install") - inst.ensure_ready() - t = Template("linkchecker.bat.tmpl") - f = open("linkchecker.bat","w") - f.write(t.fill_in({"path_to_linkchecker": inst.install_scripts})) - f.close() - self.scripts.append('linkchecker.bat') - def has_ssl(self): incls = self.find_command_obj("build_ext").include_dirs incls = (incls and string.split(incls, os.pathsep)) or [] @@ -89,5 +77,5 @@ o robots.txt exclusion protocol support """, distclass = LCDistribution, packages = ['','DNS','linkcheck'], - scripts = ['linkchecker'], + scripts = ['linkchecker', 'linkchecker.bat'], ) diff --git a/test/base2.html b/test/base2.html index 4400f784..bb7fbed1 100644 --- a/test/base2.html +++ b/test/base2.html @@ -1,3 +1,4 @@ +