See changelog

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@82 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-05-16 15:31:19 +00:00
parent 9b311e1e7a
commit 71b07ee8d8
12 changed files with 91 additions and 183 deletions

View file

@@ -6,7 +6,7 @@ You need Python >= 1.5.2
You get Python from http://www.python.org
Optionally packages:
Distutils >= 0.8.1 from http://www.python.org/sigs/distutils-sig/
Distutils >= 0.9 from http://www.python.org/sigs/distutils-sig/
OpenSSL from http://www.openssl.org
You will need Perl for Win32 (available from
http://www.activestate.com/ActivePerl) if you want to install OpenSSL

View file

@@ -1,9 +1,9 @@
VERSION=$(shell ./setup.py --version)
HOST=treasure.calvinsplayground.de
PROXY=
#HOST=treasure.calvinsplayground.de
#PROXY=
#PROXY=-P$(HOST):5050
#HOST=fsinfo.cs.uni-sb.de
#PROXY=-Pwww-proxy.uni-sb.de:3128
HOST=fsinfo.cs.uni-sb.de
PROXY=-Pwww-proxy.uni-sb.de:3128
PACKAGE = linkchecker
DEBPACKAGE = $(PACKAGE)_$(VERSION)_i386.deb
ALLPACKAGES = ../$(DEBPACKAGE)
@@ -13,22 +13,12 @@ TAR = tar
ZIP = zip
all:
@echo "run ./setup.py --help to see how to install"
clean:
./setup.py clean --all
rm -rf $(ALLPACKAGES) $(PACKAGE)-out.*
install:
# ha! the root option finally made it into distutils
./setup.py install --root=$(DESTDIR)
# german translation mit Rezepten von Zlatko :)
msgfmt -o linkcheck.mo linkcheck/linkcheck.po
install -c -m 644 linkcheck.mo $(DESTDIR)/usr/share/locale/de/LC_MESSAGES/
# remove following line if Distutils have script support
#install -c -m 755 linkchecker $(DESTDIR)/usr/bin/
install -c -m 644 linkcheckerrc $(DESTDIR)/etc/
install -c -m 644 DNS/README $(DESTDIR)/usr/share/doc/$(PACKAGE)/README.dns
dist:
./setup.py sdist
fakeroot debian/rules binary

4
debian/changelog vendored
View file

@@ -13,8 +13,10 @@ linkchecker (1.2.3) unstable; urgency=low
* i18n support and german translation of the logger outputs
* use http_proxy environment variable if present
* be more RFC822 and RFC2368 compliant when scanning mail syntax
* fix for incorrect line number in logger output (reported by Michael
Schmitz)
-- Bastian Kleineidam <calvin@users.sourceforge.net> Tue, 9 May 2000 00:15:12 +0200
-- Bastian Kleineidam <calvin@users.sourceforge.net> Tue, 16 May 2000 14:53:23 +0200
linkchecker (1.2.2) unstable; urgency=low

10
debian/rules vendored
View file

@@ -33,7 +33,15 @@ install: build
dh_clean -k
dh_installdirs
# Add here commands to install the package into debian/tmp.
$(MAKE) install DESTDIR=`pwd`/debian/tmp
# ha! the root option finally made it into distutils
./setup.py install --root=`pwd`/debian/tmp
# german translation mit Rezepten von Zlatko :)
msgfmt -o linkcheck.mo linkcheck/linkcheck.po
install -c -m 644 linkcheck.mo debian/tmp/usr/share/locale/de/LC_MESSAGES/
# remove following line if Distutils have script support
#install -c -m 755 linkchecker debian/tmp/usr/bin/
install -c -m 644 linkcheckerrc debian/tmp/etc/
install -c -m 644 DNS/README debian/tmp/usr/share/doc/$(PACKAGE)/README.dns
# Build architecture-independent files here.

View file

@@ -162,8 +162,9 @@ class HttpUrlData(UrlData):
t = time.time()
status, statusText, self.mime = self._getHttpRequest("GET")
self.urlConnection = self.urlConnection.getfile()
self.data = StringUtil.stripHtmlComments(self.urlConnection.read())
self.data = self.urlConnection.read()
self.downloadtime = time.time() - t
self._init_html_comments()
#Config.debug(Config.DebugDelim+self.data+Config.DebugDelim)
def isHtml(self):

View file

@@ -22,17 +22,17 @@ from UrlData import LinkCheckerException
# regular expression strings for partially RFC822 compliant adress scanning
# XXX far from complete mail adress scanning; enhance only when needed!
word = r"[\w\-%']+"
words = r"[\w\-%'\s]+"
dotwords = "("+word+r"(?:\."+word+")*)"
adress = dotwords+"@"+dotwords
route_adress = words+"<"+adress+">"
mailbox = "("+adress+"|"+route_adress+")"
mailboxes = mailbox+r"?(,+"+mailbox+")*"
word = r"[-\w%']+"
words = r"[-\w%'\s]+"
dotwords = r"(%s(?:\.%s)*)" % (word,word)
adress = "%s@%s" % (dotwords, dotwords)
route_adress = "%s<%s>" % (words, adress)
mailbox = "(%s|%s)" % (adress, route_adress)
mailboxes = "%s?(,%s)*" % (mailbox, mailbox)
# regular expression strings for RFC2368 compliant mailto: scanning
header = word+"="+word
headers = "?"+header+"(&"+header+")*"
headers = r"(?:\?%s(&%s)*)?" % (header, header)
mailto = "^mailto:"+mailboxes+headers
# compiled

View file

@@ -1,95 +0,0 @@
"""
Copyright (C) 2000 Bastian Kleineidam
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
import string,re
import UrlData
class ParseException(Exception):
pass
ws = re.compile("\s+")
regex_realUrl = re.compile("^Real URL.+")
regex_result = re.compile("^Result.+")
regex_base = re.compile("^Base.+")
regex_info = re.compile("^Info.+")
regex_warning = re.compile("^Warning.+")
regex_parentUrl = re.compile("^Parent URL.+")
regex_valid = re.compile("^Valid.*")
class OutputReader:
def resetState(self):
self.urlName = None
self.parentName = None
self.baseRef = None
self.info = None
self.warning = None
self.result = None
self.linenumber = 0
self.state = 0
def parse(self, file):
line = file.readline()
url = None
urls = []
self.resetState()
while line:
if ws.match(line):
if self.state>=2:
#append url
urldata = UrlData.GetUrlDataFrom(self.urlName, 0,
self.parentName, self.baseRef, self.linenumber)
if self.info:
urldata.setInfo(self.info)
if self.warning:
urldata.setWarning(self.info)
if OutputReader.regex_valid.match(self.result):
urldata.valid=1
urldata.validString = self.result
else:
urldata.valid=0
urldata.errorString = self.result
urls.append(urldata)
elif self.state:
raise ParseException, "No Real URL and Result keyword found"
self.resetState()
elif regex_realUrl.match(line):
self.state = self.state+1
self.urlName = string.strip(line[8:])
elif regex_result.match(line):
self.state = self.state+1
self.result = string.strip(line[6:])
elif regex_info.match(line):
self.info = string.strip(line[4:])
elif regex_base.match(line):
self.baseRef = string.strip(line[4:])
elif regex_warning.match(line):
self.warning = string.strip(line[7:])
elif regex_parentUrl.match(line):
self.parentName = string.strip(line[10:])
if ',' in self.parentName:
self.parentName,self.linenumber = string.split(self.parentName,",",1)
else:
pass
line = file.readline()
return urls

View file

@@ -24,7 +24,7 @@ LinkTags = [("a", "href"),
("body", "background"),
("frame", "src"),
("link", "href"),
# <meta http-equiv="refresh" content="5; url=...">
# <meta http-equiv="refresh" content="x; url=...">
("meta", "url"),
("area", "href")]
@@ -59,6 +59,7 @@ class UrlData:
self.urlConnection = None
self.extern = 1
self.data = None
self.html_comments = []
def setError(self, s):
@@ -216,7 +217,7 @@ class UrlData:
if not (anchor!="" and self.isHtml() and self.valid):
return
self.getContent()
for cur_anchor,line in self.searchInForTag(self.data, ("a", "name")):
for cur_anchor,line in self.searchInForTag("a", "name"):
if cur_anchor == anchor:
return
self.setWarning("anchor #"+anchor+" not found")
@@ -245,17 +246,35 @@ class UrlData:
def getContent(self):
"""Precondition: urlConnection is an opened URL.
"""
"""Precondition: urlConnection is an opened URL."""
if not self.data:
t = time.time()
self.data = StringUtil.stripHtmlComments(self.urlConnection.read())
self.data = self.urlConnection.read()
self.downloadtime = time.time() - t
self._init_html_comments()
return self.data
def _init_html_comments(self):
# if we find an URL inside HTML comments we ignore it
# so build a list of intervalls which are HTML comments
pattern = re.compile("<!--.*?-->")
index = 0
while 1:
match = pattern.search(self.data, index)
if not match: break
index = match.end()
self.html_comments.append(match.span())
def _isInComment(self, index):
for low,high in self.html_comments:
if low < index and index < high:
return 1
return 0
def checkContent(self, warningregex):
self.getContent()
match = warningregex.search(self.data)
match = warningregex.search(self.getContent())
if match:
self.setWarning("Found '"+match.group()+"' in link contents")
@@ -263,10 +282,8 @@ class UrlData:
def parseUrl(self, config):
Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\
str(self)+"\n"+Config.DebugDelim)
self.getContent()
# search for a possible base reference
bases = self.searchInForTag(self.data, ("base", "href"))
bases = self.searchInForTag("base", "href")
baseRef = None
if len(bases)>=1:
baseRef = bases[0][0]
@@ -274,34 +291,30 @@ class UrlData:
self.setWarning("more than one base tag found")
# search for tags and add found tags to URL queue
for tag in LinkTags:
urls = self.searchInForTag(self.data, tag)
Config.debug("DEBUG: "+str(tag)+" urls="+str(urls)+"\n")
for _url,line in urls:
config.appendUrl(GetUrlDataFrom(_url,
for start,end in LinkTags:
urls = self.searchInForTag(start,end)
Config.debug("DEBUG: tag=%s %s, urls=%s\n" % (start,end,urls))
for url,line in urls:
config.appendUrl(GetUrlDataFrom(url,
self.recursionLevel+1, self.url, baseRef, line))
def searchInForTag(self, data, tag):
_urls = []
_prefix="<\s*"+tag[0]+"\s+[^>]*?"+tag[1]+"\s*=\s*"
_suffix="[^>]*>"
_patterns = [re.compile(_prefix+"\"([^\"]+)\""+_suffix, re.I),
re.compile(_prefix+"([^\s>]+)" +_suffix, re.I)]
cutofflines = 0
for _pattern in _patterns:
while 1:
_match = _pattern.search(data)
if not _match: break
# need to strip optional ending quotes for the <meta url=> tag
linenumberbegin = StringUtil.getLineNumber(data, _match.start(0))
linenumberend = StringUtil.getLineNumber(data, _match.end(0))
cutofflines = cutofflines + linenumberend - linenumberbegin
_urls.append((string.strip(StringUtil.rstripQuotes(_match.group(1))),
linenumberbegin + cutofflines))
data = data[:_match.start(0)] + data[_match.end(0):]
return _urls
def searchInForTag(self, tag_start, tag_end):
urls = []
prefix=r"<\s*"+tag_start+r"\s+[^>]*?"+tag_end+r"\s*=\s*"
suffix="[^>]*>"
pattern = re.compile(prefix+"([^\"\s>]+|\"[^\"]+\")"+suffix, re.I)
index = 0
while 1:
match = pattern.search(self.getContent(), index)
if not match: break
index = match.end()
if self._isInComment(match.start()): continue
# need to strip optional ending quotes for the meta tag
urls.append((string.strip(StringUtil.stripQuotes(match.group(1))),
StringUtil.getLineNumber(self.getContent(),
match.start())))
return urls
def __str__(self):
@@ -313,9 +326,9 @@ class UrlData:
def _getUserPassword(self, config):
for rx, _user, _password in config["authentication"]:
for rx, user, password in config["authentication"]:
if rx.match(self.url):
return _user, _password
return user, password
from FileUrlData import FileUrlData
@@ -341,23 +354,23 @@ def GetUrlDataFrom(urlName,
elif parentName and ":" in parentName:
name = string.lower(parentName)
# test scheme
if re.compile("^http:").search(name):
if re.search("^http:", name):
return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^ftp:").search(name):
if re.search("^ftp:", name):
return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^file:").search(name):
if re.search("^file:", name):
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^telnet:").search(name):
if re.search("^telnet:", name):
return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^mailto:").search(name):
if re.search("^mailto:", name):
return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^gopher:").search(name):
if re.search("^gopher:", name):
return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^javascript:").search(name):
if re.search("^javascript:", name):
return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^https:").search(name):
if re.search("^https:", name):
return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
if re.compile("^news:").search(name):
if re.search("^news:", name):
return NntpUrlData(urlName, recursionLevel, parentName, baseRef, line)
# assume local file
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)

View file

@@ -30,7 +30,7 @@ try:
except ImportError:
def gettext(msg):
return msg
import Config,UrlData,OutputReader,sys,lc_cgi
import Config,UrlData,sys,lc_cgi
def checkUrls(config = Config.Configuration()):
""" checkUrls gets a complete configuration object as parameter where all

View file

@@ -1,4 +1,4 @@
@echo off
rem Limited to 9 parameters? Is there a $* for Windows?
python "c:\progra~1\linkchecker-1.2.3\linkchecker" %1 %2 %3 %4 %5 %6 %7 %8 %9
python "@install_scripts@\linkchecker" %1 %2 %3 %4 %5 %6 %7 %8 %9

View file

@@ -22,13 +22,11 @@ from Template import Template
import sys,os,string
# Autodetect the existence of an SSL library (this is pretty shitty)
# Autodetect Windows platforms to include the linkchecker.bat script
class LCDistribution(Distribution):
default_include_dirs = ['/usr/include/openssl',
'/usr/local/include/openssl']
def run_commands (self):
self.check_ssl()
self.check_windows()
for cmd in self.commands:
self.run_command (cmd)
@@ -46,16 +44,6 @@ class LCDistribution(Distribution):
"disabling SSL compilation.\n"
"Use the -I option for the build_ext command.")
def check_windows(self):
if sys.platform=='win32':
inst = self.find_command_obj("install")
inst.ensure_ready()
t = Template("linkchecker.bat.tmpl")
f = open("linkchecker.bat","w")
f.write(t.fill_in({"path_to_linkchecker": inst.install_scripts}))
f.close()
self.scripts.append('linkchecker.bat')
def has_ssl(self):
incls = self.find_command_obj("build_ext").include_dirs
incls = (incls and string.split(incls, os.pathsep)) or []
@@ -89,5 +77,5 @@ o robots.txt exclusion protocol support
""",
distclass = LCDistribution,
packages = ['','DNS','linkcheck'],
scripts = ['linkchecker'],
scripts = ['linkchecker', 'linkchecker.bat'],
)

View file

@@ -1,3 +1,4 @@
<!--comment-->
<base href="file:/etc/">
<a href="passwd">