mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-22 00:40:30 +00:00
See changelog
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@82 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
9b311e1e7a
commit
71b07ee8d8
12 changed files with 91 additions and 183 deletions
2
INSTALL
2
INSTALL
|
|
@ -6,7 +6,7 @@ You need Python >= 1.5.2
|
|||
You get Python from http://www.python.org
|
||||
|
||||
Optional packages:
|
||||
Distutils >= 0.8.1 from http://www.python.org/sigs/distutils-sig/
|
||||
Distutils >= 0.9 from http://www.python.org/sigs/distutils-sig/
|
||||
OpenSSL from http://www.openssl.org
|
||||
You will need Perl for Win32 (available from
|
||||
http://www.activestate.com/ActivePerl) if you want to install OpenSSL
|
||||
|
|
|
|||
20
Makefile
20
Makefile
|
|
@ -1,9 +1,9 @@
|
|||
VERSION=$(shell ./setup.py --version)
|
||||
HOST=treasure.calvinsplayground.de
|
||||
PROXY=
|
||||
#HOST=treasure.calvinsplayground.de
|
||||
#PROXY=
|
||||
#PROXY=-P$(HOST):5050
|
||||
#HOST=fsinfo.cs.uni-sb.de
|
||||
#PROXY=-Pwww-proxy.uni-sb.de:3128
|
||||
HOST=fsinfo.cs.uni-sb.de
|
||||
PROXY=-Pwww-proxy.uni-sb.de:3128
|
||||
PACKAGE = linkchecker
|
||||
DEBPACKAGE = $(PACKAGE)_$(VERSION)_i386.deb
|
||||
ALLPACKAGES = ../$(DEBPACKAGE)
|
||||
|
|
@ -13,22 +13,12 @@ TAR = tar
|
|||
ZIP = zip
|
||||
|
||||
all:
|
||||
@echo "run ./setup.py --help to see how to install"
|
||||
|
||||
clean:
|
||||
./setup.py clean --all
|
||||
rm -rf $(ALLPACKAGES) $(PACKAGE)-out.*
|
||||
|
||||
install:
|
||||
# ha! the root option finally made it into distutils
|
||||
./setup.py install --root=$(DESTDIR)
|
||||
# german translation mit Rezepten von Zlatko :)
|
||||
msgfmt -o linkcheck.mo linkcheck/linkcheck.po
|
||||
install -c -m 644 linkcheck.mo $(DESTDIR)/usr/share/locale/de/LC_MESSAGES/
|
||||
# remove following line if Distutils have script support
|
||||
#install -c -m 755 linkchecker $(DESTDIR)/usr/bin/
|
||||
install -c -m 644 linkcheckerrc $(DESTDIR)/etc/
|
||||
install -c -m 644 DNS/README $(DESTDIR)/usr/share/doc/$(PACKAGE)/README.dns
|
||||
|
||||
dist:
|
||||
./setup.py sdist
|
||||
fakeroot debian/rules binary
|
||||
|
|
|
|||
4
debian/changelog
vendored
4
debian/changelog
vendored
|
|
@ -13,8 +13,10 @@ linkchecker (1.2.3) unstable; urgency=low
|
|||
* i18n support and german translation of the logger outputs
|
||||
* use http_proxy environment variable if present
|
||||
* be more RFC822 and RFC2368 compliant when scanning mail syntax
|
||||
* fix for incorrect line number in logger output (reported by Michael
|
||||
Schmitz)
|
||||
|
||||
-- Bastian Kleineidam <calvin@users.sourceforge.net> Tue, 9 May 2000 00:15:12 +0200
|
||||
-- Bastian Kleineidam <calvin@users.sourceforge.net> Tue, 16 May 2000 14:53:23 +0200
|
||||
|
||||
linkchecker (1.2.2) unstable; urgency=low
|
||||
|
||||
|
|
|
|||
10
debian/rules
vendored
10
debian/rules
vendored
|
|
@ -33,7 +33,15 @@ install: build
|
|||
dh_clean -k
|
||||
dh_installdirs
|
||||
# Add here commands to install the package into debian/tmp.
|
||||
$(MAKE) install DESTDIR=`pwd`/debian/tmp
|
||||
# ha! the root option finally made it into distutils
|
||||
./setup.py install --root=`pwd`/debian/tmp
|
||||
# german translation mit Rezepten von Zlatko :)
|
||||
msgfmt -o linkcheck.mo linkcheck/linkcheck.po
|
||||
install -c -m 644 linkcheck.mo debian/tmp/usr/share/locale/de/LC_MESSAGES/
|
||||
# remove following line if Distutils have script support
|
||||
#install -c -m 755 linkchecker debian/tmp/usr/bin/
|
||||
install -c -m 644 linkcheckerrc debian/tmp/etc/
|
||||
install -c -m 644 DNS/README debian/tmp/usr/share/doc/$(PACKAGE)/README.dns
|
||||
|
||||
|
||||
# Build architecture-independent files here.
|
||||
|
|
|
|||
|
|
@ -162,8 +162,9 @@ class HttpUrlData(UrlData):
|
|||
t = time.time()
|
||||
status, statusText, self.mime = self._getHttpRequest("GET")
|
||||
self.urlConnection = self.urlConnection.getfile()
|
||||
self.data = StringUtil.stripHtmlComments(self.urlConnection.read())
|
||||
self.data = self.urlConnection.read()
|
||||
self.downloadtime = time.time() - t
|
||||
self._init_html_comments()
|
||||
#Config.debug(Config.DebugDelim+self.data+Config.DebugDelim)
|
||||
|
||||
def isHtml(self):
|
||||
|
|
|
|||
|
|
@ -22,17 +22,17 @@ from UrlData import LinkCheckerException
|
|||
|
||||
# regular expression strings for partially RFC822 compliant address scanning
|
||||
# XXX far from complete mail address scanning; enhance only when needed!
|
||||
word = r"[\w\-%']+"
|
||||
words = r"[\w\-%'\s]+"
|
||||
dotwords = "("+word+r"(?:\."+word+")*)
|
||||
adress = dotwords+"@"+dotwords
|
||||
route_adress = words+"<"+adress+">"
|
||||
mailbox = "("+adress+"|"+route_adress+")"
|
||||
mailboxes = mailbox+r"?(,+"+mailbox+")*"
|
||||
word = r"[-\w%']+"
|
||||
words = r"[-\w%'\s]+"
|
||||
dotwords = r"(%s(?:\.%s)*)" % (word,word)
|
||||
adress = "%s@%s" % (dotwords, dotwords)
|
||||
route_adress = "%s<%s>" % (words, adress)
|
||||
mailbox = "(%s|%s)" % (adress, route_adress)
|
||||
mailboxes = "%s?(,%s)*" % (mailbox, mailbox)
|
||||
|
||||
# regular expression strings for RFC2368 compliant mailto: scanning
|
||||
header = word+"="+word
|
||||
headers = "?"+header+"(&"+header+")*
|
||||
headers = r"(?:\?%s(&%s)*)?" % (header, header)
|
||||
mailto = "^mailto:"+mailboxes+headers
|
||||
|
||||
# compiled
|
||||
|
|
|
|||
|
|
@ -1,95 +0,0 @@
|
|||
"""
|
||||
Copyright (C) 2000 Bastian Kleineidam
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
"""
|
||||
import string,re
|
||||
import UrlData
|
||||
|
||||
class ParseException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
ws = re.compile("\s+")
|
||||
regex_realUrl = re.compile("^Real URL.+")
|
||||
regex_result = re.compile("^Result.+")
|
||||
regex_base = re.compile("^Base.+")
|
||||
regex_info = re.compile("^Info.+")
|
||||
regex_warning = re.compile("^Warning.+")
|
||||
regex_parentUrl = re.compile("^Parent URL.+")
|
||||
regex_valid = re.compile("^Valid.*")
|
||||
|
||||
|
||||
class OutputReader:
|
||||
|
||||
def resetState(self):
|
||||
self.urlName = None
|
||||
self.parentName = None
|
||||
self.baseRef = None
|
||||
self.info = None
|
||||
self.warning = None
|
||||
self.result = None
|
||||
self.linenumber = 0
|
||||
self.state = 0
|
||||
|
||||
def parse(self, file):
|
||||
line = file.readline()
|
||||
url = None
|
||||
urls = []
|
||||
self.resetState()
|
||||
|
||||
while line:
|
||||
if ws.match(line):
|
||||
if self.state>=2:
|
||||
#append url
|
||||
urldata = UrlData.GetUrlDataFrom(self.urlName, 0,
|
||||
self.parentName, self.baseRef, self.linenumber)
|
||||
if self.info:
|
||||
urldata.setInfo(self.info)
|
||||
if self.warning:
|
||||
urldata.setWarning(self.info)
|
||||
if OutputReader.regex_valid.match(self.result):
|
||||
urldata.valid=1
|
||||
urldata.validString = self.result
|
||||
else:
|
||||
urldata.valid=0
|
||||
urldata.errorString = self.result
|
||||
urls.append(urldata)
|
||||
elif self.state:
|
||||
raise ParseException, "No Real URL and Result keyword found"
|
||||
self.resetState()
|
||||
|
||||
elif regex_realUrl.match(line):
|
||||
self.state = self.state+1
|
||||
self.urlName = string.strip(line[8:])
|
||||
elif regex_result.match(line):
|
||||
self.state = self.state+1
|
||||
self.result = string.strip(line[6:])
|
||||
elif regex_info.match(line):
|
||||
self.info = string.strip(line[4:])
|
||||
elif regex_base.match(line):
|
||||
self.baseRef = string.strip(line[4:])
|
||||
elif regex_warning.match(line):
|
||||
self.warning = string.strip(line[7:])
|
||||
elif regex_parentUrl.match(line):
|
||||
self.parentName = string.strip(line[10:])
|
||||
if ',' in self.parentName:
|
||||
self.parentName,self.linenumber = string.split(self.parentName,",",1)
|
||||
else:
|
||||
pass
|
||||
|
||||
line = file.readline()
|
||||
return urls
|
||||
|
||||
|
|
@ -24,7 +24,7 @@ LinkTags = [("a", "href"),
|
|||
("body", "background"),
|
||||
("frame", "src"),
|
||||
("link", "href"),
|
||||
# <meta http-equiv="refresh" content="5; url=...">
|
||||
# <meta http-equiv="refresh" content="x; url=...">
|
||||
("meta", "url"),
|
||||
("area", "href")]
|
||||
|
||||
|
|
@ -59,6 +59,7 @@ class UrlData:
|
|||
self.urlConnection = None
|
||||
self.extern = 1
|
||||
self.data = None
|
||||
self.html_comments = []
|
||||
|
||||
|
||||
def setError(self, s):
|
||||
|
|
@ -216,7 +217,7 @@ class UrlData:
|
|||
if not (anchor!="" and self.isHtml() and self.valid):
|
||||
return
|
||||
self.getContent()
|
||||
for cur_anchor,line in self.searchInForTag(self.data, ("a", "name")):
|
||||
for cur_anchor,line in self.searchInForTag("a", "name"):
|
||||
if cur_anchor == anchor:
|
||||
return
|
||||
self.setWarning("anchor #"+anchor+" not found")
|
||||
|
|
@ -245,17 +246,35 @@ class UrlData:
|
|||
|
||||
|
||||
def getContent(self):
|
||||
"""Precondition: urlConnection is an opened URL.
|
||||
"""
|
||||
"""Precondition: urlConnection is an opened URL."""
|
||||
if not self.data:
|
||||
t = time.time()
|
||||
self.data = StringUtil.stripHtmlComments(self.urlConnection.read())
|
||||
self.data = self.urlConnection.read()
|
||||
self.downloadtime = time.time() - t
|
||||
self._init_html_comments()
|
||||
return self.data
|
||||
|
||||
|
||||
def _init_html_comments(self):
|
||||
# if we find a URL inside HTML comments we ignore it
|
||||
# so build a list of intervals which are HTML comments
|
||||
pattern = re.compile("<!--.*?-->")
|
||||
index = 0
|
||||
while 1:
|
||||
match = pattern.search(self.data, index)
|
||||
if not match: break
|
||||
index = match.end()
|
||||
self.html_comments.append(match.span())
|
||||
|
||||
def _isInComment(self, index):
|
||||
for low,high in self.html_comments:
|
||||
if low < index and index < high:
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
def checkContent(self, warningregex):
|
||||
self.getContent()
|
||||
match = warningregex.search(self.data)
|
||||
match = warningregex.search(self.getContent())
|
||||
if match:
|
||||
self.setWarning("Found '"+match.group()+"' in link contents")
|
||||
|
||||
|
|
@ -263,10 +282,8 @@ class UrlData:
|
|||
def parseUrl(self, config):
|
||||
Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\
|
||||
str(self)+"\n"+Config.DebugDelim)
|
||||
self.getContent()
|
||||
|
||||
# search for a possible base reference
|
||||
bases = self.searchInForTag(self.data, ("base", "href"))
|
||||
bases = self.searchInForTag("base", "href")
|
||||
baseRef = None
|
||||
if len(bases)>=1:
|
||||
baseRef = bases[0][0]
|
||||
|
|
@ -274,34 +291,30 @@ class UrlData:
|
|||
self.setWarning("more than one base tag found")
|
||||
|
||||
# search for tags and add found tags to URL queue
|
||||
for tag in LinkTags:
|
||||
urls = self.searchInForTag(self.data, tag)
|
||||
Config.debug("DEBUG: "+str(tag)+" urls="+str(urls)+"\n")
|
||||
for _url,line in urls:
|
||||
config.appendUrl(GetUrlDataFrom(_url,
|
||||
for start,end in LinkTags:
|
||||
urls = self.searchInForTag(start,end)
|
||||
Config.debug("DEBUG: tag=%s %s, urls=%s\n" % (start,end,urls))
|
||||
for url,line in urls:
|
||||
config.appendUrl(GetUrlDataFrom(url,
|
||||
self.recursionLevel+1, self.url, baseRef, line))
|
||||
|
||||
|
||||
def searchInForTag(self, data, tag):
|
||||
_urls = []
|
||||
_prefix="<\s*"+tag[0]+"\s+[^>]*?"+tag[1]+"\s*=\s*"
|
||||
_suffix="[^>]*>"
|
||||
_patterns = [re.compile(_prefix+"\"([^\"]+)\""+_suffix, re.I),
|
||||
re.compile(_prefix+"([^\s>]+)" +_suffix, re.I)]
|
||||
cutofflines = 0
|
||||
for _pattern in _patterns:
|
||||
while 1:
|
||||
_match = _pattern.search(data)
|
||||
if not _match: break
|
||||
# need to strip optional ending quotes for the <meta url=> tag
|
||||
linenumberbegin = StringUtil.getLineNumber(data, _match.start(0))
|
||||
linenumberend = StringUtil.getLineNumber(data, _match.end(0))
|
||||
cutofflines = cutofflines + linenumberend - linenumberbegin
|
||||
_urls.append((string.strip(StringUtil.rstripQuotes(_match.group(1))),
|
||||
linenumberbegin + cutofflines))
|
||||
data = data[:_match.start(0)] + data[_match.end(0):]
|
||||
|
||||
return _urls
|
||||
def searchInForTag(self, tag_start, tag_end):
|
||||
urls = []
|
||||
prefix=r"<\s*"+tag_start+r"\s+[^>]*?"+tag_end+r"\s*=\s*"
|
||||
suffix="[^>]*>"
|
||||
pattern = re.compile(prefix+"([^\"\s>]+|\"[^\"]+\")"+suffix, re.I)
|
||||
index = 0
|
||||
while 1:
|
||||
match = pattern.search(self.getContent(), index)
|
||||
if not match: break
|
||||
index = match.end()
|
||||
if self._isInComment(match.start()): continue
|
||||
# need to strip optional ending quotes for the meta tag
|
||||
urls.append((string.strip(StringUtil.stripQuotes(match.group(1))),
|
||||
StringUtil.getLineNumber(self.getContent(),
|
||||
match.start())))
|
||||
return urls
|
||||
|
||||
|
||||
def __str__(self):
|
||||
|
|
@ -313,9 +326,9 @@ class UrlData:
|
|||
|
||||
|
||||
def _getUserPassword(self, config):
|
||||
for rx, _user, _password in config["authentication"]:
|
||||
for rx, user, password in config["authentication"]:
|
||||
if rx.match(self.url):
|
||||
return _user, _password
|
||||
return user, password
|
||||
|
||||
|
||||
from FileUrlData import FileUrlData
|
||||
|
|
@ -341,23 +354,23 @@ def GetUrlDataFrom(urlName,
|
|||
elif parentName and ":" in parentName:
|
||||
name = string.lower(parentName)
|
||||
# test scheme
|
||||
if re.compile("^http:").search(name):
|
||||
if re.search("^http:", name):
|
||||
return HttpUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^ftp:").search(name):
|
||||
if re.search("^ftp:", name):
|
||||
return FtpUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^file:").search(name):
|
||||
if re.search("^file:", name):
|
||||
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^telnet:").search(name):
|
||||
if re.search("^telnet:", name):
|
||||
return TelnetUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^mailto:").search(name):
|
||||
if re.search("^mailto:", name):
|
||||
return MailtoUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^gopher:").search(name):
|
||||
if re.search("^gopher:", name):
|
||||
return GopherUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^javascript:").search(name):
|
||||
if re.search("^javascript:", name):
|
||||
return JavascriptUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^https:").search(name):
|
||||
if re.search("^https:", name):
|
||||
return HttpsUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
if re.compile("^news:").search(name):
|
||||
if re.search("^news:", name):
|
||||
return NntpUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
# assume local file
|
||||
return FileUrlData(urlName, recursionLevel, parentName, baseRef, line)
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ try:
|
|||
except ImportError:
|
||||
def gettext(msg):
|
||||
return msg
|
||||
import Config,UrlData,OutputReader,sys,lc_cgi
|
||||
import Config,UrlData,sys,lc_cgi
|
||||
|
||||
def checkUrls(config = Config.Configuration()):
|
||||
""" checkUrls gets a complete configuration object as parameter where all
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
@echo off
|
||||
|
||||
rem Limited to 9 parameters? Is there a $* for Windows?
|
||||
python "c:\progra~1\linkchecker-1.2.3\linkchecker" %1 %2 %3 %4 %5 %6 %7 %8 %9
|
||||
python "@install_scripts@\linkchecker" %1 %2 %3 %4 %5 %6 %7 %8 %9
|
||||
|
|
|
|||
14
setup.py
14
setup.py
|
|
@ -22,13 +22,11 @@ from Template import Template
|
|||
import sys,os,string
|
||||
|
||||
# Autodetect the existence of an SSL library (this is pretty shitty)
|
||||
# Autodetect Windows platforms to include the linkchecker.bat script
|
||||
class LCDistribution(Distribution):
|
||||
default_include_dirs = ['/usr/include/openssl',
|
||||
'/usr/local/include/openssl']
|
||||
def run_commands (self):
|
||||
self.check_ssl()
|
||||
self.check_windows()
|
||||
for cmd in self.commands:
|
||||
self.run_command (cmd)
|
||||
|
||||
|
|
@ -46,16 +44,6 @@ class LCDistribution(Distribution):
|
|||
"disabling SSL compilation.\n"
|
||||
"Use the -I option for the build_ext command.")
|
||||
|
||||
def check_windows(self):
|
||||
if sys.platform=='win32':
|
||||
inst = self.find_command_obj("install")
|
||||
inst.ensure_ready()
|
||||
t = Template("linkchecker.bat.tmpl")
|
||||
f = open("linkchecker.bat","w")
|
||||
f.write(t.fill_in({"path_to_linkchecker": inst.install_scripts}))
|
||||
f.close()
|
||||
self.scripts.append('linkchecker.bat')
|
||||
|
||||
def has_ssl(self):
|
||||
incls = self.find_command_obj("build_ext").include_dirs
|
||||
incls = (incls and string.split(incls, os.pathsep)) or []
|
||||
|
|
@ -89,5 +77,5 @@ o robots.txt exclusion protocol support
|
|||
""",
|
||||
distclass = LCDistribution,
|
||||
packages = ['','DNS','linkcheck'],
|
||||
scripts = ['linkchecker'],
|
||||
scripts = ['linkchecker', 'linkchecker.bat'],
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
<!--comment-->
|
||||
<base href="file:/etc/">
|
||||
<a href="passwd">
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue