diff --git a/Makefile b/Makefile
index 66a2012a..b82d6995 100644
--- a/Makefile
+++ b/Makefile
@@ -33,14 +33,14 @@ cleandeb:
rm -f configure-stamp build-stamp
dist: locale
- fakeroot debian/rules binary
# cleandeb because distutils choke on dangling symlinks
# (linkchecker.1 -> undocumented.1)
$(MAKE) cleandeb
$(PYTHON) setup.py sdist --formats=gztar,zip bdist_rpm
# extra run without SSL compilation
- $(PYTHON) setup.py bdist_wininst
- mv -f ../$(DEBPACKAGE) dist
+ $(PYTHON) setup.py bdist_wininst
+ fakeroot dpkg-buildpackage -sgpg -pgpg
+ cp -f ../$(DEBPACKAGE) dist
package:
cd dist && dpkg-scanpackages . ../override.txt | gzip --best > Packages.gz
diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py
index c21157f2..9cea90f5 100644
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -53,24 +53,45 @@ _linkMatcher = r"""
> # close tag
"""
+# tags and their link-carrying attributes, taken mainly from Perl's HTML::Tagset.pm
LinkTags = (
- ("a", "href"),
- ("img", "src"),
- ("form", "action"),
- ("body", "background"),
- ("frame", "src"),
- ("link", "href"),
- ("meta", "url"), #
- ("area", "href"),
- ("script", "src"),
+ ("a", ["href"]),
+ ("applet", ["archive", "codebase", "src"]),
+ ("area", ["href"]),
+ ("bgsound", ["src"]),
+ ("blockquote", ["cite"]),
+ ("body", ["background"]),
+ ("del", ["cite"]),
+ ("embed", ["pluginspage", "src"]),
+ ("form", ["action"]),
+ ("frame", ["src", "longdesc"]),
+ ('head', ['profile']),
+ ("iframe", ["src", "longdesc"]),
+ ("ilayer", ["background"]),
+ ("img", ["src", "lowsrc", "longdesc", "usemap"]),
+ ('input', ['src', 'usemap']),
+ ('ins', ['cite']),
+ ('isindex', ['action']),
+ ('layer', ['background', 'src']),
+ ("link", ["href"]),
+ ("meta", ["url"]), #
+ ('object', ['classid', 'codebase', 'data', 'archive', 'usemap']),
+ ('q', ['cite']),
+ ('script', ['src', 'for']),
+ ('table', ['background']),
+ ('td', ['background']),
+ ('th', ['background']),
+ ('tr', ['background']),
+ ('xmp', ['href']),
)
LinkPatterns = []
-for tag,attr in LinkTags:
- LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr),
- re.VERBOSE),
- 'tag': tag,
- 'attr': attr})
+for tag,attrs in LinkTags:
+ for attr in attrs:
+ LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr),
+ re.VERBOSE),
+ 'tag': tag,
+ 'attr': attr})
AnchorPattern = {
'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE),
'tag': 'a',
diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py
index 3ed3f09f..3506de6a 100755
--- a/linkcheck/robotparser2.py
+++ b/linkcheck/robotparser2.py
@@ -11,6 +11,8 @@
"""
import re,string,urlparse,urllib
+__all__ = ['RobotFileParser']
+
debug = 0
def _debug(msg):
@@ -180,12 +182,15 @@ class Entry:
def applies_to(self, useragent):
"""check if this entry applies to the specified agent"""
- useragent = string.lower(useragent)
+ # split the name token and make it lower case
+ useragent = string.lower(string.split(useragent,"/")[0])
for agent in self.useragents:
- if agent=='*' or useragent=='*':
+ if agent=='*':
+ # we have the catch-all agent
return 1
agent = string.lower(agent)
- if re.match(agent, useragent):
+ # literal substring test; re.escape keeps regex metacharacters in the name (e.g. '*') from being interpreted
+ if re.search(re.escape(useragent), agent):
return 1
return 0
@@ -199,6 +204,11 @@ class Entry:
return line.allowance
return 1
+def _check(a,b):
+ if a!=b:
+ print "failed\n"
+ else:
+ print "ok\n"
def _test():
global debug
@@ -210,17 +220,29 @@ def _test():
rp.read()
else:
rp.parse(open(sys.argv[1]).readlines())
- print rp.can_fetch('*', 'http://www.musi-cal.com/')
- print rp.can_fetch('CherryPickerSE',
+ # test for re.escape
+ _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
+ # this should match the first rule, which is a disallow
+ _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
+ # various cherry pickers
+ _check(rp.can_fetch('CherryPickerSE',
'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco')
- print rp.can_fetch('CherryPickerSE/1.0',
+ '?city=San+Francisco'), 0)
+ _check(rp.can_fetch('CherryPickerSE/1.0',
'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco')
- print rp.can_fetch('CherryPickerSE/1.5',
+ '?city=San+Francisco'), 0)
+ _check(rp.can_fetch('CherryPickerSE/1.5',
'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco')
- print rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba')
+ '?city=San+Francisco'), 0)
+ # case sensitivity
+ _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
+ _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
+ # substring test
+ _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
+ # tests for catch-all * agent
+ _check(rp.can_fetch('spam', 'http://www.musi-cal.com/musician/me'), 0)
+ _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
+ _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
if __name__ == '__main__':
_test()
diff --git a/setup.py b/setup.py
index a12f3447..4ccf6f90 100755
--- a/setup.py
+++ b/setup.py
@@ -165,15 +165,15 @@ class MyDistribution(Distribution):
raise SystemExit, "please run 'python setup.py config'"
#self.announce("generating default configuration")
#self.run_command('config')
- import LinkCheckerConf
+ import linkcheckerConf
if 'bdist_wininst' in self.commands and os.name!='nt':
self.announce("bdist_wininst command found on non-Windows "
- "platform. Disabling SSL compilation")
- elif LinkCheckerConf.have_ssl:
+ "platform. Disabling SSL compilation")
+ elif linkcheckerConf.have_ssl:
self.ext_modules = [Extension('ssl', ['ssl.c'],
- include_dirs=LinkCheckerConf.ssl_include_dirs,
- library_dirs=LinkCheckerConf.ssl_library_dirs,
- libraries=LinkCheckerConf.libraries)]
+ include_dirs=linkcheckerConf.ssl_include_dirs,
+ library_dirs=linkcheckerConf.ssl_library_dirs,
+ libraries=linkcheckerConf.libraries)]
def create_conf_file(self, directory, data=[]):
@@ -203,7 +203,7 @@ setup (name = "linkchecker",
url = "http://linkchecker.sourceforge.net/",
licence = "GPL",
long_description =
-"""LinkChecker features
+"""Linkchecker features
o recursive checking
o multithreading
o output in colored or normal text, HTML, SQL, CSV or a sitemap