link patterns

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@228 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2001-02-09 00:59:44 +00:00
parent 9c87fedcf4
commit 83a0846fef
4 changed files with 78 additions and 35 deletions

View file

@@ -33,14 +33,14 @@ cleandeb:
rm -f configure-stamp build-stamp
dist: locale
fakeroot debian/rules binary
# cleandeb because distutils choke on dangling symlinks
# (linkchecker.1 -> undocumented.1)
$(MAKE) cleandeb
$(PYTHON) setup.py sdist --formats=gztar,zip bdist_rpm
# extra run without SSL compilation
$(PYTHON) setup.py bdist_wininst
mv -f ../$(DEBPACKAGE) dist
python setup.py bdist_wininst
fakeroot dpkg-buildpackage -sgpg -pgpg
cp -f ../$(DEBPACKAGE) dist
package:
cd dist && dpkg-scanpackages . ../override.txt | gzip --best > Packages.gz

View file

@ -53,24 +53,45 @@ _linkMatcher = r"""
> # close tag
"""
# ripped mainly from HTML::Tagset.pm
LinkTags = (
("a", "href"),
("img", "src"),
("form", "action"),
("body", "background"),
("frame", "src"),
("link", "href"),
("meta", "url"), # <meta http-equiv="refresh" content="x; url=...">
("area", "href"),
("script", "src"),
("a", ["href"]),
("applet", ["archive", "codebase", "src"]),
("area", ["href"]),
("bgsound", ["src"]),
("blockquote", ["cite"]),
("body", ["background"]),
("del", ["cite"]),
("embed", ["pluginspage", "src"]),
("form", ["action"]),
("frame", ["src", "longdesc"]),
('head', ['profile']),
("iframe", ["src", "longdesc"]),
("ilayer", ["background"]),
("img", ["src", "lowsrc", "longdesc", "usemap"]),
('input', ['src', 'usemap']),
('ins', ['cite']),
('isindex', ['action']),
('layer', ['background', 'src']),
("link", ["href"]),
("meta", ["url"]), # <meta http-equiv="refresh" content="x; url=...">
('object', ['classid', 'codebase', 'data', 'archive', 'usemap']),
('q', ['cite']),
('script', ['src', 'for']),
('table', ['background']),
('td', ['background']),
('th', ['background']),
('tr', ['background']),
('xmp', ['href']),
)
LinkPatterns = []
for tag,attr in LinkTags:
LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr),
re.VERBOSE),
'tag': tag,
'attr': attr})
for tag,attrs in LinkTags:
for attr in attrs:
LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr),
re.VERBOSE),
'tag': tag,
'attr': attr})
AnchorPattern = {
'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE),
'tag': 'a',

View file

@@ -11,6 +11,8 @@
"""
import re,string,urlparse,urllib
__all__ = ['RobotFileParser']
debug = 0
def _debug(msg):
@@ -180,12 +182,15 @@ class Entry:
def applies_to(self, useragent):
"""check if this entry applies to the specified agent"""
useragent = string.lower(useragent)
# split the name token and make it lower case
useragent = string.lower(string.split(useragent,"/")[0])
for agent in self.useragents:
if agent=='*' or useragent=='*':
if agent=='*':
# we have the catch-all agent
return 1
agent = string.lower(agent)
if re.match(agent, useragent):
# don't forget to re.escape
if re.search(re.escape(useragent), agent):
return 1
return 0
@@ -199,6 +204,11 @@ class Entry:
return line.allowance
return 1
def _check(a,b):
if a!=b:
print "failed\n"
else:
print "ok\n"
def _test():
global debug
@@ -210,17 +220,29 @@ def _test():
rp.read()
else:
rp.parse(open(sys.argv[1]).readlines())
print rp.can_fetch('*', 'http://www.musi-cal.com/')
print rp.can_fetch('CherryPickerSE',
# test for re.escape
_check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
# this should match the first rule, which is a disallow
_check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
# various cherry pickers
_check(rp.can_fetch('CherryPickerSE',
'http://www.musi-cal.com/cgi-bin/event-search'
'?city=San+Francisco')
print rp.can_fetch('CherryPickerSE/1.0',
'?city=San+Francisco'), 0)
_check(rp.can_fetch('CherryPickerSE/1.0',
'http://www.musi-cal.com/cgi-bin/event-search'
'?city=San+Francisco')
print rp.can_fetch('CherryPickerSE/1.5',
'?city=San+Francisco'), 0)
_check(rp.can_fetch('CherryPickerSE/1.5',
'http://www.musi-cal.com/cgi-bin/event-search'
'?city=San+Francisco')
print rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba')
'?city=San+Francisco'), 0)
# case sensitivity
_check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
_check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
# substring test
_check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
# tests for catch-all * agent
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/musician/me'), 0)
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
if __name__ == '__main__':
_test()

View file

@@ -165,15 +165,15 @@ class MyDistribution(Distribution):
raise SystemExit, "please run 'python setup.py config'"
#self.announce("generating default configuration")
#self.run_command('config')
import LinkCheckerConf
import linkcheckerConf
if 'bdist_wininst' in self.commands and os.name!='nt':
self.announce("bdist_wininst command found on non-Windows "
"platform. Disabling SSL compilation")
elif LinkCheckerConf.have_ssl:
"platform. Disabling SSL compilation")
elif linkcheckerConf.have_ssl:
self.ext_modules = [Extension('ssl', ['ssl.c'],
include_dirs=LinkCheckerConf.ssl_include_dirs,
library_dirs=LinkCheckerConf.ssl_library_dirs,
libraries=LinkCheckerConf.libraries)]
include_dirs=linkcheckerConf.ssl_include_dirs,
library_dirs=linkcheckerConf.ssl_library_dirs,
libraries=linkcheckerConf.libraries)]
def create_conf_file(self, directory, data=[]):
@ -203,7 +203,7 @@ setup (name = "linkchecker",
url = "http://linkchecker.sourceforge.net/",
licence = "GPL",
long_description =
"""LinkChecker features
"""Linkchecker features
o recursive checking
o multithreading
o output in colored or normal text, HTML, SQL, CSV or a sitemap