From 83a0846fefbceb472047f29fbe948e249aed0095 Mon Sep 17 00:00:00 2001 From: calvin Date: Fri, 9 Feb 2001 00:59:44 +0000 Subject: [PATCH] link patterns git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@228 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- Makefile | 6 ++--- linkcheck/UrlData.py | 49 ++++++++++++++++++++++++++++----------- linkcheck/robotparser2.py | 44 ++++++++++++++++++++++++++--------- setup.py | 14 +++++------ 4 files changed, 78 insertions(+), 35 deletions(-) diff --git a/Makefile b/Makefile index 66a2012a..b82d6995 100644 --- a/Makefile +++ b/Makefile @@ -33,14 +33,14 @@ cleandeb: rm -f configure-stamp build-stamp dist: locale - fakeroot debian/rules binary # cleandeb because distutils choke on dangling symlinks # (linkchecker.1 -> undocumented.1) $(MAKE) cleandeb $(PYTHON) setup.py sdist --formats=gztar,zip bdist_rpm # extra run without SSL compilation - $(PYTHON) setup.py bdist_wininst - mv -f ../$(DEBPACKAGE) dist + python setup.py bdist_wininst + fakeroot dpkg-buildpackage -sgpg -pgpg + cp -f ../$(DEBPACKAGE) dist package: cd dist && dpkg-scanpackages . ../override.txt | gzip --best > Packages.gz diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index c21157f2..9cea90f5 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -53,24 +53,45 @@ _linkMatcher = r""" > # close tag """ +# ripped mainly from HTML::Tagset.pm LinkTags = ( - ("a", "href"), - ("img", "src"), - ("form", "action"), - ("body", "background"), - ("frame", "src"), - ("link", "href"), - ("meta", "url"), # - ("area", "href"), - ("script", "src"), + ("a", ["href"]), + ("applet", ["archive", "codebase", "src"]), + ("area", ["href"]), + ("bgsound", ["src"]), + ("blockquote", ["cite"]), + ("body", ["background"]), + ("del", ["cite"]), + ("embed", ["pluginspage", "src"]), + ("form", ["action"]), + ("frame", ["src", "longdesc"]), + ('head', ['profile']), + ("iframe", ["src", "longdesc"]), + ("ilayer", ["background"]), + ("img", ["src", "lowsrc", "longdesc", "usemap"]), + ('input', ['src', 'usemap']), + ('ins', ['cite']), + ('isindex', ['action']), + ('layer', ['background', 'src']), + ("link", ["href"]), + ("meta", ["url"]), # + ('object', ['classid', 'codebase', 'data', 'archive', 'usemap']), + ('q', ['cite']), + ('script', ['src', 'for']), + ('table', ['background']), + ('td', ['background']), + ('th', ['background']), + ('tr', ['background']), + ('xmp', ['href']), ) LinkPatterns = [] -for tag,attr in LinkTags: - LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr), - re.VERBOSE), - 'tag': tag, - 'attr': attr}) +for tag,attrs in LinkTags: + for attr in attrs: + LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr), + re.VERBOSE), + 'tag': tag, + 'attr': attr}) AnchorPattern = { 'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE), 'tag': 'a', diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py index 3ed3f09f..3506de6a 100755 --- a/linkcheck/robotparser2.py +++ b/linkcheck/robotparser2.py @@ -11,6 +11,8 @@ """ import re,string,urlparse,urllib +__all__ = ['RobotFileParser'] + debug = 0 def _debug(msg): @@ -180,12 +182,15 @@ class Entry: def applies_to(self, useragent): """check if this entry applies to the specified agent""" - useragent = string.lower(useragent) + # split the name token and make it lower case + useragent = string.lower(string.split(useragent,"/")[0]) for agent in self.useragents: - if agent=='*' or useragent=='*': + if agent=='*': + # we have the catch-all agent return 1 agent = string.lower(agent) - if re.match(agent, useragent): + # don't forget to re.escape + if re.search(re.escape(useragent), agent): return 1 return 0 @@ -199,6 +204,11 @@ class Entry: return line.allowance return 1 +def _check(a,b): + if a!=b: + print "failed\n" + else: + print "ok\n" def _test(): global debug @@ -210,17 +220,29 @@ def _test(): rp.read() else: rp.parse(open(sys.argv[1]).readlines()) - print rp.can_fetch('*', 'http://www.musi-cal.com/') - print rp.can_fetch('CherryPickerSE', + # test for re.escape + _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1) + # this should match the first rule, which is a disallow + _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0) + # various cherry pickers + _check(rp.can_fetch('CherryPickerSE', 'http://www.musi-cal.com/cgi-bin/event-search' - '?city=San+Francisco') - print rp.can_fetch('CherryPickerSE/1.0', + '?city=San+Francisco'), 0) + _check(rp.can_fetch('CherryPickerSE/1.0', 'http://www.musi-cal.com/cgi-bin/event-search' - '?city=San+Francisco') - print rp.can_fetch('CherryPickerSE/1.5', + '?city=San+Francisco'), 0) + _check(rp.can_fetch('CherryPickerSE/1.5', 'http://www.musi-cal.com/cgi-bin/event-search' - '?city=San+Francisco') - print rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba') + '?city=San+Francisco'), 0) + # case sensitivity + _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0) + _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0) + # substring test + _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0) + # tests for catch-all * agent + _check(rp.can_fetch('spam', 'http://www.musi-cal.com/musician/me'), 0) + _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1) + _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1) if __name__ == '__main__': _test() diff --git a/setup.py b/setup.py index a12f3447..4ccf6f90 100755 --- a/setup.py +++ b/setup.py @@ -165,15 +165,15 @@ class MyDistribution(Distribution): raise SystemExit, "please run 'python setup.py config'" #self.announce("generating default configuration") #self.run_command('config') - import LinkCheckerConf + import linkcheckerConf if 'bdist_wininst' in self.commands and os.name!='nt': self.announce("bdist_wininst command found on non-Windows " - "platform. Disabling SSL compilation") - elif LinkCheckerConf.have_ssl: + "platform. Disabling SSL compilation") + elif linkcheckerConf.have_ssl: self.ext_modules = [Extension('ssl', ['ssl.c'], - include_dirs=LinkCheckerConf.ssl_include_dirs, - library_dirs=LinkCheckerConf.ssl_library_dirs, - libraries=LinkCheckerConf.libraries)] + include_dirs=linkcheckerConf.ssl_include_dirs, + library_dirs=linkcheckerConf.ssl_library_dirs, + libraries=linkcheckerConf.libraries)] def create_conf_file(self, directory, data=[]): @@ -203,7 +203,7 @@ setup (name = "linkchecker", url = "http://linkchecker.sourceforge.net/", licence = "GPL", long_description = -"""LinkChecker features +"""Linkchecker features o recursive checking o multithreading o output in colored or normal text, HTML, SQL, CSV or a sitemap