mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-14 19:31:02 +00:00
link patterns
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@228 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
9c87fedcf4
commit
83a0846fef
4 changed files with 78 additions and 35 deletions
6
Makefile
6
Makefile
|
|
@ -33,14 +33,14 @@ cleandeb:
|
|||
rm -f configure-stamp build-stamp
|
||||
|
||||
dist: locale
|
||||
fakeroot debian/rules binary
|
||||
# cleandeb because distutils choke on dangling symlinks
|
||||
# (linkchecker.1 -> undocumented.1)
|
||||
$(MAKE) cleandeb
|
||||
$(PYTHON) setup.py sdist --formats=gztar,zip bdist_rpm
|
||||
# extra run without SSL compilation
|
||||
$(PYTHON) setup.py bdist_wininst
|
||||
mv -f ../$(DEBPACKAGE) dist
|
||||
python setup.py bdist_wininst
|
||||
fakeroot dpkg-buildpackage -sgpg -pgpg
|
||||
cp -f ../$(DEBPACKAGE) dist
|
||||
|
||||
package:
|
||||
cd dist && dpkg-scanpackages . ../override.txt | gzip --best > Packages.gz
|
||||
|
|
|
|||
|
|
@ -53,24 +53,45 @@ _linkMatcher = r"""
|
|||
> # close tag
|
||||
"""
|
||||
|
||||
# ripped mainly from HTML::Tagset.pm
|
||||
LinkTags = (
|
||||
("a", "href"),
|
||||
("img", "src"),
|
||||
("form", "action"),
|
||||
("body", "background"),
|
||||
("frame", "src"),
|
||||
("link", "href"),
|
||||
("meta", "url"), # <meta http-equiv="refresh" content="x; url=...">
|
||||
("area", "href"),
|
||||
("script", "src"),
|
||||
("a", ["href"]),
|
||||
("applet", ["archive", "codebase", "src"]),
|
||||
("area", ["href"]),
|
||||
("bgsound", ["src"]),
|
||||
("blockquote", ["cite"]),
|
||||
("body", ["background"]),
|
||||
("del", ["cite"]),
|
||||
("embed", ["pluginspage", "src"]),
|
||||
("form", ["action"]),
|
||||
("frame", ["src", "longdesc"]),
|
||||
('head', ['profile']),
|
||||
("iframe", ["src", "longdesc"]),
|
||||
("ilayer", ["background"]),
|
||||
("img", ["src", "lowsrc", "longdesc", "usemap"]),
|
||||
('input', ['src', 'usemap']),
|
||||
('ins', ['cite']),
|
||||
('isindex', ['action']),
|
||||
('layer', ['background', 'src']),
|
||||
("link", ["href"]),
|
||||
("meta", ["url"]), # <meta http-equiv="refresh" content="x; url=...">
|
||||
('object', ['classid', 'codebase', 'data', 'archive', 'usemap']),
|
||||
('q', ['cite']),
|
||||
('script', ['src', 'for']),
|
||||
('table', ['background']),
|
||||
('td', ['background']),
|
||||
('th', ['background']),
|
||||
('tr', ['background']),
|
||||
('xmp', ['href']),
|
||||
)
|
||||
|
||||
LinkPatterns = []
|
||||
for tag,attr in LinkTags:
|
||||
LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr),
|
||||
re.VERBOSE),
|
||||
'tag': tag,
|
||||
'attr': attr})
|
||||
for tag,attrs in LinkTags:
|
||||
for attr in attrs:
|
||||
LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr),
|
||||
re.VERBOSE),
|
||||
'tag': tag,
|
||||
'attr': attr})
|
||||
AnchorPattern = {
|
||||
'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE),
|
||||
'tag': 'a',
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@
|
|||
"""
|
||||
import re,string,urlparse,urllib
|
||||
|
||||
__all__ = ['RobotFileParser']
|
||||
|
||||
debug = 0
|
||||
|
||||
def _debug(msg):
|
||||
|
|
@ -180,12 +182,15 @@ class Entry:
|
|||
|
||||
def applies_to(self, useragent):
|
||||
"""check if this entry applies to the specified agent"""
|
||||
useragent = string.lower(useragent)
|
||||
# split the name token and make it lower case
|
||||
useragent = string.lower(string.split(useragent,"/")[0])
|
||||
for agent in self.useragents:
|
||||
if agent=='*' or useragent=='*':
|
||||
if agent=='*':
|
||||
# we have the catch-all agent
|
||||
return 1
|
||||
agent = string.lower(agent)
|
||||
if re.match(agent, useragent):
|
||||
# don't forget to re.escape
|
||||
if re.search(re.escape(useragent), agent):
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
|
@ -199,6 +204,11 @@ class Entry:
|
|||
return line.allowance
|
||||
return 1
|
||||
|
||||
def _check(a,b):
|
||||
if a!=b:
|
||||
print "failed\n"
|
||||
else:
|
||||
print "ok\n"
|
||||
|
||||
def _test():
|
||||
global debug
|
||||
|
|
@ -210,17 +220,29 @@ def _test():
|
|||
rp.read()
|
||||
else:
|
||||
rp.parse(open(sys.argv[1]).readlines())
|
||||
print rp.can_fetch('*', 'http://www.musi-cal.com/')
|
||||
print rp.can_fetch('CherryPickerSE',
|
||||
# test for re.escape
|
||||
_check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
|
||||
# this should match the first rule, which is a disallow
|
||||
_check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
|
||||
# various cherry pickers
|
||||
_check(rp.can_fetch('CherryPickerSE',
|
||||
'http://www.musi-cal.com/cgi-bin/event-search'
|
||||
'?city=San+Francisco')
|
||||
print rp.can_fetch('CherryPickerSE/1.0',
|
||||
'?city=San+Francisco'), 0)
|
||||
_check(rp.can_fetch('CherryPickerSE/1.0',
|
||||
'http://www.musi-cal.com/cgi-bin/event-search'
|
||||
'?city=San+Francisco')
|
||||
print rp.can_fetch('CherryPickerSE/1.5',
|
||||
'?city=San+Francisco'), 0)
|
||||
_check(rp.can_fetch('CherryPickerSE/1.5',
|
||||
'http://www.musi-cal.com/cgi-bin/event-search'
|
||||
'?city=San+Francisco')
|
||||
print rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba')
|
||||
'?city=San+Francisco'), 0)
|
||||
# case sensitivity
|
||||
_check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
|
||||
_check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
|
||||
# substring test
|
||||
_check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
|
||||
# tests for catch-all * agent
|
||||
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/musician/me'), 0)
|
||||
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
|
||||
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
_test()
|
||||
|
|
|
|||
14
setup.py
14
setup.py
|
|
@ -165,15 +165,15 @@ class MyDistribution(Distribution):
|
|||
raise SystemExit, "please run 'python setup.py config'"
|
||||
#self.announce("generating default configuration")
|
||||
#self.run_command('config')
|
||||
import LinkCheckerConf
|
||||
import linkcheckerConf
|
||||
if 'bdist_wininst' in self.commands and os.name!='nt':
|
||||
self.announce("bdist_wininst command found on non-Windows "
|
||||
"platform. Disabling SSL compilation")
|
||||
elif LinkCheckerConf.have_ssl:
|
||||
"platform. Disabling SSL compilation")
|
||||
elif linkcheckerConf.have_ssl:
|
||||
self.ext_modules = [Extension('ssl', ['ssl.c'],
|
||||
include_dirs=LinkCheckerConf.ssl_include_dirs,
|
||||
library_dirs=LinkCheckerConf.ssl_library_dirs,
|
||||
libraries=LinkCheckerConf.libraries)]
|
||||
include_dirs=linkcheckerConf.ssl_include_dirs,
|
||||
library_dirs=linkcheckerConf.ssl_library_dirs,
|
||||
libraries=linkcheckerConf.libraries)]
|
||||
|
||||
|
||||
def create_conf_file(self, directory, data=[]):
|
||||
|
|
@ -203,7 +203,7 @@ setup (name = "linkchecker",
|
|||
url = "http://linkchecker.sourceforge.net/",
|
||||
licence = "GPL",
|
||||
long_description =
|
||||
"""LinkChecker features
|
||||
"""Linkchecker features
|
||||
o recursive checking
|
||||
o multithreading
|
||||
o output in colored or normal text, HTML, SQL, CSV or a sitemap
|
||||
|
|
|
|||
Loading…
Reference in a new issue