link pattern

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@116 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-06-17 21:25:44 +00:00
parent 78a72c71c6
commit cd89f5e6a9
2 changed files with 69 additions and 29 deletions

View file

@ -29,17 +29,36 @@ try:
except ImportError:
pass
# (tag name, attribute name) pairs; the tag-search loop unpacks each
# entry as ``start, end`` and looks for that attribute inside that tag.
LinkTags = [
    ("a", "href"),
    ("img", "src"),
    ("form", "action"),
    ("body", "background"),
    ("frame", "src"),
    ("link", "href"),
    # <meta http-equiv="refresh" content="x; url=...">
    ("meta", "url"),
    ("area", "href"),
]
# Verbose-mode regular expression template with two %s slots: first the
# tag name, then the attribute name.  The attribute value (still wrapped
# in its quotes, if it had any) is captured in the named group "value".
_linkMatcher = r"""
(?i) # case insensitive
< # open tag
\s* # whitespace
%s # tag name
\s+ # whitespace
[^>]*? # skip leading attributes
%s # attrib name
\s* # whitespace
= # equal sign
\s* # whitespace
(?P<value> # attribute value
".*?" | # in double quotes
'.*?' | # in single quotes
[^\s>]+) # unquoted
[^>]* # skip trailing attributes
> # close tag
"""

# One compiled pattern per (tag, attribute) pair, in this fixed order.
LinkPatterns = tuple(map(
    lambda pair: re.compile(_linkMatcher % pair, re.VERBOSE),
    (
        ("a", "href"),
        ("img", "src"),
        ("form", "action"),
        ("body", "background"),
        ("frame", "src"),
        ("link", "href"),
        # <meta http-equiv="refresh" content="x; url=...">
        ("meta", "url"),
        ("area", "href"),
    )
))
class UrlData:
"Representing a URL with additional information like validity etc"
@ -290,7 +309,8 @@ class UrlData:
Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\
str(self)+"\n"+Config.DebugDelim)
# search for a possible base reference
bases = self.searchInForTag("base", "href")
bases = self.searchInForTag(re.compile(_linkMatcher % ("base",
"href"), re.VERBOSE))
baseRef = None
if len(bases)>=1:
baseRef = bases[0][0]
@ -298,19 +318,15 @@ class UrlData:
self.setWarning("more than one base tag found")
# search for tags and add found tags to URL queue
for start,end in LinkTags:
urls = self.searchInForTag(start,end)
Config.debug("DEBUG: tag=%s %s, urls=%s\n" % (start,end,urls))
for pattern in LinkPatterns:
urls = self.searchInForTag(pattern)
for url,line in urls:
config.appendUrl(GetUrlDataFrom(url,
self.recursionLevel+1, self.url, baseRef, line))
def searchInForTag(self, tag_start, tag_end):
def searchInForTag(self, pattern):
urls = []
prefix=r"<\s*"+tag_start+r"\s+[^>]*?"+tag_end+r"\s*=\s*"
suffix="[^>]*>"
pattern = re.compile(prefix+"([^\"\s>]+|\"[^\"]+\")"+suffix, re.I)
index = 0
while 1:
match = pattern.search(self.getContent(), index)
@ -318,7 +334,7 @@ class UrlData:
index = match.end()
if self._isInComment(match.start()): continue
# need to strip optional ending quotes for the meta tag
urls.append((string.strip(StringUtil.stripQuotes(match.group(1))),
urls.append((string.strip(StringUtil.stripQuotes(match.group('value'))),
StringUtil.getLineNumber(self.getContent(),
match.start())))
return urls

View file

@ -1,5 +1,17 @@
import sys,time,rotor,types
# Locate a curses module once, at import time.  _curses stays None when
# neither import below succeeds, so other code can test it before use.
# Both import forms also bind the plain name "curses", which other code
# in this file still refers to directly (e.g. curses.newwin).
_curses = None
try:
# first choice: the curses module provided by the "ncurses" package
from ncurses import curses
_curses = curses
except ImportError:
try:
# fallback: a top-level "curses" module
import curses
_curses = curses
except ImportError:
# no curses module could be imported; _curses remains None
pass
_bs = [
['\023\335\233\203\2323\016',
'\023\335\233\215\324\244\016',
@ -38,15 +50,11 @@ _3 = '\236\177\246\304\351F\203(\005z\375\220\324)\201\266z*j\342\344l\323\0325\
_4 = '\222\360P\277\330\300\246\3670\256\303\223\036\311['
def abbuzze():
try: import curses
except ImportError:
print "Sorry, this operating system can not wash clothes!"
if not _curses:
print "Sorry, this operating system can not wash clothes."
return
w = curses.initscr() # initialize the curses library
curses.nonl() # tell curses not to do NL->CR/NL on output
curses.noecho() # don't echo input
curses.cbreak() # take input chars one at a time, no wait for \n
curses.meta(1) # allow 8-bit chars
w = _curses.initscr() # initialize the curses library
config_curses()
my,mx = w.getmaxyx()
b = w.subwin(my-2, mx, 0, 0)
s = w.subwin(my-2, 0)
@ -63,7 +71,23 @@ def abbuzze():
abspann(curses.newwin(8, 30, 0, 0))
w.erase()
w.refresh()
curses.endwin()
_curses.endwin()
def config_curses():
    """Switch the curses library into the terminal modes this program uses.

    Works on the module-level ``_curses`` module object; abbuzze() calls
    this right after ``_curses.initscr()``.  Newline translation and
    input echo are turned off, cbreak mode is turned on and 8-bit
    characters are allowed.  If the curses module offers colour support,
    the colour system is started; when the terminal has colours but
    cannot redefine them, colour pairs 1 to 4 are set to magenta on
    black.
    """
    _curses.nonl()      # tell curses not to do NL->CR/NL on output
    _curses.noecho()    # don't echo input
    _curses.cbreak()    # take input chars one at a time, no wait for \n
    _curses.meta(1)     # allow 8-bit chars

    # NOTE(review): the nesting below was reconstructed from text whose
    # indentation had been lost -- the colour checks are assumed to sit
    # inside the hasattr() guard, and the init_pair calls are assumed to
    # belong to the "cannot change colour" branch.  Confirm against the
    # original file.
    if not hasattr(_curses, "start_color"):
        # this curses module has no colour support at all
        return
    _curses.start_color()   # start the colour system
    if not _curses.has_colors():
        return
    if not _curses.can_change_color():
        # Look the colour constants up on _curses, the same module object
        # every other call in this function goes through, instead of
        # depending on the bare name "curses" also being bound.
        for pair in range(1, 5):
            _curses.init_pair(pair, _curses.COLOR_MAGENTA,
                              _curses.COLOR_BLACK)
def waddemol(f):
@ -136,7 +160,7 @@ if __name__=='__main__':
try:
abbuzze()
except:
curses.endwin()
_curses.endwin()
type, value = sys.exc_info()[:2]
print type,value
print "Sorry, your washing machine is broken!"