mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-26 09:04:44 +00:00
link pattern
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@116 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
78a72c71c6
commit
cd89f5e6a9
2 changed files with 69 additions and 29 deletions
|
|
@ -29,17 +29,36 @@ try:
|
|||
except ImportError:
|
||||
pass
|
||||
|
||||
LinkTags = [("a", "href"),
|
||||
("img", "src"),
|
||||
("form", "action"),
|
||||
("body", "background"),
|
||||
("frame", "src"),
|
||||
("link", "href"),
|
||||
# <meta http-equiv="refresh" content="x; url=...">
|
||||
("meta", "url"),
|
||||
("area", "href")]
|
||||
|
||||
_linkMatcher = r"""
|
||||
(?i) # case insensitive
|
||||
< # open tag
|
||||
\s* # whitespace
|
||||
%s # tag name
|
||||
\s+ # whitespace
|
||||
[^>]*? # skip leading attributes
|
||||
%s # attrib name
|
||||
\s* # whitespace
|
||||
= # equal sign
|
||||
\s* # whitespace
|
||||
(?P<value> # attribute value
|
||||
".*?" | # in double quotes
|
||||
'.*?' | # in single quotes
|
||||
[^\s>]+) # unquoted
|
||||
[^>]* # skip trailing attributes
|
||||
> # close tag
|
||||
"""
|
||||
|
||||
LinkPatterns = (
|
||||
re.compile(_linkMatcher % ("a", "href"), re.VERBOSE),
|
||||
re.compile(_linkMatcher % ("img", "src"), re.VERBOSE),
|
||||
re.compile(_linkMatcher % ("form", "action"), re.VERBOSE),
|
||||
re.compile(_linkMatcher % ("body", "background"), re.VERBOSE),
|
||||
re.compile(_linkMatcher % ("frame", "src"), re.VERBOSE),
|
||||
re.compile(_linkMatcher % ("link", "href"), re.VERBOSE),
|
||||
# <meta http-equiv="refresh" content="x; url=...">
|
||||
re.compile(_linkMatcher % ("meta", "url"), re.VERBOSE),
|
||||
re.compile(_linkMatcher % ("area", "href"), re.VERBOSE),
|
||||
)
|
||||
|
||||
class UrlData:
|
||||
"Representing a URL with additional information like validity etc"
|
||||
|
|
@ -290,7 +309,8 @@ class UrlData:
|
|||
Config.debug(Config.DebugDelim+"Parsing recursively into\n"+\
|
||||
str(self)+"\n"+Config.DebugDelim)
|
||||
# search for a possible base reference
|
||||
bases = self.searchInForTag("base", "href")
|
||||
bases = self.searchInForTag(re.compile(_linkMatcher % ("base",
|
||||
"href"), re.VERBOSE))
|
||||
baseRef = None
|
||||
if len(bases)>=1:
|
||||
baseRef = bases[0][0]
|
||||
|
|
@ -298,19 +318,15 @@ class UrlData:
|
|||
self.setWarning("more than one base tag found")
|
||||
|
||||
# search for tags and add found tags to URL queue
|
||||
for start,end in LinkTags:
|
||||
urls = self.searchInForTag(start,end)
|
||||
Config.debug("DEBUG: tag=%s %s, urls=%s\n" % (start,end,urls))
|
||||
for pattern in LinkPatterns:
|
||||
urls = self.searchInForTag(pattern)
|
||||
for url,line in urls:
|
||||
config.appendUrl(GetUrlDataFrom(url,
|
||||
self.recursionLevel+1, self.url, baseRef, line))
|
||||
|
||||
|
||||
def searchInForTag(self, tag_start, tag_end):
|
||||
def searchInForTag(self, pattern):
|
||||
urls = []
|
||||
prefix=r"<\s*"+tag_start+r"\s+[^>]*?"+tag_end+r"\s*=\s*"
|
||||
suffix="[^>]*>"
|
||||
pattern = re.compile(prefix+"([^\"\s>]+|\"[^\"]+\")"+suffix, re.I)
|
||||
index = 0
|
||||
while 1:
|
||||
match = pattern.search(self.getContent(), index)
|
||||
|
|
@ -318,7 +334,7 @@ class UrlData:
|
|||
index = match.end()
|
||||
if self._isInComment(match.start()): continue
|
||||
# need to strip optional ending quotes for the meta tag
|
||||
urls.append((string.strip(StringUtil.stripQuotes(match.group(1))),
|
||||
urls.append((string.strip(StringUtil.stripQuotes(match.group('value'))),
|
||||
StringUtil.getLineNumber(self.getContent(),
|
||||
match.start())))
|
||||
return urls
|
||||
|
|
|
|||
44
util1.py
44
util1.py
|
|
@ -1,5 +1,17 @@
|
|||
import sys,time,rotor,types
|
||||
|
||||
_curses = None
|
||||
try:
|
||||
from ncurses import curses
|
||||
_curses = curses
|
||||
except ImportError:
|
||||
try:
|
||||
import curses
|
||||
_curses = curses
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
_bs = [
|
||||
['\023\335\233\203\2323\016',
|
||||
'\023\335\233\215\324\244\016',
|
||||
|
|
@ -38,15 +50,11 @@ _3 = '\236\177\246\304\351F\203(\005z\375\220\324)\201\266z*j\342\344l\323\0325\
|
|||
_4 = '\222\360P\277\330\300\246\3670\256\303\223\036\311['
|
||||
|
||||
def abbuzze():
|
||||
try: import curses
|
||||
except ImportError:
|
||||
print "Sorry, this operating system can not wash clothes!"
|
||||
if not _curses:
|
||||
print "Sorry, this operating system can not wash clothes."
|
||||
return
|
||||
w = curses.initscr() # initialize the curses library
|
||||
curses.nonl() # tell curses not to do NL->CR/NL on output
|
||||
curses.noecho() # don't echo input
|
||||
curses.cbreak() # take input chars one at a time, no wait for \n
|
||||
curses.meta(1) # allow 8-bit chars
|
||||
w = _curses.initscr() # initialize the curses library
|
||||
config_curses()
|
||||
my,mx = w.getmaxyx()
|
||||
b = w.subwin(my-2, mx, 0, 0)
|
||||
s = w.subwin(my-2, 0)
|
||||
|
|
@ -63,7 +71,23 @@ def abbuzze():
|
|||
abspann(curses.newwin(8, 30, 0, 0))
|
||||
w.erase()
|
||||
w.refresh()
|
||||
curses.endwin()
|
||||
_curses.endwin()
|
||||
|
||||
def config_curses():
    """Configure the curses library through the module-level _curses handle.

    Turns off newline translation and input echo, switches to cbreak
    input and enables 8-bit characters.  If the curses module provides
    colour support, the colour system is started and, on terminals that
    have colours but cannot redefine them, colour pairs 1 to 4 are set to
    magenta on black.

    The caller in this file invokes it right after _curses.initscr().
    """
    _curses.nonl()            # tell curses not to do NL->CR/NL on output
    _curses.noecho()          # don't echo input
    _curses.cbreak()          # take input chars one at a time, no wait for \n
    _curses.meta(1)           # allow 8-bit chars
    # NOTE(review): the nesting below was reconstructed from a listing that
    # had lost its indentation -- confirm against the real file.
    if hasattr(_curses, "start_color"):
        _curses.start_color() # start the colour system
        if _curses.has_colors():
            if _curses.can_change_color():
                # nothing is configured for terminals with redefinable colours
                pass
            else:
                # Read the colour constants through the same _curses handle
                # as every other call, not through the bare module name.
                for pair in range(1, 5):
                    _curses.init_pair(pair, _curses.COLOR_MAGENTA,
                                      _curses.COLOR_BLACK)
|
||||
|
||||
|
||||
def waddemol(f):
|
||||
|
|
@ -136,7 +160,7 @@ if __name__=='__main__':
|
|||
try:
|
||||
abbuzze()
|
||||
except:
|
||||
curses.endwin()
|
||||
_curses.endwin()
|
||||
type, value = sys.exc_info()[:2]
|
||||
print type,value
|
||||
print "Sorry, your washing machine is broken!"
|
||||
|
|
|
|||
Loading…
Reference in a new issue