mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Simpler link parsing routine.
This commit is contained in:
parent
f180592cc4
commit
b6b5c7a12e
4 changed files with 35 additions and 39 deletions
|
|
@ -187,16 +187,18 @@ class LinkFinder (TagFinder):
|
|||
"""Find HTML links, and apply them to the callback function with the
|
||||
format (url, lineno, column, name, codebase)."""
|
||||
|
||||
def __init__ (self, callback, tags=None):
|
||||
def __init__ (self, callback, tags):
|
||||
"""Store content in buffer and initialize URL list."""
|
||||
super(LinkFinder, self).__init__()
|
||||
self.callback = callback
|
||||
if tags is None:
|
||||
self.tags = LinkTags
|
||||
else:
|
||||
self.tags = tags
|
||||
# set universal tag attributes using tagname None
|
||||
self.universal_attrs = set(tags.get(None, []))
|
||||
self.tags = dict()
|
||||
for tag, attrs in tags.items():
|
||||
self.tags[tag] = set(attrs)
|
||||
# add universal tag attributes
|
||||
self.tags[tag].update(self.universal_attrs)
|
||||
self.base_ref = u''
|
||||
log.debug(LOG_CHECK, "link finder")
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
"""Search for links and store found URLs in a list."""
|
||||
|
|
@ -204,15 +206,9 @@ class LinkFinder (TagFinder):
|
|||
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
|
||||
if tag == "base" and not self.base_ref:
|
||||
self.base_ref = unquote(attrs.get_true("href", u''))
|
||||
tagattrs = self.tags.get(tag, [])
|
||||
# add universal tag attributes using tagname None
|
||||
tagattrs.extend(self.tags.get(None, []))
|
||||
# eliminate duplicate tag attributes
|
||||
tagattrs = set(tagattrs)
|
||||
tagattrs = self.tags.get(tag, self.universal_attrs)
|
||||
# parse URLs in tag (possibly multiple URLs in CSS styles)
|
||||
for attr in tagattrs:
|
||||
if attr not in attrs:
|
||||
continue
|
||||
for attr in tagattrs.intersection(attrs):
|
||||
if tag == "meta" and not is_meta_url(attr, attrs):
|
||||
continue
|
||||
if tag == "form" and not is_form_get(attr, attrs):
|
||||
|
|
@ -252,31 +248,31 @@ class LinkFinder (TagFinder):
|
|||
name = u""
|
||||
return name
|
||||
|
||||
def parse_tag (self, tag, attr, url, name, base):
|
||||
def parse_tag (self, tag, attr, value, name, base):
|
||||
"""Add given url data to url list."""
|
||||
assert isinstance(tag, unicode), repr(tag)
|
||||
assert isinstance(attr, unicode), repr(attr)
|
||||
assert isinstance(name, unicode), repr(name)
|
||||
assert isinstance(base, unicode), repr(base)
|
||||
assert isinstance(url, unicode) or url is None, repr(url)
|
||||
urls = []
|
||||
assert isinstance(value, unicode) or value is None, repr(value)
|
||||
# look for meta refresh
|
||||
if tag == u'meta' and url:
|
||||
mo = refresh_re.match(url)
|
||||
if tag == u'meta' and value:
|
||||
mo = refresh_re.match(value)
|
||||
if mo:
|
||||
urls.append(mo.group("url"))
|
||||
self.found_url(mo.group("url"), name, base)
|
||||
elif attr != 'content':
|
||||
urls.append(url)
|
||||
elif attr == u'style' and url:
|
||||
for mo in css_url_re.finditer(url):
|
||||
u = mo.group("url")
|
||||
urls.append(unquote(u, matching=True))
|
||||
self.found_url(value, name, base)
|
||||
elif attr == u'style' and value:
|
||||
for mo in css_url_re.finditer(value):
|
||||
url = unquote(mo.group("url"), matching=True)
|
||||
self.found_url(url, name, base)
|
||||
elif attr == u'archive':
|
||||
urls.extend(url.split(u','))
|
||||
for url in value.split(u','):
|
||||
self.found_url(url, name, base)
|
||||
else:
|
||||
urls.append(url)
|
||||
for u in urls:
|
||||
assert isinstance(u, unicode) or u is None, repr(u)
|
||||
log.debug(LOG_CHECK, u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
|
||||
self.callback(u, self.parser.last_lineno(),
|
||||
self.parser.last_column(), name, base)
|
||||
self.found_url(value, name, base)
|
||||
|
||||
def found_url(self, url, name, base):
|
||||
assert isinstance(url, unicode) or url is None, repr(url)
|
||||
self.callback(url, self.parser.last_lineno(),
|
||||
self.parser.last_column(), name, base)
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ def parse_html (url_data):
|
|||
"""Parse into HTML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
find_links(url_data, url_data.add_url)
|
||||
find_links(url_data, url_data.add_url, linkparse.LinkTags)
|
||||
|
||||
|
||||
def parse_opera (url_data):
|
||||
|
|
@ -126,7 +126,7 @@ def parse_wml (url_data):
|
|||
"""Parse into WML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
find_links(url_data, url_data.add_url, tags=linkparse.WmlTags)
|
||||
find_links(url_data, url_data.add_url, linkparse.WmlTags)
|
||||
|
||||
|
||||
def get_temp_filename (content):
|
||||
|
|
@ -141,12 +141,12 @@ def get_temp_filename (content):
|
|||
return filename
|
||||
|
||||
|
||||
def find_links (url_data, callback, tags=None):
|
||||
def find_links (url_data, callback, tags):
|
||||
"""Parse into content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
# construct parser object
|
||||
handler = linkparse.LinkFinder(callback, tags=tags)
|
||||
handler = linkparse.LinkFinder(callback, tags)
|
||||
parser = htmlsax.parser(handler)
|
||||
if url_data.charset:
|
||||
parser.encoding = url_data.charset
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ class AnchorCheck(_ContentPlugin):
|
|||
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
|
||||
# list of parsed anchors
|
||||
self.anchors = []
|
||||
find_links(url_data, self.add_anchor, tags=linkparse.AnchorTags)
|
||||
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
|
||||
self.check_anchor(url_data)
|
||||
|
||||
def add_anchor (self, url, line, column, name, base):
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ class TestLinkparser (unittest.TestCase):
|
|||
|
||||
def _test_one_link (self, content, url):
|
||||
self.count_url = 0
|
||||
h = linkparse.LinkFinder(self._test_one_url(url))
|
||||
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
h.parser = p
|
||||
try:
|
||||
|
|
@ -52,7 +52,7 @@ class TestLinkparser (unittest.TestCase):
|
|||
def _test_no_link (self, content):
|
||||
def callback (url, line, column, name, base):
|
||||
self.assertTrue(False, 'URL %r found' % url)
|
||||
h = linkparse.LinkFinder(callback)
|
||||
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
h.parser = p
|
||||
try:
|
||||
|
|
|
|||
Loading…
Reference in a new issue