Simpler link parsing routine.

This commit is contained in:
Bastian Kleineidam 2014-03-27 19:49:17 +01:00
parent f180592cc4
commit b6b5c7a12e
4 changed files with 35 additions and 39 deletions

View file

@@ -187,16 +187,18 @@ class LinkFinder (TagFinder):
"""Find HTML links, and apply them to the callback function with the
format (url, lineno, column, name, codebase)."""
def __init__ (self, callback, tags=None):
def __init__ (self, callback, tags):
"""Store content in buffer and initialize URL list."""
super(LinkFinder, self).__init__()
self.callback = callback
if tags is None:
self.tags = LinkTags
else:
self.tags = tags
# set universal tag attributes using tagname None
self.universal_attrs = set(tags.get(None, []))
self.tags = dict()
for tag, attrs in tags.items():
self.tags[tag] = set(attrs)
# add universal tag attributes
self.tags[tag].update(self.universal_attrs)
self.base_ref = u''
log.debug(LOG_CHECK, "link finder")
def start_element (self, tag, attrs):
"""Search for links and store found URLs in a list."""
@@ -204,15 +206,9 @@ class LinkFinder (TagFinder):
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
if tag == "base" and not self.base_ref:
self.base_ref = unquote(attrs.get_true("href", u''))
tagattrs = self.tags.get(tag, [])
# add universal tag attributes using tagname None
tagattrs.extend(self.tags.get(None, []))
# eliminate duplicate tag attributes
tagattrs = set(tagattrs)
tagattrs = self.tags.get(tag, self.universal_attrs)
# parse URLs in tag (possibly multiple URLs in CSS styles)
for attr in tagattrs:
if attr not in attrs:
continue
for attr in tagattrs.intersection(attrs):
if tag == "meta" and not is_meta_url(attr, attrs):
continue
if tag == "form" and not is_form_get(attr, attrs):
@@ -252,31 +248,31 @@ class LinkFinder (TagFinder):
name = u""
return name
def parse_tag (self, tag, attr, url, name, base):
def parse_tag (self, tag, attr, value, name, base):
"""Add given url data to url list."""
assert isinstance(tag, unicode), repr(tag)
assert isinstance(attr, unicode), repr(attr)
assert isinstance(name, unicode), repr(name)
assert isinstance(base, unicode), repr(base)
assert isinstance(url, unicode) or url is None, repr(url)
urls = []
assert isinstance(value, unicode) or value is None, repr(value)
# look for meta refresh
if tag == u'meta' and url:
mo = refresh_re.match(url)
if tag == u'meta' and value:
mo = refresh_re.match(value)
if mo:
urls.append(mo.group("url"))
self.found_url(mo.group("url"), name, base)
elif attr != 'content':
urls.append(url)
elif attr == u'style' and url:
for mo in css_url_re.finditer(url):
u = mo.group("url")
urls.append(unquote(u, matching=True))
self.found_url(value, name, base)
elif attr == u'style' and value:
for mo in css_url_re.finditer(value):
url = unquote(mo.group("url"), matching=True)
self.found_url(url, name, base)
elif attr == u'archive':
urls.extend(url.split(u','))
for url in value.split(u','):
self.found_url(url, name, base)
else:
urls.append(url)
for u in urls:
assert isinstance(u, unicode) or u is None, repr(u)
log.debug(LOG_CHECK, u"LinkParser found link %r %r %r %r %r", tag, attr, u, name, base)
self.callback(u, self.parser.last_lineno(),
self.parser.last_column(), name, base)
self.found_url(value, name, base)
def found_url(self, url, name, base):
assert isinstance(url, unicode) or url is None, repr(url)
self.callback(url, self.parser.last_lineno(),
self.parser.last_column(), name, base)

View file

@@ -40,7 +40,7 @@ def parse_html (url_data):
"""Parse into HTML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url)
find_links(url_data, url_data.add_url, linkparse.LinkTags)
def parse_opera (url_data):
@@ -126,7 +126,7 @@ def parse_wml (url_data):
"""Parse into WML content and search for URLs to check.
Found URLs are added to the URL queue.
"""
find_links(url_data, url_data.add_url, tags=linkparse.WmlTags)
find_links(url_data, url_data.add_url, linkparse.WmlTags)
def get_temp_filename (content):
@@ -141,12 +141,12 @@ def get_temp_filename (content):
return filename
def find_links (url_data, callback, tags=None):
def find_links (url_data, callback, tags):
"""Parse into content and search for URLs to check.
Found URLs are added to the URL queue.
"""
# construct parser object
handler = linkparse.LinkFinder(callback, tags=tags)
handler = linkparse.LinkFinder(callback, tags)
parser = htmlsax.parser(handler)
if url_data.charset:
parser.encoding = url_data.charset

View file

@@ -37,7 +37,7 @@ class AnchorCheck(_ContentPlugin):
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
# list of parsed anchors
self.anchors = []
find_links(url_data, self.add_anchor, tags=linkparse.AnchorTags)
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
self.check_anchor(url_data)
def add_anchor (self, url, line, column, name, base):

View file

@@ -30,7 +30,7 @@ class TestLinkparser (unittest.TestCase):
def _test_one_link (self, content, url):
self.count_url = 0
h = linkparse.LinkFinder(self._test_one_url(url))
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
p = linkcheck.HtmlParser.htmlsax.parser(h)
h.parser = p
try:
@@ -52,7 +52,7 @@ class TestLinkparser (unittest.TestCase):
def _test_no_link (self, content):
def callback (url, line, column, name, base):
self.assertTrue(False, 'URL %r found' % url)
h = linkparse.LinkFinder(callback)
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
p = linkcheck.HtmlParser.htmlsax.parser(h)
h.parser = p
try: