mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-30 11:04:50 +00:00
remember base href in link parser
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1918 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
607425065e
commit
6f488fd189
2 changed files with 42 additions and 45 deletions
|
|
@ -527,21 +527,6 @@ class UrlBase (object):
|
|||
"""Parse into HTML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
# search for a possible base reference
|
||||
h = linkcheck.linkparse.LinkFinder(self.get_content(),
|
||||
tags={'base': [u'href']})
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
h.parser = p
|
||||
p.feed(self.get_content())
|
||||
p.flush()
|
||||
h.parser = None
|
||||
p.handler = None
|
||||
base_ref = None
|
||||
if len(h.urls)>=1:
|
||||
base_ref = h.urls[0][0]
|
||||
if len(h.urls)>1:
|
||||
self.add_warning(_(
|
||||
"more than one <base> tag found, using only the first one"))
|
||||
h = linkcheck.linkparse.LinkFinder(self.get_content())
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
h.parser = p
|
||||
|
|
@ -554,12 +539,12 @@ class UrlBase (object):
|
|||
self.add_warning(s)
|
||||
for url, line, column, name, codebase in h.urls:
|
||||
if codebase:
|
||||
base = codebase
|
||||
base_ref = codebase
|
||||
else:
|
||||
base = base_ref
|
||||
base_ref = h.base_ref
|
||||
url_data = linkcheck.checker.get_url_from(url,
|
||||
self.recursion_level+1, self.consumer, parent_url=self.url,
|
||||
base_ref=base, line=line, column=column, name=name)
|
||||
base_ref=base_ref, line=line, column=column, name=name)
|
||||
self.consumer.append_url(url_data)
|
||||
|
||||
def parse_opera (self):
|
||||
|
|
|
|||
|
|
@ -101,6 +101,7 @@ class MetaRobotsFinder (TagFinder):
|
|||
super(MetaRobotsFinder, self).__init__(content)
|
||||
self.follow = True
|
||||
self.index = True
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "meta robots finder")
|
||||
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
|
|
@ -113,9 +114,9 @@ class MetaRobotsFinder (TagFinder):
|
|||
|
||||
|
||||
class LinkFinder (TagFinder):
|
||||
"""find a list of links. After parsing, the urls
|
||||
will have a list of parsed links entries with the format
|
||||
(url, lineno, column, name, base)
|
||||
"""Find a list of links. After parsing, self.urls
|
||||
will be a list of parsed links entries with the format
|
||||
(url, lineno, column, name, codebase)
|
||||
"""
|
||||
|
||||
def __init__ (self, content, tags=None):
|
||||
|
|
@ -126,6 +127,8 @@ class LinkFinder (TagFinder):
|
|||
else:
|
||||
self.tags = tags
|
||||
self.urls = []
|
||||
self.base_ref = u''
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK, "link finder")
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
"""search for links and store found URLs in a list"""
|
||||
|
|
@ -135,33 +138,42 @@ class LinkFinder (TagFinder):
|
|||
"line %d col %d old line %d old col %d",
|
||||
self.parser.lineno(), self.parser.column(),
|
||||
self.parser.last_lineno(), self.parser.last_column())
|
||||
if tag == "base" and not self.base:
|
||||
self.base_ref = attrs.get("href", u'')
|
||||
tagattrs = self.tags.get(tag, [])
|
||||
tagattrs.extend(self.tags.get(None, []))
|
||||
for attr in tagattrs:
|
||||
if attr in attrs:
|
||||
# name of this link
|
||||
if tag == 'a' and attr == 'href':
|
||||
name = linkcheck.strformat.unquote(
|
||||
attrs.get('title', u''))
|
||||
if not name:
|
||||
data = self.content[self.parser.pos():]
|
||||
data = data.decode(self.parser.encoding, "ignore")
|
||||
name = linkcheck.linkname.href_name(data)
|
||||
elif tag == 'img':
|
||||
name = linkcheck.strformat.unquote(attrs.get('alt', u''))
|
||||
if not name:
|
||||
name = linkcheck.strformat.unquote(
|
||||
attrs.get('title', u''))
|
||||
else:
|
||||
name = u""
|
||||
# possible codebase
|
||||
if tag in ('applet', 'object'):
|
||||
base = linkcheck.strformat.unquote(attrs.get('codebase', u''))
|
||||
else:
|
||||
base = u""
|
||||
value = linkcheck.strformat.unquote(attrs[attr])
|
||||
# add link to url list
|
||||
self.add_link(tag, attr, value, name, base)
|
||||
if attr not in attrs:
|
||||
continue
|
||||
# name of this link
|
||||
name = self.get_link_name(tag, attrs, attr)
|
||||
# possible codebase
|
||||
if tag in ('applet', 'object'):
|
||||
codebase = linkcheck.strformat.unquote(
|
||||
attrs.get('codebase', u''))
|
||||
else:
|
||||
codebase = u''
|
||||
value = linkcheck.strformat.unquote(attrs[attr])
|
||||
# add link to url list
|
||||
self.add_link(tag, attr, value, name, codebase)
|
||||
linkcheck.log.debug(linkcheck.LOG_CHECK,
|
||||
"LinkFinder finished tag %s", tag)
|
||||
|
||||
def get_link_name (self, tag, attrs, attr):
    """Return the display name for a link found in the given tag.

    tag is the element name, attrs its attribute mapping and attr the
    attribute that holds the URL.

    - <a href=...>: the unquoted title attribute; if that is empty,
      the name that linkcheck.linkname.href_name extracts from the
      content following the current parser position.
    - <img>: the unquoted alt attribute; if that is empty, the
      unquoted title attribute.
    - anything else: an empty unicode string.
    """
    if tag == 'a' and attr == 'href':
        title = linkcheck.strformat.unquote(attrs.get('title', u''))
        if title:
            return title
        # No usable title: fall back to the content after the current
        # parser position, decoded with the parser's encoding
        # (undecodable bytes are ignored).
        remainder = self.content[self.parser.pos():]
        remainder = remainder.decode(self.parser.encoding, "ignore")
        return linkcheck.linkname.href_name(remainder)
    if tag == 'img':
        alt = linkcheck.strformat.unquote(attrs.get('alt', u''))
        if alt:
            return alt
        return linkcheck.strformat.unquote(attrs.get('title', u''))
    # Other tags carry no link name.
    return u""
|
||||
|
||||
def add_link (self, tag, attr, url, name, base):
|
||||
"""add given url data to url list"""
|
||||
|
|
|
|||
Loading…
Reference in a new issue