From ebb48fa3b8b39764af6654af5ecd0b74ca255fb8 Mon Sep 17 00:00:00 2001 From: calvin Date: Tue, 29 Mar 2005 11:31:17 +0000 Subject: [PATCH] use get_true method to ensure that urls are always a unicode string and not None git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2472 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/linkparse.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/linkcheck/linkparse.py b/linkcheck/linkparse.py index 18f31507..2be0e4a8 100644 --- a/linkcheck/linkparse.py +++ b/linkcheck/linkparse.py @@ -25,6 +25,7 @@ import linkcheck.linkname import linkcheck.log MAX_NAMELEN = 256 +unquote = linkcheck.strformat.unquote # ripped mainly from HTML::Tagset.pm LinkTags = { @@ -129,7 +130,7 @@ class MetaRobotsFinder (TagFinder): """ if tag == 'meta': if attrs.get('name') == 'robots': - val = attrs.get('content', u'').lower().split(u',') + val = attrs.get_true('content', u'').lower().split(u',') self.follow = u'nofollow' not in val self.index = u'noindex' not in val @@ -165,7 +166,7 @@ class LinkFinder (TagFinder): self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column()) if tag == "base" and not self.base_ref: - self.base_ref = attrs.get("href", u'') + self.base_ref = attrs.get_true("href", u'') tagattrs = self.tags.get(tag, []) tagattrs.extend(self.tags.get(None, [])) # eliminate duplicate tag attrs @@ -174,18 +175,17 @@ class LinkFinder (TagFinder): if attr not in attrs: continue if tag == "meta": - refresh = attrs.get('http-equiv', u'').lower() + refresh = attrs.get_true('http-equiv', u'').lower() if refresh != 'refresh': continue # name of this link name = self.get_link_name(tag, attrs, attr) # possible codebase if tag in ('applet', 'object'): - codebase = linkcheck.strformat.unquote( - attrs.get('codebase', u'')) + codebase = unquote(attrs.get_true('codebase', u'')) else: codebase = u'' - value = linkcheck.strformat.unquote(attrs[attr]) + value = unquote(attrs.get_true(attr, u'')) # add link to url list self.add_link(tag, attr, value, name, codebase) linkcheck.log.debug(linkcheck.LOG_CHECK, @@ -196,7 +196,7 @@ class LinkFinder (TagFinder): Parse attrs for link name. Return name of link. """ if tag == 'a' and attr == 'href': - name = linkcheck.strformat.unquote(attrs.get('title', u'')) + name = unquote(attrs.get_true('title', u'')) if not name: pos = self.parser.pos() # Look for name only up to MAX_NAMELEN characters from current @@ -205,9 +205,9 @@ class LinkFinder (TagFinder): data = data.decode(self.parser.encoding, "ignore") name = linkcheck.linkname.href_name(data) elif tag == 'img': - name = linkcheck.strformat.unquote(attrs.get('alt', u'')) + name = unquote(attrs.get_true('alt', u'')) if not name: - name = linkcheck.strformat.unquote(attrs.get('title', u'')) + name = unquote(attrs.get_true('title', u'')) else: name = u"" return name @@ -216,27 +216,28 @@ class LinkFinder (TagFinder): """ Add given url data to url list. """ + assert isinstance(tag, unicode), repr(tag) + assert isinstance(attr, unicode), repr(attr) + assert isinstance(name, unicode), repr(name) + assert isinstance(base, unicode), repr(base) + assert isinstance(url, unicode), repr(url) urls = [] # look for meta refresh - if tag == 'meta': + if tag == u'meta': mo = refresh_re.match(url) if mo: urls.append(mo.group("url")) - elif attr == 'style': + elif attr == u'style': for mo in css_url_re.finditer(url): u = mo.group("url") - urls.append(linkcheck.strformat.unquote(u, matching=True)) + urls.append(unquote(u, matching=True)) else: urls.append(url) if not urls: # no url found return for u in urls: - assert isinstance(tag, unicode), tag - assert isinstance(attr, unicode), attr - assert isinstance(u, unicode), u - assert isinstance(name, unicode), name - assert isinstance(base, unicode), base + assert isinstance(u, unicode), repr(u) linkcheck.log.debug(linkcheck.LOG_CHECK, u"LinkParser add link %s %s %s %s %s", tag, attr, u, name, base) self.urls.append((u, self.parser.last_lineno(),