use new parser interface

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1203 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-13 00:53:11 +00:00 · 2004-01-28 22:49:20 +00:00 · 2004-01-28 22:49:20 +00:00 · 44f5941552
commit 44f5941552
parent 66ecc466b7
2 changed files with 40 additions and 28 deletions
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -19,10 +19,11 @@
 import sys, re, urlparse, urllib2, time, traceback, socket, select, i18n
 from urllib import splituser, splitport, unquote
 from linkcheck import DNS, LinkCheckerError, getLinkPat
+from linkcheck.parser import htmlsax
 DNS.DiscoverNameServers()

 import Config, StringUtil, test_support
-from linkparse import LinkParser, MetaRobotsParser
+from linkparse import LinkFinder, MetaRobotsFinder
 from debug import *

 ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search
@@ -450,7 +451,12 @@ class UrlData (object):
    def contentAllowsRobots (self):
        if not self.isHtml():
            return True
-        return MetaRobotsParser(self.getContent()).follow
+        h = MetaRobotsFinder(self.getContent())
+        p = htmlsax.parser(h)
+        h.parser = p
+        p.feed(self.getContent())
+        p.flush()
+        return h.follow


    def checkAnchors (self):
@@ -459,7 +465,11 @@ class UrlData (object):
            # do not bother
            return
        debug(HURT_ME_PLENTY, "checking anchor", self.anchor)
-        h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']})
+        h = LinkFinder(self.getContent(), tags={'a': ['name'], None: ['id']})
+        p = htmlsax.parser(h)
+        h.parser = p
+        p.feed(self.getContent())
+        p.flush()
        for cur_anchor,line,column,name,base in h.urls:
            if cur_anchor == self.anchor:
                return
@@ -552,14 +562,22 @@ class UrlData (object):

    def parse_html (self):
        # search for a possible base reference
-        h = LinkParser(self.getContent(), tags={'base': ['href']})
+        h = LinkFinder(self.getContent(), tags={'base': ['href']})
+        p = htmlsax.parser(h)
+        h.parser = p
+        p.feed(self.getContent())
+        p.flush()
        baseRef = None
        if len(h.urls)>=1:
            baseRef = h.urls[0][0]
            if len(h.urls)>1:
                self.setWarning(i18n._(
                "more than one <base> tag found, using only the first one"))
-        h = LinkParser(self.getContent())
+        h = LinkFinder(self.getContent())
+        p = htmlsax.parser(h)
+        h.parser = p
+        p.feed(self.getContent())
+        p.flush()
        for s in h.parse_info:
            # the parser had warnings/errors
            self.setWarning(s)
--- a/linkcheck/linkparse.py
+++ b/linkcheck/linkparse.py
@@ -17,7 +17,6 @@

 import re, StringUtil, linkname
 from debug import *
-from linkcheck.parser.htmllib import HtmlParser

 # ripped mainly from HTML::Tagset.pm
 LinkTags = {
@@ -56,10 +55,9 @@ LinkTags = {
 _refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
 _css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")

-class TagParser (HtmlParser):
-    """basic parser class putting message in list"""
+class TagFinder (object):
+    """base class putting message in list"""
    def __init__ (self, content):
-        super(TagParser, self).__init__()
        self.content = content
        # warnings and errors during parsing
        self.parse_info = []
@@ -68,7 +66,7 @@ class TagParser (HtmlParser):
    def _errorfun (self, msg, name):
        """append msg to error list"""
        self.parse_info.append("%s at line %d col %d: %s" % \
-                (name, self.last_lineno(), self.last_column(), msg))
+            (name, self.parser.last_lineno(), self.parser.last_column(), msg))


    def error (self, msg):
@@ -86,15 +84,12 @@ class TagParser (HtmlParser):
        self._errorfun(msg, "fatal error")


-class MetaRobotsParser (TagParser):
-    """parser class for robots.txt meta values in HTML"""
+class MetaRobotsFinder (TagFinder):
+    """class for finding robots.txt meta values in HTML"""
    def __init__ (self, content):
-        super(MetaRobotsParser, self).__init__(content)
+        super(MetaRobotsFinder, self).__init__(content)
        self.follow = True
        self.index = True
-        self.feed(self.content)
-        debug(HURT_ME_PLENTY, "flushing")
-        self.flush()


    def startElement (self, tag, attrs):
@@ -105,25 +100,24 @@ class MetaRobotsParser (TagParser):
                self.index = 'noindex' not in val


-class LinkParser (TagParser):
-    """Parse the content for a list of links. After parsing, the urls
+class LinkFinder (TagFinder):
+    """find a list of links. After parsing, the urls
    will have a list of parsed links entries with the format
    (url, lineno, column, name, base)
    """

    def __init__ (self, content, tags=LinkTags):
-        super(LinkParser, self).__init__(content)
+        super(LinkFinder, self).__init__(content)
        self.tags = tags
        self.urls = []
-        self.feed(self.content)
-        debug(HURT_ME_PLENTY, "flushing")
-        self.flush()


    def startElement (self, tag, attrs):
-        debug(NIGHTMARE, "LinkParser tag", tag, "attrs", attrs)
-        debug(NIGHTMARE, "line", self.lineno(), "col", self.column(),
-              "old line", self.last_lineno(), "old col", self.last_column())
+        debug(NIGHTMARE, "LinkFinder tag", tag, "attrs", attrs)
+        debug(NIGHTMARE, "line", self.parser.lineno(),
+              "col", self.parser.column(),
+              "old line", self.parser.last_lineno(),
+              "old col", self.parser.last_column())
        tagattrs = self.tags.get(tag, [])
        tagattrs.extend(self.tags.get(None, []))
        for attr in tagattrs:
@@ -132,7 +126,7 @@ class LinkParser (TagParser):
                if tag=='a' and attr=='href':
                    name = StringUtil.unquote(attrs.get('title', ''))
                    if not name:
-                        name = linkname.href_name(self.content[self.pos():])
+                        name = linkname.href_name(self.content[self.parser.pos():])
                elif tag=='img':
                    name = StringUtil.unquote(attrs.get('alt', ''))
                    if not name:
@@ -166,6 +160,6 @@ class LinkParser (TagParser):
            return
        for u in urls:
            debug(NIGHTMARE, "LinkParser add link", tag, attr, u, name, base)
-            self.urls.append((u, self.last_lineno(), self.last_column(),
-                              name, base))
+            self.urls.append((u, self.parser.last_lineno(),
+                              self.parser.last_column(), name, base))