diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py
index 20b47b81..38e81dd4 100644
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -19,10 +19,11 @@
import sys, re, urlparse, urllib2, time, traceback, socket, select, i18n
from urllib import splituser, splitport, unquote
from linkcheck import DNS, LinkCheckerError, getLinkPat
+from linkcheck.parser import htmlsax
DNS.DiscoverNameServers()
import Config, StringUtil, test_support
-from linkparse import LinkParser, MetaRobotsParser
+from linkparse import LinkFinder, MetaRobotsFinder
from debug import *
ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search
@@ -450,7 +451,12 @@ class UrlData (object):
def contentAllowsRobots (self):
if not self.isHtml():
return True
- return MetaRobotsParser(self.getContent()).follow
+ h = MetaRobotsFinder(self.getContent())
+ p = htmlsax.parser(h)
+ h.parser = p
+ p.feed(self.getContent())
+ p.flush()
+ return h.follow
def checkAnchors (self):
@@ -459,7 +465,11 @@ class UrlData (object):
# do not bother
return
debug(HURT_ME_PLENTY, "checking anchor", self.anchor)
- h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']})
+ h = LinkFinder(self.getContent(), tags={'a': ['name'], None: ['id']})
+ p = htmlsax.parser(h)
+ h.parser = p
+ p.feed(self.getContent())
+ p.flush()
for cur_anchor,line,column,name,base in h.urls:
if cur_anchor == self.anchor:
return
@@ -552,14 +562,22 @@ class UrlData (object):
def parse_html (self):
# search for a possible base reference
- h = LinkParser(self.getContent(), tags={'base': ['href']})
+ h = LinkFinder(self.getContent(), tags={'base': ['href']})
+ p = htmlsax.parser(h)
+ h.parser = p
+ p.feed(self.getContent())
+ p.flush()
baseRef = None
if len(h.urls)>=1:
baseRef = h.urls[0][0]
if len(h.urls)>1:
self.setWarning(i18n._(
"more than one <base> tag found, using only the first one"))
- h = LinkParser(self.getContent())
+ h = LinkFinder(self.getContent())
+ p = htmlsax.parser(h)
+ h.parser = p
+ p.feed(self.getContent())
+ p.flush()
for s in h.parse_info:
# the parser had warnings/errors
self.setWarning(s)
diff --git a/linkcheck/linkparse.py b/linkcheck/linkparse.py
index e08699a4..b4413324 100644
--- a/linkcheck/linkparse.py
+++ b/linkcheck/linkparse.py
@@ -17,7 +17,6 @@
import re, StringUtil, linkname
from debug import *
-from linkcheck.parser.htmllib import HtmlParser
# ripped mainly from HTML::Tagset.pm
LinkTags = {
@@ -56,10 +55,9 @@ LinkTags = {
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
_css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
-class TagParser (HtmlParser):
- """basic parser class putting message in list"""
+class TagFinder (object):
+ """base class putting message in list"""
def __init__ (self, content):
- super(TagParser, self).__init__()
self.content = content
# warnings and errors during parsing
self.parse_info = []
@@ -68,7 +66,7 @@ class TagParser (HtmlParser):
def _errorfun (self, msg, name):
"""append msg to error list"""
self.parse_info.append("%s at line %d col %d: %s" % \
- (name, self.last_lineno(), self.last_column(), msg))
+ (name, self.parser.last_lineno(), self.parser.last_column(), msg))
def error (self, msg):
@@ -86,15 +84,12 @@ class TagParser (HtmlParser):
self._errorfun(msg, "fatal error")
-class MetaRobotsParser (TagParser):
- """parser class for robots.txt meta values in HTML"""
+class MetaRobotsFinder (TagFinder):
+ """class for finding robots.txt meta values in HTML"""
def __init__ (self, content):
- super(MetaRobotsParser, self).__init__(content)
+ super(MetaRobotsFinder, self).__init__(content)
self.follow = True
self.index = True
- self.feed(self.content)
- debug(HURT_ME_PLENTY, "flushing")
- self.flush()
def startElement (self, tag, attrs):
@@ -105,25 +100,24 @@ class MetaRobotsParser (TagParser):
self.index = 'noindex' not in val
-class LinkParser (TagParser):
- """Parse the content for a list of links. After parsing, the urls
+class LinkFinder (TagFinder):
+ """find a list of links. After parsing, the urls
will have a list of parsed links entries with the format
(url, lineno, column, name, base)
"""
def __init__ (self, content, tags=LinkTags):
- super(LinkParser, self).__init__(content)
+ super(LinkFinder, self).__init__(content)
self.tags = tags
self.urls = []
- self.feed(self.content)
- debug(HURT_ME_PLENTY, "flushing")
- self.flush()
def startElement (self, tag, attrs):
- debug(NIGHTMARE, "LinkParser tag", tag, "attrs", attrs)
- debug(NIGHTMARE, "line", self.lineno(), "col", self.column(),
- "old line", self.last_lineno(), "old col", self.last_column())
+ debug(NIGHTMARE, "LinkFinder tag", tag, "attrs", attrs)
+ debug(NIGHTMARE, "line", self.parser.lineno(),
+ "col", self.parser.column(),
+ "old line", self.parser.last_lineno(),
+ "old col", self.parser.last_column())
tagattrs = self.tags.get(tag, [])
tagattrs.extend(self.tags.get(None, []))
for attr in tagattrs:
@@ -132,7 +126,7 @@ class LinkParser (TagParser):
if tag=='a' and attr=='href':
name = StringUtil.unquote(attrs.get('title', ''))
if not name:
- name = linkname.href_name(self.content[self.pos():])
+ name = linkname.href_name(self.content[self.parser.pos():])
elif tag=='img':
name = StringUtil.unquote(attrs.get('alt', ''))
if not name:
@@ -166,6 +160,6 @@ class LinkParser (TagParser):
return
for u in urls:
debug(NIGHTMARE, "LinkParser add link", tag, attr, u, name, base)
- self.urls.append((u, self.last_lineno(), self.last_column(),
- name, base))
+ self.urls.append((u, self.parser.last_lineno(),
+ self.parser.last_column(), name, base))