From 44f59415529914502bf626cf8b6696c2f19d7807 Mon Sep 17 00:00:00 2001
From: calvin
Date: Wed, 28 Jan 2004 22:49:20 +0000
Subject: [PATCH] use new parser interface

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1203 e7d03fd6-7b0d-0410-9947-9c21f3af8025
---
 linkcheck/UrlData.py   | 28 +++++++++++++++++++++++-----
 linkcheck/linkparse.py | 40 +++++++++++++++++-----------------------
 2 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py
index 20b47b81..38e81dd4 100644
--- a/linkcheck/UrlData.py
+++ b/linkcheck/UrlData.py
@@ -19,10 +19,11 @@
 import sys, re, urlparse, urllib2, time, traceback, socket, select, i18n
 from urllib import splituser, splitport, unquote
 from linkcheck import DNS, LinkCheckerError, getLinkPat
+from linkcheck.parser import htmlsax
 DNS.DiscoverNameServers()
 
 import Config, StringUtil, test_support
-from linkparse import LinkParser, MetaRobotsParser
+from linkparse import LinkFinder, MetaRobotsFinder
 from debug import *
 
 ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search
@@ -450,7 +451,12 @@ class UrlData (object):
     def contentAllowsRobots (self):
         if not self.isHtml():
             return True
-        return MetaRobotsParser(self.getContent()).follow
+        h = MetaRobotsFinder(self.getContent())
+        p = htmlsax.parser(h)
+        h.parser = p
+        p.feed(self.getContent())
+        p.flush()
+        return h.follow
 
 
     def checkAnchors (self):
@@ -459,7 +465,11 @@ class UrlData (object):
             # do not bother
             return
         debug(HURT_ME_PLENTY, "checking anchor", self.anchor)
-        h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']})
+        h = LinkFinder(self.getContent(), tags={'a': ['name'], None: ['id']})
+        p = htmlsax.parser(h)
+        h.parser = p
+        p.feed(self.getContent())
+        p.flush()
         for cur_anchor,line,column,name,base in h.urls:
             if cur_anchor == self.anchor:
                 return
@@ -552,14 +562,22 @@ class UrlData (object):
 
     def parse_html (self):
         # search for a possible base reference
-        h = LinkParser(self.getContent(), tags={'base': ['href']})
+        h = LinkFinder(self.getContent(), tags={'base': ['href']})
+        p = htmlsax.parser(h)
+        h.parser = p
+        p.feed(self.getContent())
+        p.flush()
         baseRef = None
         if len(h.urls)>=1:
             baseRef = h.urls[0][0]
             if len(h.urls)>1:
                 self.setWarning(i18n._(
                 "more than one <base> tag found, using only the first one"))
-        h = LinkParser(self.getContent())
+        h = LinkFinder(self.getContent())
+        p = htmlsax.parser(h)
+        h.parser = p
+        p.feed(self.getContent())
+        p.flush()
         for s in h.parse_info:
             # the parser had warnings/errors
             self.setWarning(s)
diff --git a/linkcheck/linkparse.py b/linkcheck/linkparse.py
index e08699a4..b4413324 100644
--- a/linkcheck/linkparse.py
+++ b/linkcheck/linkparse.py
@@ -17,7 +17,6 @@
 
 import re, StringUtil, linkname
 from debug import *
-from linkcheck.parser.htmllib import HtmlParser
 
 # ripped mainly from HTML::Tagset.pm
 LinkTags = {
@@ -56,10 +55,9 @@ LinkTags = {
 _refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
 _css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
 
-class TagParser (HtmlParser):
-    """basic parser class putting message in list"""
+class TagFinder (object):
+    """base class putting message in list"""
     def __init__ (self, content):
-        super(TagParser, self).__init__()
         self.content = content
         # warnings and errors during parsing
         self.parse_info = []
@@ -68,7 +66,7 @@ class TagParser (HtmlParser):
     def _errorfun (self, msg, name):
         """append msg to error list"""
         self.parse_info.append("%s at line %d col %d: %s" % \
-            (name, self.last_lineno(), self.last_column(), msg))
+            (name, self.parser.last_lineno(), self.parser.last_column(), msg))
 
 
     def error (self, msg):
@@ -86,15 +84,12 @@ class TagParser (HtmlParser):
         self._errorfun(msg, "fatal error")
 
 
-class MetaRobotsParser (TagParser):
-    """parser class for robots.txt meta values in HTML"""
+class MetaRobotsFinder (TagFinder):
+    """class for finding robots.txt meta values in HTML"""
     def __init__ (self, content):
-        super(MetaRobotsParser, self).__init__(content)
+        super(MetaRobotsFinder, self).__init__(content)
         self.follow = True
         self.index = True
-        self.feed(self.content)
-        debug(HURT_ME_PLENTY, "flushing")
-        self.flush()
 
 
     def startElement (self, tag, attrs):
@@ -105,25 +100,24 @@ class MetaRobotsParser (TagParser):
                 self.index = 'noindex' not in val
 
 
-class LinkParser (TagParser):
-    """Parse the content for a list of links. After parsing, the urls
+class LinkFinder (TagFinder):
+    """find a list of links. After parsing, the urls
     will have a list of parsed links entries with the format
     (url, lineno, column, name, base)
     """
 
     def __init__ (self, content, tags=LinkTags):
-        super(LinkParser, self).__init__(content)
+        super(LinkFinder, self).__init__(content)
         self.tags = tags
         self.urls = []
-        self.feed(self.content)
-        debug(HURT_ME_PLENTY, "flushing")
-        self.flush()
 
 
     def startElement (self, tag, attrs):
-        debug(NIGHTMARE, "LinkParser tag", tag, "attrs", attrs)
-        debug(NIGHTMARE, "line", self.lineno(), "col", self.column(),
-              "old line", self.last_lineno(), "old col", self.last_column())
+        debug(NIGHTMARE, "LinkFinder tag", tag, "attrs", attrs)
+        debug(NIGHTMARE, "line", self.parser.lineno(),
+              "col", self.parser.column(),
+              "old line", self.parser.last_lineno(),
+              "old col", self.parser.last_column())
         tagattrs = self.tags.get(tag, [])
         tagattrs.extend(self.tags.get(None, []))
         for attr in tagattrs:
@@ -132,7 +126,7 @@ class LinkParser (TagParser):
                 if tag=='a' and attr=='href':
                     name = StringUtil.unquote(attrs.get('title', ''))
                     if not name:
-                        name = linkname.href_name(self.content[self.pos():])
+                        name = linkname.href_name(self.content[self.parser.pos():])
                 elif tag=='img':
                     name = StringUtil.unquote(attrs.get('alt', ''))
                     if not name:
@@ -166,6 +160,6 @@ class LinkParser (TagParser):
             return
         for u in urls:
             debug(NIGHTMARE, "LinkParser add link", tag, attr, u, name, base)
-            self.urls.append((u, self.last_lineno(), self.last_column(),
-                              name, base))
+            self.urls.append((u, self.parser.last_lineno(),
+                              self.parser.last_column(), name, base))
 