mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-13 00:53:11 +00:00
use new parser interface
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1203 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
66ecc466b7
commit
44f5941552
2 changed files with 40 additions and 28 deletions
|
|
@ -19,10 +19,11 @@
|
|||
import sys, re, urlparse, urllib2, time, traceback, socket, select, i18n
|
||||
from urllib import splituser, splitport, unquote
|
||||
from linkcheck import DNS, LinkCheckerError, getLinkPat
|
||||
from linkcheck.parser import htmlsax
|
||||
DNS.DiscoverNameServers()
|
||||
|
||||
import Config, StringUtil, test_support
|
||||
from linkparse import LinkParser, MetaRobotsParser
|
||||
from linkparse import LinkFinder, MetaRobotsFinder
|
||||
from debug import *
|
||||
|
||||
ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search
|
||||
|
|
@ -450,7 +451,12 @@ class UrlData (object):
|
|||
def contentAllowsRobots (self):
|
||||
if not self.isHtml():
|
||||
return True
|
||||
return MetaRobotsParser(self.getContent()).follow
|
||||
h = MetaRobotsFinder(self.getContent())
|
||||
p = htmlsax.parser(h)
|
||||
h.parser = p
|
||||
p.feed(self.getContent())
|
||||
p.flush()
|
||||
return h.follow
|
||||
|
||||
|
||||
def checkAnchors (self):
|
||||
|
|
@ -459,7 +465,11 @@ class UrlData (object):
|
|||
# do not bother
|
||||
return
|
||||
debug(HURT_ME_PLENTY, "checking anchor", self.anchor)
|
||||
h = LinkParser(self.getContent(), tags={'a': ['name'], None: ['id']})
|
||||
h = LinkFinder(self.getContent(), tags={'a': ['name'], None: ['id']})
|
||||
p = htmlsax.parser(h)
|
||||
h.parser = p
|
||||
p.feed(self.getContent())
|
||||
p.flush()
|
||||
for cur_anchor,line,column,name,base in h.urls:
|
||||
if cur_anchor == self.anchor:
|
||||
return
|
||||
|
|
@ -552,14 +562,22 @@ class UrlData (object):
|
|||
|
||||
def parse_html (self):
|
||||
# search for a possible base reference
|
||||
h = LinkParser(self.getContent(), tags={'base': ['href']})
|
||||
h = LinkFinder(self.getContent(), tags={'base': ['href']})
|
||||
p = htmlsax.parser(h)
|
||||
h.parser = p
|
||||
p.feed(self.getContent())
|
||||
p.flush()
|
||||
baseRef = None
|
||||
if len(h.urls)>=1:
|
||||
baseRef = h.urls[0][0]
|
||||
if len(h.urls)>1:
|
||||
self.setWarning(i18n._(
|
||||
"more than one <base> tag found, using only the first one"))
|
||||
h = LinkParser(self.getContent())
|
||||
h = LinkFinder(self.getContent())
|
||||
p = htmlsax.parser(h)
|
||||
h.parser = p
|
||||
p.feed(self.getContent())
|
||||
p.flush()
|
||||
for s in h.parse_info:
|
||||
# the parser had warnings/errors
|
||||
self.setWarning(s)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
import re, StringUtil, linkname
|
||||
from debug import *
|
||||
from linkcheck.parser.htmllib import HtmlParser
|
||||
|
||||
# ripped mainly from HTML::Tagset.pm
|
||||
LinkTags = {
|
||||
|
|
@ -56,10 +55,9 @@ LinkTags = {
|
|||
_refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
|
||||
_css_url_re = re.compile(r"url\((?P<url>[^\)]+)\)")
|
||||
|
||||
class TagParser (HtmlParser):
|
||||
"""basic parser class putting message in list"""
|
||||
class TagFinder (object):
|
||||
"""base class putting message in list"""
|
||||
def __init__ (self, content):
|
||||
super(TagParser, self).__init__()
|
||||
self.content = content
|
||||
# warnings and errors during parsing
|
||||
self.parse_info = []
|
||||
|
|
@ -68,7 +66,7 @@ class TagParser (HtmlParser):
|
|||
def _errorfun (self, msg, name):
|
||||
"""append msg to error list"""
|
||||
self.parse_info.append("%s at line %d col %d: %s" % \
|
||||
(name, self.last_lineno(), self.last_column(), msg))
|
||||
(name, self.parser.last_lineno(), self.parser.last_column(), msg))
|
||||
|
||||
|
||||
def error (self, msg):
|
||||
|
|
@ -86,15 +84,12 @@ class TagParser (HtmlParser):
|
|||
self._errorfun(msg, "fatal error")
|
||||
|
||||
|
||||
class MetaRobotsParser (TagParser):
|
||||
"""parser class for robots.txt meta values in HTML"""
|
||||
class MetaRobotsFinder (TagFinder):
|
||||
"""class for finding robots.txt meta values in HTML"""
|
||||
def __init__ (self, content):
|
||||
super(MetaRobotsParser, self).__init__(content)
|
||||
super(MetaRobotsFinder, self).__init__(content)
|
||||
self.follow = True
|
||||
self.index = True
|
||||
self.feed(self.content)
|
||||
debug(HURT_ME_PLENTY, "flushing")
|
||||
self.flush()
|
||||
|
||||
|
||||
def startElement (self, tag, attrs):
|
||||
|
|
@ -105,25 +100,24 @@ class MetaRobotsParser (TagParser):
|
|||
self.index = 'noindex' not in val
|
||||
|
||||
|
||||
class LinkParser (TagParser):
|
||||
"""Parse the content for a list of links. After parsing, the urls
|
||||
class LinkFinder (TagFinder):
|
||||
"""find a list of links. After parsing, the urls
|
||||
will have a list of parsed links entries with the format
|
||||
(url, lineno, column, name, base)
|
||||
"""
|
||||
|
||||
def __init__ (self, content, tags=LinkTags):
|
||||
super(LinkParser, self).__init__(content)
|
||||
super(LinkFinder, self).__init__(content)
|
||||
self.tags = tags
|
||||
self.urls = []
|
||||
self.feed(self.content)
|
||||
debug(HURT_ME_PLENTY, "flushing")
|
||||
self.flush()
|
||||
|
||||
|
||||
def startElement (self, tag, attrs):
|
||||
debug(NIGHTMARE, "LinkParser tag", tag, "attrs", attrs)
|
||||
debug(NIGHTMARE, "line", self.lineno(), "col", self.column(),
|
||||
"old line", self.last_lineno(), "old col", self.last_column())
|
||||
debug(NIGHTMARE, "LinkFinder tag", tag, "attrs", attrs)
|
||||
debug(NIGHTMARE, "line", self.parser.lineno(),
|
||||
"col", self.parser.column(),
|
||||
"old line", self.parser.last_lineno(),
|
||||
"old col", self.parser.last_column())
|
||||
tagattrs = self.tags.get(tag, [])
|
||||
tagattrs.extend(self.tags.get(None, []))
|
||||
for attr in tagattrs:
|
||||
|
|
@ -132,7 +126,7 @@ class LinkParser (TagParser):
|
|||
if tag=='a' and attr=='href':
|
||||
name = StringUtil.unquote(attrs.get('title', ''))
|
||||
if not name:
|
||||
name = linkname.href_name(self.content[self.pos():])
|
||||
name = linkname.href_name(self.content[self.parser.pos():])
|
||||
elif tag=='img':
|
||||
name = StringUtil.unquote(attrs.get('alt', ''))
|
||||
if not name:
|
||||
|
|
@ -166,6 +160,6 @@ class LinkParser (TagParser):
|
|||
return
|
||||
for u in urls:
|
||||
debug(NIGHTMARE, "LinkParser add link", tag, attr, u, name, base)
|
||||
self.urls.append((u, self.last_lineno(), self.last_column(),
|
||||
name, base))
|
||||
self.urls.append((u, self.parser.last_lineno(),
|
||||
self.parser.last_column(), name, base))
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue