mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-21 14:44:44 +00:00
Stop using HTML handlers
LinkFinder is the only remaining HTML handler therefore no need for htmlsoup.process_soup() as an independent function or TagFinder as a base class.
This commit is contained in:
parent
a1433767e5
commit
9eed070a73
5 changed files with 23 additions and 69 deletions
|
|
@ -15,35 +15,6 @@
|
|||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
HTML parser implemented using Beautiful Soup and html.parser.
|
||||
|
||||
USAGE
|
||||
|
||||
Two functions are provided, one to make a BeautifulSoup object from markup and
|
||||
another to call a handler's callbacks for each element in a BeautifulSoup
|
||||
object it can process.
|
||||
|
||||
The used callback of a handler is:
|
||||
|
||||
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
|
||||
def start_element (tag, attrs, text, line, column)
|
||||
@param tag: tag name
|
||||
@type tag: string
|
||||
@param attrs: tag attributes
|
||||
@type attrs: dict
|
||||
@param text: element text
|
||||
@type text: string
|
||||
@param line: tag line number
|
||||
@type line: integer
|
||||
@param column: tag column number
|
||||
@type column: integer
|
||||
|
||||
EXAMPLE
|
||||
|
||||
# Create a new BeautifulSoup object.
|
||||
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
|
||||
# Process the soup with the chosen handler as a parameter.
|
||||
htmlutil.htmlsoup.process_soup(handler, soup)
|
||||
|
||||
"""
|
||||
|
||||
from warnings import filterwarnings
|
||||
|
|
@ -58,10 +29,3 @@ from bs4 import BeautifulSoup
|
|||
def make_soup(markup, from_encoding=None):
|
||||
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
|
||||
multi_valued_attributes=None)
|
||||
|
||||
def process_soup(handler, soup):
|
||||
for element in soup.find_all(True):
|
||||
handler.start_element(
|
||||
element.name, element.attrs, element.text.strip(),
|
||||
element.sourceline,
|
||||
None if element.sourcepos is None else element.sourcepos + 1)
|
||||
|
|
|
|||
|
|
@ -98,19 +98,6 @@ def strip_c_comments (text):
|
|||
return c_comment_re.sub('', text)
|
||||
|
||||
|
||||
class TagFinder (object):
|
||||
"""Base class handling HTML start elements.
|
||||
TagFinder instances are used as HTML parser handlers."""
|
||||
|
||||
def __init__ (self):
|
||||
"""Initialize local variables."""
|
||||
super(TagFinder, self).__init__()
|
||||
|
||||
def start_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""Does nothing, override in a subclass."""
|
||||
pass
|
||||
|
||||
|
||||
def is_meta_url (attr, attrs):
|
||||
"""Check if the meta attributes contain a URL."""
|
||||
res = False
|
||||
|
|
@ -133,13 +120,12 @@ def is_form_get(attr, attrs):
|
|||
return res
|
||||
|
||||
|
||||
class LinkFinder (TagFinder):
|
||||
class LinkFinder:
|
||||
"""Find HTML links, and apply them to the callback function with the
|
||||
format (url, lineno, column, name, codebase)."""
|
||||
|
||||
def __init__ (self, callback, tags):
|
||||
"""Store content in buffer and initialize URL list."""
|
||||
super(LinkFinder, self).__init__()
|
||||
self.callback = callback
|
||||
# set universal tag attributes using tagname None
|
||||
self.universal_attrs = set(tags.get(None, []))
|
||||
|
|
@ -150,7 +136,7 @@ class LinkFinder (TagFinder):
|
|||
self.tags[tag].update(self.universal_attrs)
|
||||
self.base_ref = u''
|
||||
|
||||
def start_element (self, tag, attrs, element_text, lineno, column):
|
||||
def html_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""Search for links and store found URLs in a list."""
|
||||
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
||||
log.debug(LOG_CHECK, "line %d col %d", lineno, column)
|
||||
|
|
@ -226,3 +212,15 @@ class LinkFinder (TagFinder):
|
|||
"""Add newly found URL to queue."""
|
||||
assert isinstance(url, str_text) or url is None, repr(url)
|
||||
self.callback(url, line=lineno, column=column, name=name, base=base)
|
||||
|
||||
|
||||
def find_links(soup, callback, tags):
|
||||
"""Parse into content and search for URLs to check.
|
||||
When a URL is found it is passed to the supplied callback.
|
||||
"""
|
||||
lf = LinkFinder(callback, tags)
|
||||
for element in soup.find_all(True):
|
||||
lf.html_element(
|
||||
element.name, element.attrs, element.text.strip(),
|
||||
element.sourceline,
|
||||
None if element.sourcepos is None else element.sourcepos + 1)
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
Main functions for link parsing
|
||||
"""
|
||||
from .. import log, LOG_CHECK, strformat, url as urlutil
|
||||
from ..htmlutil import htmlsoup, linkparse
|
||||
from ..htmlutil import linkparse
|
||||
from ..bookmarks import firefox
|
||||
|
||||
|
||||
|
|
@ -46,7 +46,7 @@ def parse_html (url_data):
|
|||
"""Parse into HTML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
find_links(url_data, url_data.add_url, linkparse.LinkTags)
|
||||
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.LinkTags)
|
||||
|
||||
|
||||
def parse_opera (url_data):
|
||||
|
|
@ -112,15 +112,7 @@ def parse_wml (url_data):
|
|||
"""Parse into WML content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
find_links(url_data, url_data.add_url, linkparse.WmlTags)
|
||||
|
||||
|
||||
def find_links (url_data, callback, tags):
|
||||
"""Parse into content and search for URLs to check.
|
||||
Found URLs are added to the URL queue.
|
||||
"""
|
||||
handler = linkparse.LinkFinder(callback, tags)
|
||||
htmlsoup.process_soup(handler, url_data.get_soup())
|
||||
linkparse.find_links(url_data.get_soup(), url_data.add_url, linkparse.WmlTags)
|
||||
|
||||
|
||||
def parse_firefox (url_data):
|
||||
|
|
|
|||
|
|
@ -22,7 +22,6 @@ from urllib import parse
|
|||
from . import _ContentPlugin
|
||||
from .. import log, LOG_PLUGIN
|
||||
from ..htmlutil import linkparse
|
||||
from ..parser import find_links
|
||||
|
||||
|
||||
class AnchorCheck(_ContentPlugin):
|
||||
|
|
@ -37,7 +36,8 @@ class AnchorCheck(_ContentPlugin):
|
|||
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
|
||||
# list of parsed anchors
|
||||
self.anchors = []
|
||||
find_links(url_data, self.add_anchor, linkparse.AnchorTags)
|
||||
linkparse.find_links(url_data.get_soup(), self.add_anchor,
|
||||
linkparse.AnchorTags)
|
||||
self.check_anchor(url_data)
|
||||
|
||||
def add_anchor (self, url, line, column, name, base):
|
||||
|
|
|
|||
|
|
@ -29,8 +29,8 @@ class TestLinkparser (unittest.TestCase):
|
|||
|
||||
def _test_one_link (self, content, url):
|
||||
self.count_url = 0
|
||||
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
|
||||
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
|
||||
linkparse.find_links(htmlsoup.make_soup(content),
|
||||
self._test_one_url(url), linkparse.LinkTags)
|
||||
self.assertEqual(self.count_url, 1)
|
||||
|
||||
def _test_one_url (self, origurl):
|
||||
|
|
@ -43,8 +43,8 @@ class TestLinkparser (unittest.TestCase):
|
|||
def _test_no_link (self, content):
|
||||
def callback (url, line, column, name, base):
|
||||
self.assertTrue(False, 'URL %r found' % url)
|
||||
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
|
||||
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
|
||||
linkparse.find_links(htmlsoup.make_soup(content), callback,
|
||||
linkparse.LinkTags)
|
||||
|
||||
def test_href_parsing (self):
|
||||
# Test <a href> parsing.
|
||||
|
|
|
|||
Loading…
Reference in a new issue