diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py deleted file mode 100644 index de3a2a3e..00000000 --- a/linkcheck/HtmlParser/__init__.py +++ /dev/null @@ -1,49 +0,0 @@ -# -*- coding: iso-8859-1 -*- -# Copyright (C) 2000-2014 Bastian Kleineidam -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -""" -HTML parser module. - -USAGE - - -Two functions are provided, one to make a BeautifulSoup object from markup and -another to call a handler's callback for each element in a BeautifulSoup -object it can process. - -The used callback of a handler is: - -- Start tag: - def start_element (tag, attrs, text, line, column) - @param tag: tag name - @type tag: string - @param attrs: tag attributes - @type attrs: dict - @param text: element text - @type tag: string - @param line: tag line number - @type tag: integer - @param column: tag column number - @type tag: integer - -EXAMPLE - - # Create a new BeautifulSoup object. - soup = HtmlParser.htmlsax.make_soup("Blubb") - # Process the soup with the chosen handler as a parameter. 
- HtmlParser.htmlsax.proces_soup(handler, soup) - -""" diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index cbbf759e..e891402e 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -30,8 +30,7 @@ from io import BytesIO from .. import (log, LOG_CHECK, strformat, mimeutil, url as urlutil, LinkCheckerError, httputil) from . import (internpaturl, proxysupport) -from ..HtmlParser import htmlsax -from ..htmlutil import linkparse +from ..htmlutil import htmlsoup, linkparse # import warnings from .const import WARN_HTTP_EMPTY_CONTENT from requests.sessions import REDIRECT_STATI @@ -83,7 +82,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): handler = linkparse.MetaRobotsFinder() # parse try: - htmlsax.process_soup(handler, self.get_soup()) + htmlsoup.process_soup(handler, self.get_soup()) except linkparse.StopParse as msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) pass @@ -302,7 +301,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): def get_content(self): if self.text is None: self.get_raw_content() - self.soup = htmlsax.make_soup(self.data, self.encoding) + self.soup = htmlsoup.make_soup(self.data, self.encoding) self.text = self.data.decode(self.soup.original_encoding) return self.text diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index bb7debef..78ce194f 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -45,7 +45,7 @@ from future.utils import python_2_unicode_compatible from . import absolute_url, get_url_from from .. 
import (log, LOG_CHECK, strformat, LinkCheckerError, url as urlutil, trace, get_link_pat) -from ..HtmlParser import htmlsax +from ..htmlutil import htmlsoup from ..network import iputil from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP, @@ -651,7 +651,7 @@ class UrlBase (object): def get_content (self): if self.text is None: self.get_raw_content() - self.soup = htmlsax.make_soup(self.data) + self.soup = htmlsoup.make_soup(self.data) self.text = self.data.decode(self.soup.original_encoding) self.encoding = self.soup.original_encoding return self.text diff --git a/linkcheck/htmlutil/formsearch.py b/linkcheck/htmlutil/formsearch.py index 1f71542f..d10b5089 100644 --- a/linkcheck/htmlutil/formsearch.py +++ b/linkcheck/htmlutil/formsearch.py @@ -17,7 +17,7 @@ """ HTML form utils """ -from ..HtmlParser import htmlsax +from ..htmlutil import htmlsoup from .. import log, LOG_CHECK class Form(object): @@ -41,7 +41,7 @@ def search_form(content, cgiuser, cgipassword): """Search for a HTML form in the given HTML content that has the given CGI fields. If no form is found return None. """ - soup = htmlsax.make_soup(content) + soup = htmlsoup.make_soup(content) # The value of the name attribute is case-insensitive # https://www.w3.org/TR/html401/interact/forms.html#adef-name-INPUT cginames = {cgiuser.lower(), cgipassword.lower()} diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/htmlutil/htmlsoup.py similarity index 64% rename from linkcheck/HtmlParser/htmlsax.py rename to linkcheck/htmlutil/htmlsoup.py index 61ef5190..921703cf 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/htmlutil/htmlsoup.py @@ -15,6 +15,35 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ HTML parser implemented using Beautiful Soup and html.parser. 
+ +USAGE + +Two functions are provided, one to make a BeautifulSoup object from markup and +another to call a handler's callbacks for each element in a BeautifulSoup +object it can process. + +The callback used by a handler is: + +- Start tag: + def start_element (tag, attrs, text, line, column) + @param tag: tag name + @type tag: string + @param attrs: tag attributes + @type attrs: dict + @param text: element text + @type text: string + @param line: tag line number + @type line: integer + @param column: tag column number + @type column: integer + +EXAMPLE + + # Create a new BeautifulSoup object. + soup = htmlutil.htmlsoup.make_soup("Blubb") + # Process the soup with the chosen handler as a parameter. + htmlutil.htmlsoup.process_soup(handler, soup) + """ from warnings import filterwarnings diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index c455cffa..cd5ac582 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -99,7 +99,7 @@ class StopParse(Exception): class TagFinder (object): """Base class handling HTML start elements. - TagFinder instances are used as HtmlParser handlers.""" + TagFinder instances are used as HTML parser handlers.""" def __init__ (self): """Initialize local variables.""" diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index bd1e3100..0f584f35 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -18,8 +18,7 @@ Main functions for link parsing """ from ..
import log, LOG_CHECK, strformat, url as urlutil -from ..htmlutil import linkparse -from ..HtmlParser import htmlsax +from ..htmlutil import htmlsoup, linkparse from ..bookmarks import firefox @@ -121,7 +120,7 @@ def find_links (url_data, callback, tags): handler = linkparse.LinkFinder(callback, tags) # parse try: - htmlsax.process_soup(handler, url_data.get_soup()) + htmlsoup.process_soup(handler, url_data.get_soup()) except linkparse.StopParse as msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) pass diff --git a/setup.py b/setup.py index 4f3c07ef..237d94d4 100755 --- a/setup.py +++ b/setup.py @@ -382,7 +382,6 @@ setup( 'linkcheck.configuration', 'linkcheck.director', 'linkcheck.htmlutil', - 'linkcheck.HtmlParser', 'linkcheck.logger', 'linkcheck.network', 'linkcheck.parser', diff --git a/tests/test_linkparser.py b/tests/test_linkparser.py index 38c2277d..94171aa8 100644 --- a/tests/test_linkparser.py +++ b/tests/test_linkparser.py @@ -19,8 +19,7 @@ Test linkparser routines. """ import unittest -from linkcheck.htmlutil import linkparse -from linkcheck.HtmlParser import htmlsax +from linkcheck.htmlutil import htmlsoup, linkparse class TestLinkparser (unittest.TestCase): @@ -32,7 +31,7 @@ class TestLinkparser (unittest.TestCase): self.count_url = 0 h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags) try: - htmlsax.process_soup(h, htmlsax.make_soup(content)) + htmlsoup.process_soup(h, htmlsoup.make_soup(content)) except linkparse.StopParse: pass self.assertEqual(self.count_url, 1) @@ -49,7 +48,7 @@ class TestLinkparser (unittest.TestCase): self.assertTrue(False, 'URL %r found' % url) h = linkparse.LinkFinder(callback, linkparse.LinkTags) try: - htmlsax.process_soup(h, htmlsax.make_soup(content)) + htmlsoup.process_soup(h, htmlsoup.make_soup(content)) except linkparse.StopParse: pass diff --git a/tests/test_parser.py b/tests/test_parser.py index 3d73d8af..22c08fc1 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -18,7 +18,7 @@ Test 
html parsing. """ -from linkcheck.HtmlParser import htmlsax +from linkcheck.htmlutil import htmlsoup from io import StringIO import unittest @@ -143,7 +143,7 @@ class TestParser (unittest.TestCase): # Parse all test patterns in one go. out = StringIO() handler = HtmlPrettyPrinter(out) - htmlsax.process_soup(handler, htmlsax.make_soup(_in)) + htmlsoup.process_soup(handler, htmlsoup.make_soup(_in)) self.check_results(_in, _out, out) def check_results (self, _in, _out, out): @@ -182,6 +182,6 @@ class TestParser (unittest.TestCase): def encoding_test (self, html, expected): out = StringIO() handler = HtmlPrettyPrinter(out) - soup = htmlsax.make_soup(html) - htmlsax.process_soup(handler, soup) + soup = htmlsoup.make_soup(html) + htmlsoup.process_soup(handler, soup) self.assertEqual(soup.original_encoding, expected)