mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Move HtmlParser/htmlsax.py to htmlutil/htmlsoup.py
Remove one subpackage and some import lines where htmlutil.linkparse is also being used.
This commit is contained in:
parent
0f18c9b8f0
commit
ee6628a831
10 changed files with 46 additions and 70 deletions
|
|
@ -1,49 +0,0 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2014 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
HTML parser module.
|
||||
|
||||
USAGE
|
||||
|
||||
|
||||
Two functions are provided, one to make a BeautifulSoup object from markup and
|
||||
another to call a handler's callback for each element in a BeautifulSoup
|
||||
object it can process.
|
||||
|
||||
The used callback of a handler is:
|
||||
|
||||
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
|
||||
def start_element (tag, attrs, text, line, column)
|
||||
@param tag: tag name
|
||||
@type tag: string
|
||||
@param attrs: tag attributes
|
||||
@type attrs: dict
|
||||
@param text: element text
|
||||
@type tag: string
|
||||
@param line: tag line number
|
||||
@type tag: integer
|
||||
@param column: tag column number
|
||||
@type tag: integer
|
||||
|
||||
EXAMPLE
|
||||
|
||||
# Create a new BeautifulSoup object.
|
||||
soup = HtmlParser.htmlsax.make_soup("<html><body>Blubb</body></html>")
|
||||
# Process the soup with the chosen handler as a parameter.
|
||||
HtmlParser.htmlsax.proces_soup(handler, soup)
|
||||
|
||||
"""
|
||||
|
|
@ -30,8 +30,7 @@ from io import BytesIO
|
|||
from .. import (log, LOG_CHECK, strformat, mimeutil,
|
||||
url as urlutil, LinkCheckerError, httputil)
|
||||
from . import (internpaturl, proxysupport)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import linkparse
|
||||
from ..htmlutil import htmlsoup, linkparse
|
||||
# import warnings
|
||||
from .const import WARN_HTTP_EMPTY_CONTENT
|
||||
from requests.sessions import REDIRECT_STATI
|
||||
|
|
@ -83,7 +82,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
handler = linkparse.MetaRobotsFinder()
|
||||
# parse
|
||||
try:
|
||||
htmlsax.process_soup(handler, self.get_soup())
|
||||
htmlsoup.process_soup(handler, self.get_soup())
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
|
|
@ -302,7 +301,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
def get_content(self):
|
||||
if self.text is None:
|
||||
self.get_raw_content()
|
||||
self.soup = htmlsax.make_soup(self.data, self.encoding)
|
||||
self.soup = htmlsoup.make_soup(self.data, self.encoding)
|
||||
self.text = self.data.decode(self.soup.original_encoding)
|
||||
return self.text
|
||||
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ from future.utils import python_2_unicode_compatible
|
|||
from . import absolute_url, get_url_from
|
||||
from .. import (log, LOG_CHECK,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import htmlsoup
|
||||
from ..network import iputil
|
||||
from .const import (WARN_URL_EFFECTIVE_URL,
|
||||
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
|
||||
|
|
@ -651,7 +651,7 @@ class UrlBase (object):
|
|||
def get_content (self):
|
||||
if self.text is None:
|
||||
self.get_raw_content()
|
||||
self.soup = htmlsax.make_soup(self.data)
|
||||
self.soup = htmlsoup.make_soup(self.data)
|
||||
self.text = self.data.decode(self.soup.original_encoding)
|
||||
self.encoding = self.soup.original_encoding
|
||||
return self.text
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@
|
|||
"""
|
||||
HTML form utils
|
||||
"""
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import htmlsoup
|
||||
from .. import log, LOG_CHECK
|
||||
|
||||
class Form(object):
|
||||
|
|
@ -41,7 +41,7 @@ def search_form(content, cgiuser, cgipassword):
|
|||
"""Search for a HTML form in the given HTML content that has the given
|
||||
CGI fields. If no form is found return None.
|
||||
"""
|
||||
soup = htmlsax.make_soup(content)
|
||||
soup = htmlsoup.make_soup(content)
|
||||
# The value of the name attribute is case-insensitive
|
||||
# https://www.w3.org/TR/html401/interact/forms.html#adef-name-INPUT
|
||||
cginames = {cgiuser.lower(), cgipassword.lower()}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,35 @@
|
|||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
HTML parser implemented using Beautiful Soup and html.parser.
|
||||
|
||||
USAGE
|
||||
|
||||
Two functions are provided, one to make a BeautifulSoup object from markup and
|
||||
another to call a handler's callbacks for each element in a BeautifulSoup
|
||||
object it can process.
|
||||
|
||||
The used callback of a handler is:
|
||||
|
||||
- Start tag: <tag {attr1:value1, attr2:value2, ..}>
|
||||
def start_element (tag, attrs, text, line, column)
|
||||
@param tag: tag name
|
||||
@type tag: string
|
||||
@param attrs: tag attributes
|
||||
@type attrs: dict
|
||||
@param text: element text
|
||||
@type text: string
|
||||
@param line: tag line number
|
||||
@type line: integer
|
||||
@param column: tag column number
|
||||
@type column: integer
|
||||
|
||||
EXAMPLE
|
||||
|
||||
# Create a new BeautifulSoup object.
|
||||
soup = htmlutil.htmlsoup.make_soup("<html><body>Blubb</body></html>")
|
||||
# Process the soup with the chosen handler as a parameter.
|
||||
htmlutil.htmlsoup.process_soup(handler, soup)
|
||||
|
||||
"""
|
||||
|
||||
from warnings import filterwarnings
|
||||
|
|
@ -99,7 +99,7 @@ class StopParse(Exception):
|
|||
|
||||
class TagFinder (object):
|
||||
"""Base class handling HTML start elements.
|
||||
TagFinder instances are used as HtmlParser handlers."""
|
||||
TagFinder instances are used as HTML parser handlers."""
|
||||
|
||||
def __init__ (self):
|
||||
"""Initialize local variables."""
|
||||
|
|
|
|||
|
|
@ -18,8 +18,7 @@
|
|||
Main functions for link parsing
|
||||
"""
|
||||
from .. import log, LOG_CHECK, strformat, url as urlutil
|
||||
from ..htmlutil import linkparse
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import htmlsoup, linkparse
|
||||
from ..bookmarks import firefox
|
||||
|
||||
|
||||
|
|
@ -121,7 +120,7 @@ def find_links (url_data, callback, tags):
|
|||
handler = linkparse.LinkFinder(callback, tags)
|
||||
# parse
|
||||
try:
|
||||
htmlsax.process_soup(handler, url_data.get_soup())
|
||||
htmlsoup.process_soup(handler, url_data.get_soup())
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
|
|
|
|||
1
setup.py
1
setup.py
|
|
@ -382,7 +382,6 @@ setup(
|
|||
'linkcheck.configuration',
|
||||
'linkcheck.director',
|
||||
'linkcheck.htmlutil',
|
||||
'linkcheck.HtmlParser',
|
||||
'linkcheck.logger',
|
||||
'linkcheck.network',
|
||||
'linkcheck.parser',
|
||||
|
|
|
|||
|
|
@ -19,8 +19,7 @@ Test linkparser routines.
|
|||
"""
|
||||
|
||||
import unittest
|
||||
from linkcheck.htmlutil import linkparse
|
||||
from linkcheck.HtmlParser import htmlsax
|
||||
from linkcheck.htmlutil import htmlsoup, linkparse
|
||||
|
||||
|
||||
class TestLinkparser (unittest.TestCase):
|
||||
|
|
@ -32,7 +31,7 @@ class TestLinkparser (unittest.TestCase):
|
|||
self.count_url = 0
|
||||
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
|
||||
try:
|
||||
htmlsax.process_soup(h, htmlsax.make_soup(content))
|
||||
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
|
||||
except linkparse.StopParse:
|
||||
pass
|
||||
self.assertEqual(self.count_url, 1)
|
||||
|
|
@ -49,7 +48,7 @@ class TestLinkparser (unittest.TestCase):
|
|||
self.assertTrue(False, 'URL %r found' % url)
|
||||
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
|
||||
try:
|
||||
htmlsax.process_soup(h, htmlsax.make_soup(content))
|
||||
htmlsoup.process_soup(h, htmlsoup.make_soup(content))
|
||||
except linkparse.StopParse:
|
||||
pass
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
Test html parsing.
|
||||
"""
|
||||
|
||||
from linkcheck.HtmlParser import htmlsax
|
||||
from linkcheck.htmlutil import htmlsoup
|
||||
|
||||
from io import StringIO
|
||||
import unittest
|
||||
|
|
@ -143,7 +143,7 @@ class TestParser (unittest.TestCase):
|
|||
# Parse all test patterns in one go.
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
htmlsax.process_soup(handler, htmlsax.make_soup(_in))
|
||||
htmlsoup.process_soup(handler, htmlsoup.make_soup(_in))
|
||||
self.check_results(_in, _out, out)
|
||||
|
||||
def check_results (self, _in, _out, out):
|
||||
|
|
@ -182,6 +182,6 @@ class TestParser (unittest.TestCase):
|
|||
def encoding_test (self, html, expected):
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
soup = htmlsax.make_soup(html)
|
||||
htmlsax.process_soup(handler, soup)
|
||||
soup = htmlsoup.make_soup(html)
|
||||
htmlsoup.process_soup(handler, soup)
|
||||
self.assertEqual(soup.original_encoding, expected)
|
||||
|
|
|
|||
Loading…
Reference in a new issue