mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-13 02:41:03 +00:00
commit
d80a075372
9 changed files with 50 additions and 203 deletions
|
|
@ -17,7 +17,6 @@
|
|||
HTML parser implemented using Beautiful Soup and html.parser.
|
||||
"""
|
||||
|
||||
from io import BytesIO, StringIO
|
||||
from warnings import filterwarnings
|
||||
|
||||
filterwarnings("ignore",
|
||||
|
|
@ -27,64 +26,39 @@ filterwarnings("ignore",
|
|||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
|
||||
def make_soup(markup, from_encoding=None):
|
||||
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
|
||||
multi_valued_attributes=None)
|
||||
|
||||
class Parser(object):
|
||||
handler = None
|
||||
encoding = None
|
||||
|
||||
def __init__(self, handler):
|
||||
self.handler = handler
|
||||
self.reset()
|
||||
|
||||
def feed(self, feed_text):
|
||||
if not self.html_doc:
|
||||
if isinstance(feed_text, bytes):
|
||||
self.html_doc = BytesIO()
|
||||
else:
|
||||
self.html_doc = StringIO()
|
||||
self.html_doc.write(feed_text)
|
||||
|
||||
def feed_soup(self, soup):
|
||||
self.soup = soup
|
||||
|
||||
def reset(self):
|
||||
self.soup = None
|
||||
self.html_doc = None
|
||||
self.tag_lineno = None
|
||||
self.tag_column = None
|
||||
self.parse_contents(soup.contents)
|
||||
|
||||
def parse_contents(self, contents):
|
||||
for content in contents:
|
||||
if isinstance(content, Tag):
|
||||
self.tag_lineno = content.sourceline
|
||||
self.tag_column = None if content.sourcepos is None \
|
||||
tag_column = None if content.sourcepos is None \
|
||||
else content.sourcepos + 1
|
||||
if content.is_empty_element:
|
||||
self.handler.start_end_element(
|
||||
content.name, content.attrs, content.text.strip(),
|
||||
content.sourceline, tag_column
|
||||
)
|
||||
else:
|
||||
self.handler.start_element(
|
||||
content.name, content.attrs, content.text.strip(),
|
||||
content.sourceline, tag_column
|
||||
)
|
||||
if hasattr(content, 'contents'): # recursion
|
||||
self.parse_contents(content.contents)
|
||||
if hasattr(self.handler, 'end_element'):
|
||||
self.handler.end_element(content.name)
|
||||
|
||||
def flush(self):
|
||||
if self.soup is None:
|
||||
self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
|
||||
multi_valued_attributes=None)
|
||||
if hasattr(self.soup, 'contents'):
|
||||
self.parse_contents(self.soup.contents)
|
||||
self.encoding = self.soup.original_encoding
|
||||
|
||||
def lineno(self):
|
||||
return self.tag_lineno
|
||||
|
||||
def column(self):
|
||||
return self.tag_column
|
||||
|
||||
|
||||
def parser(handler=None):
|
||||
return Parser(handler)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,6 @@
|
|||
"""
|
||||
Handle http links.
|
||||
"""
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
# The validity of SSL certs is ignored to be able
|
||||
# to check the URL and recurse into it.
|
||||
|
|
@ -83,17 +82,12 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
# construct parser object
|
||||
handler = linkparse.MetaRobotsFinder()
|
||||
parser = htmlsax.parser(handler)
|
||||
handler.parser = parser
|
||||
# parse
|
||||
try:
|
||||
parser.feed_soup(self.get_soup())
|
||||
parser.flush()
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
# break cyclic dependencies
|
||||
handler.parser = None
|
||||
parser.handler = None
|
||||
return handler.follow
|
||||
|
||||
def add_size_info (self):
|
||||
|
|
@ -309,9 +303,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
def get_content(self):
|
||||
if self.text is None:
|
||||
self.get_raw_content()
|
||||
self.soup = BeautifulSoup(self.data, "html.parser",
|
||||
multi_valued_attributes=None,
|
||||
from_encoding=self.encoding)
|
||||
self.soup = htmlsax.make_soup(self.data, self.encoding)
|
||||
self.text = self.data.decode(self.soup.original_encoding)
|
||||
return self.text
|
||||
|
||||
|
|
|
|||
|
|
@ -41,17 +41,11 @@ import select
|
|||
from io import BytesIO
|
||||
from builtins import str as str_text
|
||||
from future.utils import python_2_unicode_compatible
|
||||
from warnings import filterwarnings
|
||||
|
||||
filterwarnings("ignore",
|
||||
message="The soupsieve package is not installed. CSS selectors cannot be used.",
|
||||
category=UserWarning, module="bs4")
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from . import absolute_url, get_url_from
|
||||
from .. import (log, LOG_CHECK,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..network import iputil
|
||||
from .const import (WARN_URL_EFFECTIVE_URL,
|
||||
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
|
||||
|
|
@ -657,8 +651,7 @@ class UrlBase (object):
|
|||
def get_content (self):
|
||||
if self.text is None:
|
||||
self.get_raw_content()
|
||||
self.soup = BeautifulSoup(self.data, "html.parser",
|
||||
multi_valued_attributes=None)
|
||||
self.soup = htmlsax.make_soup(self.data)
|
||||
self.text = self.data.decode(self.soup.original_encoding)
|
||||
self.encoding = self.soup.original_encoding
|
||||
return self.text
|
||||
|
|
|
|||
|
|
@ -44,13 +44,10 @@ class FormFinder(object):
|
|||
def __init__(self):
|
||||
"""Initialize local variables."""
|
||||
super(FormFinder, self).__init__()
|
||||
# parser object will be initialized when it is used as
|
||||
# a handler object
|
||||
self.parser = None
|
||||
self.forms = []
|
||||
self.form = None
|
||||
|
||||
def start_element(self, tag, attrs, element_text=None):
|
||||
def start_element(self, tag, attrs, element_text, lineno, column):
|
||||
"""Does nothing, override in a subclass."""
|
||||
if tag == u'form':
|
||||
if u'action' in attrs:
|
||||
|
|
@ -69,10 +66,10 @@ class FormFinder(object):
|
|||
log.warn(LOG_CHECK, "formless input %s" % attrs)
|
||||
pass
|
||||
|
||||
def start_end_element(self, tag, attrs, element_text=None):
|
||||
def start_end_element(self, tag, attrs, element_text, lineno, column):
|
||||
"""Delegate a combined start/end element (eg. <input .../>) to
|
||||
the start_element method. Ignore the end element part."""
|
||||
self.start_element(tag, attrs, element_text)
|
||||
self.start_element(tag, attrs, element_text, lineno, column)
|
||||
|
||||
def end_element(self, tag):
|
||||
"""search for ending form values."""
|
||||
|
|
@ -87,13 +84,8 @@ def search_form(content, cgiuser, cgipassword):
|
|||
"""
|
||||
handler = FormFinder()
|
||||
parser = htmlsax.parser(handler)
|
||||
handler.parser = parser
|
||||
# parse
|
||||
parser.feed(content)
|
||||
parser.flush()
|
||||
# break cyclic dependencies
|
||||
handler.parser = None
|
||||
parser.handler = None
|
||||
parser.feed_soup(htmlsax.make_soup(content))
|
||||
log.debug(LOG_CHECK, "Found forms %s", handler.forms)
|
||||
cginames = (cgiuser.lower(), cgipassword.lower())
|
||||
for form in handler.forms:
|
||||
|
|
|
|||
|
|
@ -104,18 +104,15 @@ class TagFinder (object):
|
|||
def __init__ (self):
|
||||
"""Initialize local variables."""
|
||||
super(TagFinder, self).__init__()
|
||||
# parser object will be initialized when it is used as
|
||||
# a handler object
|
||||
self.parser = None
|
||||
|
||||
def start_element (self, tag, attrs):
|
||||
def start_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""Does nothing, override in a subclass."""
|
||||
pass
|
||||
|
||||
def start_end_element (self, tag, attrs, element_text=None):
|
||||
def start_end_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""Delegate a combined start/end element (eg. <br/>) to
|
||||
the start_element method. Ignore the end element part."""
|
||||
self.start_element(tag, attrs, element_text)
|
||||
self.start_element(tag, attrs, element_text, lineno, column)
|
||||
|
||||
|
||||
class MetaRobotsFinder (TagFinder):
|
||||
|
|
@ -127,7 +124,7 @@ class MetaRobotsFinder (TagFinder):
|
|||
log.debug(LOG_CHECK, "meta robots finder")
|
||||
self.follow = self.index = True
|
||||
|
||||
def start_element (self, tag, attrs, element_text=None):
|
||||
def start_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
|
||||
if tag == 'meta' and attrs.get('name') == 'robots':
|
||||
val = attrs.get('content', u'').lower().split(u',')
|
||||
|
|
@ -177,10 +174,10 @@ class LinkFinder (TagFinder):
|
|||
self.tags[tag].update(self.universal_attrs)
|
||||
self.base_ref = u''
|
||||
|
||||
def start_element (self, tag, attrs, element_text=None):
|
||||
def start_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""Search for links and store found URLs in a list."""
|
||||
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
||||
log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column())
|
||||
log.debug(LOG_CHECK, "line %d col %d", lineno, column)
|
||||
if tag == "base" and not self.base_ref:
|
||||
self.base_ref = attrs.get("href", u'')
|
||||
tagattrs = self.tags.get(tag, self.universal_attrs)
|
||||
|
|
@ -205,7 +202,7 @@ class LinkFinder (TagFinder):
|
|||
value = value.split(':', 1)[1]
|
||||
value = 'dns:' + value.rstrip('/')
|
||||
# parse tag for URLs
|
||||
self.parse_tag(tag, attr, value, name, base)
|
||||
self.parse_tag(tag, attr, value, name, base, lineno, column)
|
||||
log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)
|
||||
|
||||
def get_link_name (self, tag, attrs, attr, name=None):
|
||||
|
|
@ -221,7 +218,7 @@ class LinkFinder (TagFinder):
|
|||
name = u""
|
||||
return name
|
||||
|
||||
def parse_tag (self, tag, attr, value, name, base):
|
||||
def parse_tag (self, tag, attr, value, name, base, lineno, column):
|
||||
"""Add given url data to url list."""
|
||||
assert isinstance(tag, str_text), repr(tag)
|
||||
assert isinstance(attr, str_text), repr(attr)
|
||||
|
|
@ -232,25 +229,24 @@ class LinkFinder (TagFinder):
|
|||
if tag == u'meta' and value:
|
||||
mo = refresh_re.match(value)
|
||||
if mo:
|
||||
self.found_url(mo.group("url"), name, base)
|
||||
self.found_url(mo.group("url"), name, base, lineno, column)
|
||||
elif attr != 'content':
|
||||
self.found_url(value, name, base)
|
||||
self.found_url(value, name, base, lineno, column)
|
||||
elif attr == u'style' and value:
|
||||
for mo in css_url_re.finditer(value):
|
||||
url = unquote(mo.group("url"), matching=True)
|
||||
self.found_url(url, name, base)
|
||||
self.found_url(url, name, base, lineno, column)
|
||||
elif attr == u'archive':
|
||||
for url in value.split(u','):
|
||||
self.found_url(url, name, base)
|
||||
self.found_url(url, name, base, lineno, column)
|
||||
elif attr == u'srcset':
|
||||
for img_candidate in value.split(u','):
|
||||
url = img_candidate.split()[0]
|
||||
self.found_url(url, name, base)
|
||||
self.found_url(url, name, base, lineno, column)
|
||||
else:
|
||||
self.found_url(value, name, base)
|
||||
self.found_url(value, name, base, lineno, column)
|
||||
|
||||
def found_url(self, url, name, base):
|
||||
def found_url(self, url, name, base, lineno, column):
|
||||
"""Add newly found URL to queue."""
|
||||
assert isinstance(url, str_text) or url is None, repr(url)
|
||||
self.callback(url, line=self.parser.lineno(),
|
||||
column=self.parser.column(), name=name, base=base)
|
||||
self.callback(url, line=lineno, column=column, name=name, base=base)
|
||||
|
|
|
|||
|
|
@ -120,18 +120,13 @@ def find_links (url_data, callback, tags):
|
|||
# construct parser object
|
||||
handler = linkparse.LinkFinder(callback, tags)
|
||||
parser = htmlsax.parser(handler)
|
||||
handler.parser = parser
|
||||
# parse
|
||||
try:
|
||||
soup = url_data.get_soup()
|
||||
parser.feed_soup(soup)
|
||||
parser.flush()
|
||||
except linkparse.StopParse as msg:
|
||||
log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
|
||||
pass
|
||||
# break cyclic dependencies
|
||||
handler.parser = None
|
||||
parser.handler = None
|
||||
|
||||
|
||||
def parse_firefox (url_data):
|
||||
|
|
|
|||
|
|
@ -15,50 +15,12 @@
|
|||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Default HTML parser handler classes.
|
||||
HTML parser handler test class.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
class HtmlPrinter:
|
||||
"""
|
||||
Handles all functions by printing the function name and attributes.
|
||||
"""
|
||||
|
||||
def __init__ (self, fd=sys.stdout):
|
||||
"""
|
||||
Write to given file descriptor.
|
||||
|
||||
@param fd: file like object (default=sys.stdout)
|
||||
@type fd: file
|
||||
"""
|
||||
self.fd = fd
|
||||
|
||||
def _print (self, *attrs):
|
||||
"""
|
||||
Print function attributes to stored file descriptor.
|
||||
|
||||
@param attrs: list of values to print
|
||||
@type attrs: tuple
|
||||
@return: None
|
||||
"""
|
||||
self.fd.write(self.mem)
|
||||
self.fd.write(str(attrs))
|
||||
|
||||
def __getattr__ (self, name):
|
||||
"""
|
||||
Remember the called method name in self.mem.
|
||||
|
||||
@param name: attribute name
|
||||
@type name: string
|
||||
@return: method which just prints out its arguments
|
||||
@rtype: a bound function object
|
||||
"""
|
||||
self.mem = name
|
||||
return self._print
|
||||
|
||||
|
||||
class HtmlPrettyPrinter:
|
||||
"""
|
||||
Print out all parsed HTML data in encoded form.
|
||||
|
|
@ -77,7 +39,7 @@ class HtmlPrettyPrinter:
|
|||
self.fd = fd
|
||||
self.encoding = encoding
|
||||
|
||||
def start_element (self, tag, attrs, element_text=None):
|
||||
def start_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""
|
||||
Print HTML start element.
|
||||
|
||||
|
|
@ -89,7 +51,7 @@ class HtmlPrettyPrinter:
|
|||
"""
|
||||
self._start_element(tag, attrs, ">", element_text)
|
||||
|
||||
def start_end_element (self, tag, attrs, element_text=None):
|
||||
def start_end_element (self, tag, attrs, element_text, lineno, column):
|
||||
"""
|
||||
Print HTML start-end element.
|
||||
|
||||
|
|
@ -101,7 +63,7 @@ class HtmlPrettyPrinter:
|
|||
"""
|
||||
self._start_element(tag, attrs, "/>", element_text)
|
||||
|
||||
def _start_element (self, tag, attrs, end, element_text=None):
|
||||
def _start_element (self, tag, attrs, end, element_text):
|
||||
"""
|
||||
Print HTML element with end string.
|
||||
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ Test linkparser routines.
|
|||
|
||||
import unittest
|
||||
from linkcheck.htmlutil import linkparse
|
||||
import linkcheck.HtmlParser.htmlsax
|
||||
from linkcheck.HtmlParser import htmlsax
|
||||
|
||||
|
||||
class TestLinkparser (unittest.TestCase):
|
||||
|
|
@ -31,15 +31,11 @@ class TestLinkparser (unittest.TestCase):
|
|||
def _test_one_link (self, content, url):
|
||||
self.count_url = 0
|
||||
h = linkparse.LinkFinder(self._test_one_url(url), linkparse.LinkTags)
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
h.parser = p
|
||||
p = htmlsax.parser(h)
|
||||
try:
|
||||
p.feed(content)
|
||||
p.flush()
|
||||
p.feed_soup(htmlsax.make_soup(content))
|
||||
except linkparse.StopParse:
|
||||
pass
|
||||
h.parser = None
|
||||
p.handler = None
|
||||
self.assertEqual(self.count_url, 1)
|
||||
|
||||
def _test_one_url (self, origurl):
|
||||
|
|
@ -53,15 +49,11 @@ class TestLinkparser (unittest.TestCase):
|
|||
def callback (url, line, column, name, base):
|
||||
self.assertTrue(False, 'URL %r found' % url)
|
||||
h = linkparse.LinkFinder(callback, linkparse.LinkTags)
|
||||
p = linkcheck.HtmlParser.htmlsax.parser(h)
|
||||
h.parser = p
|
||||
p = htmlsax.parser(h)
|
||||
try:
|
||||
p.feed(content)
|
||||
p.flush()
|
||||
p.feed_soup(htmlsax.make_soup(content))
|
||||
except linkparse.StopParse:
|
||||
pass
|
||||
h.parser = None
|
||||
p.handler = None
|
||||
|
||||
def test_href_parsing (self):
|
||||
# Test <a href> parsing.
|
||||
|
|
|
|||
|
|
@ -18,14 +18,14 @@
|
|||
Test html parsing.
|
||||
"""
|
||||
|
||||
import linkcheck.HtmlParser.htmlsax
|
||||
from linkcheck.HtmlParser import htmlsax
|
||||
|
||||
from io import StringIO
|
||||
import unittest
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from .htmllib import HtmlPrinter, HtmlPrettyPrinter
|
||||
from .htmllib import HtmlPrettyPrinter
|
||||
|
||||
# list of tuples
|
||||
# (<test pattern>, <expected parse output>)
|
||||
|
|
@ -137,70 +137,23 @@ class TestParser (unittest.TestCase):
|
|||
Test html parser.
|
||||
"""
|
||||
|
||||
def setUp (self):
|
||||
"""
|
||||
Initialize two internal html parsers to be used for testing.
|
||||
"""
|
||||
self.htmlparser = linkcheck.HtmlParser.htmlsax.parser()
|
||||
self.htmlparser2 = linkcheck.HtmlParser.htmlsax.parser()
|
||||
|
||||
@parameterized.expand(parsetests)
|
||||
def test_parse (self, _in, _out):
|
||||
# Parse all test patterns in one go.
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
self.htmlparser.feed(_in)
|
||||
self.check_results(self.htmlparser, _in, _out, out)
|
||||
parser = htmlsax.parser(handler)
|
||||
parser.feed_soup(htmlsax.make_soup(_in))
|
||||
self.check_results(_in, _out, out)
|
||||
|
||||
def check_results (self, htmlparser, _in, _out, out):
|
||||
def check_results (self, _in, _out, out):
|
||||
"""
|
||||
Check parse results.
|
||||
"""
|
||||
htmlparser.flush()
|
||||
res = out.getvalue()
|
||||
msg = "Test error; in: %r, out: %r, expect: %r" % \
|
||||
(_in, res, _out)
|
||||
self.assertEqual(res, _out, msg=msg)
|
||||
htmlparser.reset()
|
||||
|
||||
@parameterized.expand(parsetests)
|
||||
def test_feed (self, _in, _out):
|
||||
# Parse all test patterns sequentially.
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
for c in _in:
|
||||
self.htmlparser.feed(c)
|
||||
self.check_results(self.htmlparser, _in, _out, out)
|
||||
|
||||
@parameterized.expand(parsetests)
|
||||
def test_interwoven (self, _in, _out):
|
||||
# Parse all test patterns on two parsers interwoven.
|
||||
out = StringIO()
|
||||
out2 = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
handler2 = HtmlPrettyPrinter(out2)
|
||||
self.htmlparser2.handler = handler2
|
||||
for c in _in:
|
||||
self.htmlparser.feed(c)
|
||||
self.htmlparser2.feed(c)
|
||||
self.check_results(self.htmlparser, _in, _out, out)
|
||||
self.check_results(self.htmlparser2, _in, _out, out2)
|
||||
|
||||
@parameterized.expand(parsetests)
|
||||
def test_handler (self, _in, _out):
|
||||
out = StringIO()
|
||||
out2 = StringIO()
|
||||
handler = HtmlPrinter(out)
|
||||
self.htmlparser.handler = handler
|
||||
handler2 = HtmlPrinter(out2)
|
||||
self.htmlparser2.handler = handler2
|
||||
for c in _in:
|
||||
self.htmlparser.feed(c)
|
||||
self.htmlparser2.feed(c)
|
||||
self.assertEqual(out.getvalue(), out2.getvalue())
|
||||
|
||||
def test_encoding_detection_utf_content (self):
|
||||
html = b'<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
|
||||
|
|
@ -227,11 +180,9 @@ class TestParser (unittest.TestCase):
|
|||
self.encoding_test(html, "ascii")
|
||||
|
||||
def encoding_test (self, html, expected):
|
||||
parser = linkcheck.HtmlParser.htmlsax.parser()
|
||||
self.assertEqual(parser.encoding, None)
|
||||
out = StringIO()
|
||||
handler = HtmlPrettyPrinter(out)
|
||||
parser.handler = handler
|
||||
parser.feed(html)
|
||||
parser.flush()
|
||||
self.assertEqual(parser.encoding, expected)
|
||||
parser = htmlsax.parser(handler)
|
||||
soup = htmlsax.make_soup(html)
|
||||
parser.feed_soup(soup)
|
||||
self.assertEqual(soup.original_encoding, expected)
|
||||
|
|
|
|||
Loading…
Reference in a new issue