Support Beautiful Soup line numbers

This commit is contained in:
Chris Mayo 2019-10-05 19:38:57 +01:00
parent e46fb7fe9c
commit 607328d5c5
13 changed files with 93 additions and 36 deletions

View file

@ -43,6 +43,10 @@ class Parser(object):
def reset(self):
self.html_doc = None
self.tag_lineno = None
self.tag_column = None
self.last_tag_lineno = None
self.last_tag_column = None
def parse_contents(self, contents):
for content in contents:
@ -57,6 +61,11 @@ class Parser(object):
if v == b'':
v = u''
attrs[k] = v
self.last_tag_lineno = self.tag_lineno
self.last_tag_column = self.tag_column
self.tag_lineno = content.sourceline
self.tag_column = None if content.sourcepos is None \
else content.sourcepos + 1
if content.is_empty_element:
self.handler.start_end_element(
content.name, attrs, content.text.strip(),
@ -99,21 +108,16 @@ class Parser(object):
raise NotImplementedError("debug is not implemented")
def lineno(self):
# It seems, that getting line number of element is not
# implemented in BeautifulSoup, so this is faked
return 0
return self.tag_lineno
def last_lineno(self):
return 0
return self.last_tag_lineno
def column(self):
return 0
return self.tag_column
def last_column(self):
return 0
def pos(self, text):
return 0
return self.last_tag_column
def parser(handler=None):

View file

@ -68,8 +68,8 @@ def absolute_url (base_url, base_ref, parent_url):
def get_url_from (base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=0, column=0, page=0,
name=u"", parent_content_type=None, extern=None):
parent_url=None, base_ref=None, line=None, column=None,
page=0, name=u"", parent_content_type=None, extern=None):
"""
Get url data from given base data.

View file

@ -704,8 +704,8 @@ class UrlBase (object):
u"base_ref=%r" % self.base_ref,
u"recursion_level=%d" % self.recursion_level,
u"url_connection=%s" % self.url_connection,
u"line=%d" % self.line,
u"column=%d" % self.column,
u"line=%s" % self.line,
u"column=%s" % self.column,
u"page=%d" % self.page,
u"name=%r" % self.name,
u"anchor=%r" % self.anchor,
@ -791,9 +791,9 @@ class UrlBase (object):
- url_data.info: list of unicode
Additional information about this URL.
- url_data.line: int
Line number of this URL at parent document, or -1
Line number of this URL at parent document, or None
- url_data.column: int
Column number of this URL at parent document, or -1
Column number of this URL at parent document, or None
- url_data.page: int
Page number of this URL at parent document, or -1
- url_data.cache_url: unicode

View file

@ -181,7 +181,7 @@ class LinkFinder (TagFinder):
def start_element (self, tag, attrs, element_text=None):
"""Search for links and store found URLs in a list."""
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
log.debug(LOG_CHECK, "line %d col %d old line %s old col %s", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
if tag == "base" and not self.base_ref:
self.base_ref = attrs.get_true("href", u'')
tagattrs = self.tags.get(tag, self.universal_attrs)
@ -253,5 +253,5 @@ class LinkFinder (TagFinder):
def found_url(self, url, name, base):
"""Add newly found URL to queue."""
assert isinstance(url, str_text) or url is None, repr(url)
self.callback(url, line=self.parser.last_lineno(),
column=self.parser.last_column(), name=name, base=base)
self.callback(url, line=self.parser.lineno(),
column=self.parser.column(), name=name, base=base)

View file

@ -101,9 +101,9 @@ class CSVLogger (_Logger):
row.append(url_data.valid)
if self.has_part("url"):
row.append(url_data.url)
if self.has_part("line"):
if self.has_part("line") and url_data.line is not None:
row.append(url_data.line)
if self.has_part("column"):
if self.has_part("column") and url_data.column is not None:
row.append(url_data.column)
if self.has_part("name"):
row.append(url_data.name)

View file

@ -54,8 +54,8 @@ class CustomXMLLogger (xmllog._XMLLogger):
self.xml_tag(u"name", str_text(url_data.name))
if url_data.parent_url and self.has_part('parenturl'):
attrs = {
u'line': u"%d" % url_data.line,
u'column': u"%d" % url_data.column,
u'line': u"%s" % url_data.line,
u'column': u"%s" % url_data.column,
}
self.xml_tag(u"parent", str_text(url_data.parent_url),
attrs=attrs)

View file

@ -191,9 +191,9 @@ class HtmlLogger (_Logger):
u'</td><td><a target="top" href="'+
url_data.parent_url+u'">'+
html_escape(url_data.parent_url)+u"</a>")
if url_data.line > 0:
if url_data.line is not None:
self.write(_(", line %d") % url_data.line)
if url_data.column > 0:
if url_data.column is not None:
self.write(_(", col %d") % url_data.column)
if url_data.page > 0:
self.write(_(", page %d") % url_data.page)

View file

@ -99,8 +99,8 @@ class SQLLogger (_Logger):
"%(warning)s,"
"%(info)s,"
"%(url)s,"
"%(line)d,"
"%(column)d,"
"%(line)s,"
"%(column)s,"
"%(name)s,"
"%(checktime)d,"
"%(dltime)d,"
@ -118,8 +118,8 @@ class SQLLogger (_Logger):
'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)),
'info': sqlify(os.linesep.join(url_data.info)),
'url': sqlify(urlutil.url_quote(url_data.url)),
'line': url_data.line,
'column': url_data.column,
'line': 'NULL' if url_data.line is None else url_data.line,
'column': 'NULL' if url_data.column is None else url_data.column,
'name': sqlify(url_data.name),
'checktime': url_data.checktime,
'dltime': url_data.dltime,

View file

@ -155,9 +155,9 @@ class TextLogger (_Logger):
"""Write url_data.parent_url."""
self.write(self.part('parenturl') + self.spaces("parenturl"))
txt = url_data.parent_url
if url_data.line > 0:
if url_data.line is not None:
txt += _(", line %d") % url_data.line
if url_data.column > 0:
if url_data.column is not None:
txt += _(", col %d") % url_data.column
if url_data.page > 0:
txt += _(", page %d") % url_data.page

View file

@ -3,8 +3,8 @@ cache key file://%(curdir)s/%(datadir)s/all_parts.html
real url file://%(curdir)s/%(datadir)s/all_parts.html
name %(datadir)s/all_parts.html
valid
line 0
col 0
line None
col None
size 184
parent_url
page 0
@ -14,8 +14,8 @@ url base2.html
cache key file://%(curdir)s/%(datadir)s/base2.html
real url file://%(curdir)s/%(datadir)s/base2.html
valid
line 4
col 1
line None
col None
size 64
parent_url file://%(curdir)s/%(datadir)s/all_parts.html
page 0
@ -25,8 +25,8 @@ url file.html
cache key file://%(curdir)s/%(datadir)s/file.html
real url file://%(curdir)s/%(datadir)s/file.html
valid
line 6
col 1
line None
col None
size 115
parent_url file://%(curdir)s/%(datadir)s/all_parts.html
page 0

View file

@ -0,0 +1,8 @@
<!-- base without href -->
<base target="_top">
<!-- meta url -->
<META HTTP-equiv="refresh" content="0; url=base2.html">
<!-- spaces between key and value -->
<a href
=
"file.html">

View file

@ -0,0 +1,33 @@
url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
cache key file://%(curdir)s/%(datadir)s/all_parts_linenos.html
real url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
name %(datadir)s/all_parts_linenos.html
valid
line None
col None
size 184
parent_url
page 0
content_type text/html
url base2.html
cache key file://%(curdir)s/%(datadir)s/base2.html
real url file://%(curdir)s/%(datadir)s/base2.html
valid
line 4
col 1
size 64
parent_url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
page 0
content_type text/html
url file.html
cache key file://%(curdir)s/%(datadir)s/file.html
real url file://%(curdir)s/%(datadir)s/file.html
valid
line 6
col 1
size 115
parent_url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
page 0
content_type text/html

View file

@ -17,9 +17,14 @@
"""
Test http checking.
"""
from bs4 import BeautifulSoup
import pytest
from . import LinkCheckTest
from . import TestLogger
bs_has_linenos = BeautifulSoup("<a>", "html.parser").a.sourceline is not None
class AllPartsLogger(TestLogger):
logparts = [
'cachekey',
@ -45,5 +50,12 @@ class TestAllParts(LinkCheckTest):
"""
logger = AllPartsLogger
@pytest.mark.skipif(bs_has_linenos,
reason="Beautiful Soup supports line numbers")
def test_all_parts(self):
self.file_test("all_parts.html")
@pytest.mark.skipif(not bs_has_linenos,
reason="Beautiful Soup does not support line numbers")
def test_all_parts_linenos(self):
self.file_test("all_parts_linenos.html")