mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-17 06:20:27 +00:00
Support Beautiful Soup line numbers
This commit is contained in:
parent
e46fb7fe9c
commit
607328d5c5
13 changed files with 93 additions and 36 deletions
|
|
@ -43,6 +43,10 @@ class Parser(object):
|
|||
|
||||
def reset(self):
|
||||
self.html_doc = None
|
||||
self.tag_lineno = None
|
||||
self.tag_column = None
|
||||
self.last_tag_lineno = None
|
||||
self.last_tag_column = None
|
||||
|
||||
def parse_contents(self, contents):
|
||||
for content in contents:
|
||||
|
|
@ -57,6 +61,11 @@ class Parser(object):
|
|||
if v == b'':
|
||||
v = u''
|
||||
attrs[k] = v
|
||||
self.last_tag_lineno = self.tag_lineno
|
||||
self.last_tag_column = self.tag_column
|
||||
self.tag_lineno = content.sourceline
|
||||
self.tag_column = None if content.sourcepos is None \
|
||||
else content.sourcepos + 1
|
||||
if content.is_empty_element:
|
||||
self.handler.start_end_element(
|
||||
content.name, attrs, content.text.strip(),
|
||||
|
|
@ -99,21 +108,16 @@ class Parser(object):
|
|||
raise NotImplementedError("debug is not implemented")
|
||||
|
||||
def lineno(self):
|
||||
# It seems, that getting line number of element is not
|
||||
# implemented in BeautifulSoup, so this is faked
|
||||
return 0
|
||||
return self.tag_lineno
|
||||
|
||||
def last_lineno(self):
|
||||
return 0
|
||||
return self.last_tag_lineno
|
||||
|
||||
def column(self):
|
||||
return 0
|
||||
return self.tag_column
|
||||
|
||||
def last_column(self):
|
||||
return 0
|
||||
|
||||
def pos(self, text):
|
||||
return 0
|
||||
return self.last_tag_column
|
||||
|
||||
|
||||
def parser(handler=None):
|
||||
|
|
|
|||
|
|
@ -68,8 +68,8 @@ def absolute_url (base_url, base_ref, parent_url):
|
|||
|
||||
|
||||
def get_url_from (base_url, recursion_level, aggregate,
|
||||
parent_url=None, base_ref=None, line=0, column=0, page=0,
|
||||
name=u"", parent_content_type=None, extern=None):
|
||||
parent_url=None, base_ref=None, line=None, column=None,
|
||||
page=0, name=u"", parent_content_type=None, extern=None):
|
||||
"""
|
||||
Get url data from given base data.
|
||||
|
||||
|
|
|
|||
|
|
@ -704,8 +704,8 @@ class UrlBase (object):
|
|||
u"base_ref=%r" % self.base_ref,
|
||||
u"recursion_level=%d" % self.recursion_level,
|
||||
u"url_connection=%s" % self.url_connection,
|
||||
u"line=%d" % self.line,
|
||||
u"column=%d" % self.column,
|
||||
u"line=%s" % self.line,
|
||||
u"column=%s" % self.column,
|
||||
u"page=%d" % self.page,
|
||||
u"name=%r" % self.name,
|
||||
u"anchor=%r" % self.anchor,
|
||||
|
|
@ -791,9 +791,9 @@ class UrlBase (object):
|
|||
- url_data.info: list of unicode
|
||||
Additional information about this URL.
|
||||
- url_data.line: int
|
||||
Line number of this URL at parent document, or -1
|
||||
Line number of this URL at parent document, or None
|
||||
- url_data.column: int
|
||||
Column number of this URL at parent document, or -1
|
||||
Column number of this URL at parent document, or None
|
||||
- url_data.page: int
|
||||
Page number of this URL at parent document, or -1
|
||||
- url_data.cache_url: unicode
|
||||
|
|
|
|||
|
|
@ -181,7 +181,7 @@ class LinkFinder (TagFinder):
|
|||
def start_element (self, tag, attrs, element_text=None):
|
||||
"""Search for links and store found URLs in a list."""
|
||||
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
||||
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
|
||||
log.debug(LOG_CHECK, "line %d col %d old line %s old col %s", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
|
||||
if tag == "base" and not self.base_ref:
|
||||
self.base_ref = attrs.get_true("href", u'')
|
||||
tagattrs = self.tags.get(tag, self.universal_attrs)
|
||||
|
|
@ -253,5 +253,5 @@ class LinkFinder (TagFinder):
|
|||
def found_url(self, url, name, base):
|
||||
"""Add newly found URL to queue."""
|
||||
assert isinstance(url, str_text) or url is None, repr(url)
|
||||
self.callback(url, line=self.parser.last_lineno(),
|
||||
column=self.parser.last_column(), name=name, base=base)
|
||||
self.callback(url, line=self.parser.lineno(),
|
||||
column=self.parser.column(), name=name, base=base)
|
||||
|
|
|
|||
|
|
@ -101,9 +101,9 @@ class CSVLogger (_Logger):
|
|||
row.append(url_data.valid)
|
||||
if self.has_part("url"):
|
||||
row.append(url_data.url)
|
||||
if self.has_part("line"):
|
||||
if self.has_part("line") and url_data.line is not None:
|
||||
row.append(url_data.line)
|
||||
if self.has_part("column"):
|
||||
if self.has_part("column") and url_data.column is not None:
|
||||
row.append(url_data.column)
|
||||
if self.has_part("name"):
|
||||
row.append(url_data.name)
|
||||
|
|
|
|||
|
|
@ -54,8 +54,8 @@ class CustomXMLLogger (xmllog._XMLLogger):
|
|||
self.xml_tag(u"name", str_text(url_data.name))
|
||||
if url_data.parent_url and self.has_part('parenturl'):
|
||||
attrs = {
|
||||
u'line': u"%d" % url_data.line,
|
||||
u'column': u"%d" % url_data.column,
|
||||
u'line': u"%s" % url_data.line,
|
||||
u'column': u"%s" % url_data.column,
|
||||
}
|
||||
self.xml_tag(u"parent", str_text(url_data.parent_url),
|
||||
attrs=attrs)
|
||||
|
|
|
|||
|
|
@ -191,9 +191,9 @@ class HtmlLogger (_Logger):
|
|||
u'</td><td><a target="top" href="'+
|
||||
url_data.parent_url+u'">'+
|
||||
html_escape(url_data.parent_url)+u"</a>")
|
||||
if url_data.line > 0:
|
||||
if url_data.line is not None:
|
||||
self.write(_(", line %d") % url_data.line)
|
||||
if url_data.column > 0:
|
||||
if url_data.column is not None:
|
||||
self.write(_(", col %d") % url_data.column)
|
||||
if url_data.page > 0:
|
||||
self.write(_(", page %d") % url_data.page)
|
||||
|
|
|
|||
|
|
@ -99,8 +99,8 @@ class SQLLogger (_Logger):
|
|||
"%(warning)s,"
|
||||
"%(info)s,"
|
||||
"%(url)s,"
|
||||
"%(line)d,"
|
||||
"%(column)d,"
|
||||
"%(line)s,"
|
||||
"%(column)s,"
|
||||
"%(name)s,"
|
||||
"%(checktime)d,"
|
||||
"%(dltime)d,"
|
||||
|
|
@ -118,8 +118,8 @@ class SQLLogger (_Logger):
|
|||
'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)),
|
||||
'info': sqlify(os.linesep.join(url_data.info)),
|
||||
'url': sqlify(urlutil.url_quote(url_data.url)),
|
||||
'line': url_data.line,
|
||||
'column': url_data.column,
|
||||
'line': 'NULL' if url_data.line is None else url_data.line,
|
||||
'column': 'NULL' if url_data.column is None else url_data.column,
|
||||
'name': sqlify(url_data.name),
|
||||
'checktime': url_data.checktime,
|
||||
'dltime': url_data.dltime,
|
||||
|
|
|
|||
|
|
@ -155,9 +155,9 @@ class TextLogger (_Logger):
|
|||
"""Write url_data.parent_url."""
|
||||
self.write(self.part('parenturl') + self.spaces("parenturl"))
|
||||
txt = url_data.parent_url
|
||||
if url_data.line > 0:
|
||||
if url_data.line is not None:
|
||||
txt += _(", line %d") % url_data.line
|
||||
if url_data.column > 0:
|
||||
if url_data.column is not None:
|
||||
txt += _(", col %d") % url_data.column
|
||||
if url_data.page > 0:
|
||||
txt += _(", page %d") % url_data.page
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@ cache key file://%(curdir)s/%(datadir)s/all_parts.html
|
|||
real url file://%(curdir)s/%(datadir)s/all_parts.html
|
||||
name %(datadir)s/all_parts.html
|
||||
valid
|
||||
line 0
|
||||
col 0
|
||||
line None
|
||||
col None
|
||||
size 184
|
||||
parent_url
|
||||
page 0
|
||||
|
|
@ -14,8 +14,8 @@ url base2.html
|
|||
cache key file://%(curdir)s/%(datadir)s/base2.html
|
||||
real url file://%(curdir)s/%(datadir)s/base2.html
|
||||
valid
|
||||
line 4
|
||||
col 1
|
||||
line None
|
||||
col None
|
||||
size 64
|
||||
parent_url file://%(curdir)s/%(datadir)s/all_parts.html
|
||||
page 0
|
||||
|
|
@ -25,8 +25,8 @@ url file.html
|
|||
cache key file://%(curdir)s/%(datadir)s/file.html
|
||||
real url file://%(curdir)s/%(datadir)s/file.html
|
||||
valid
|
||||
line 6
|
||||
col 1
|
||||
line None
|
||||
col None
|
||||
size 115
|
||||
parent_url file://%(curdir)s/%(datadir)s/all_parts.html
|
||||
page 0
|
||||
|
|
|
|||
8
tests/checker/data/all_parts_linenos.html
Normal file
8
tests/checker/data/all_parts_linenos.html
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
<!-- base without href -->
|
||||
<base target="_top">
|
||||
<!-- meta url -->
|
||||
<META HTTP-equiv="refresh" content="0; url=base2.html">
|
||||
<!-- spaces between key and value -->
|
||||
<a href
|
||||
=
|
||||
"file.html">
|
||||
33
tests/checker/data/all_parts_linenos.html.result
Normal file
33
tests/checker/data/all_parts_linenos.html.result
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||
cache key file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||
real url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||
name %(datadir)s/all_parts_linenos.html
|
||||
valid
|
||||
line None
|
||||
col None
|
||||
size 184
|
||||
parent_url
|
||||
page 0
|
||||
|
||||
content_type text/html
|
||||
url base2.html
|
||||
cache key file://%(curdir)s/%(datadir)s/base2.html
|
||||
real url file://%(curdir)s/%(datadir)s/base2.html
|
||||
valid
|
||||
line 4
|
||||
col 1
|
||||
size 64
|
||||
parent_url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||
page 0
|
||||
content_type text/html
|
||||
|
||||
url file.html
|
||||
cache key file://%(curdir)s/%(datadir)s/file.html
|
||||
real url file://%(curdir)s/%(datadir)s/file.html
|
||||
valid
|
||||
line 6
|
||||
col 1
|
||||
size 115
|
||||
parent_url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||
page 0
|
||||
content_type text/html
|
||||
|
|
@ -17,9 +17,14 @@
|
|||
"""
|
||||
Test http checking.
|
||||
"""
|
||||
from bs4 import BeautifulSoup
|
||||
import pytest
|
||||
|
||||
from . import LinkCheckTest
|
||||
from . import TestLogger
|
||||
|
||||
bs_has_linenos = BeautifulSoup("<a>", "html.parser").a.sourceline is not None
|
||||
|
||||
class AllPartsLogger(TestLogger):
|
||||
logparts = [
|
||||
'cachekey',
|
||||
|
|
@ -45,5 +50,12 @@ class TestAllParts(LinkCheckTest):
|
|||
"""
|
||||
logger = AllPartsLogger
|
||||
|
||||
@pytest.mark.skipif(bs_has_linenos,
|
||||
reason="Beautiful Soup supports line numbers")
|
||||
def test_all_parts(self):
|
||||
self.file_test("all_parts.html")
|
||||
|
||||
@pytest.mark.skipif(not bs_has_linenos,
|
||||
reason="Beautiful Soup does not support line numbers")
|
||||
def test_all_parts_linenos(self):
|
||||
self.file_test("all_parts_linenos.html")
|
||||
|
|
|
|||
Loading…
Reference in a new issue