mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-05 05:04:46 +00:00
Support Beautiful Soup line numbers
This commit is contained in:
parent
e46fb7fe9c
commit
607328d5c5
13 changed files with 93 additions and 36 deletions
|
|
@ -43,6 +43,10 @@ class Parser(object):
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.html_doc = None
|
self.html_doc = None
|
||||||
|
self.tag_lineno = None
|
||||||
|
self.tag_column = None
|
||||||
|
self.last_tag_lineno = None
|
||||||
|
self.last_tag_column = None
|
||||||
|
|
||||||
def parse_contents(self, contents):
|
def parse_contents(self, contents):
|
||||||
for content in contents:
|
for content in contents:
|
||||||
|
|
@ -57,6 +61,11 @@ class Parser(object):
|
||||||
if v == b'':
|
if v == b'':
|
||||||
v = u''
|
v = u''
|
||||||
attrs[k] = v
|
attrs[k] = v
|
||||||
|
self.last_tag_lineno = self.tag_lineno
|
||||||
|
self.last_tag_column = self.tag_column
|
||||||
|
self.tag_lineno = content.sourceline
|
||||||
|
self.tag_column = None if content.sourcepos is None \
|
||||||
|
else content.sourcepos + 1
|
||||||
if content.is_empty_element:
|
if content.is_empty_element:
|
||||||
self.handler.start_end_element(
|
self.handler.start_end_element(
|
||||||
content.name, attrs, content.text.strip(),
|
content.name, attrs, content.text.strip(),
|
||||||
|
|
@ -99,21 +108,16 @@ class Parser(object):
|
||||||
raise NotImplementedError("debug is not implemented")
|
raise NotImplementedError("debug is not implemented")
|
||||||
|
|
||||||
def lineno(self):
|
def lineno(self):
|
||||||
# It seems, that getting line number of element is not
|
return self.tag_lineno
|
||||||
# implemented in BeautifulSoup, so this is faked
|
|
||||||
return 0
|
|
||||||
|
|
||||||
def last_lineno(self):
|
def last_lineno(self):
|
||||||
return 0
|
return self.last_tag_lineno
|
||||||
|
|
||||||
def column(self):
|
def column(self):
|
||||||
return 0
|
return self.tag_column
|
||||||
|
|
||||||
def last_column(self):
|
def last_column(self):
|
||||||
return 0
|
return self.last_tag_column
|
||||||
|
|
||||||
def pos(self, text):
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
def parser(handler=None):
|
def parser(handler=None):
|
||||||
|
|
|
||||||
|
|
@ -68,8 +68,8 @@ def absolute_url (base_url, base_ref, parent_url):
|
||||||
|
|
||||||
|
|
||||||
def get_url_from (base_url, recursion_level, aggregate,
|
def get_url_from (base_url, recursion_level, aggregate,
|
||||||
parent_url=None, base_ref=None, line=0, column=0, page=0,
|
parent_url=None, base_ref=None, line=None, column=None,
|
||||||
name=u"", parent_content_type=None, extern=None):
|
page=0, name=u"", parent_content_type=None, extern=None):
|
||||||
"""
|
"""
|
||||||
Get url data from given base data.
|
Get url data from given base data.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -704,8 +704,8 @@ class UrlBase (object):
|
||||||
u"base_ref=%r" % self.base_ref,
|
u"base_ref=%r" % self.base_ref,
|
||||||
u"recursion_level=%d" % self.recursion_level,
|
u"recursion_level=%d" % self.recursion_level,
|
||||||
u"url_connection=%s" % self.url_connection,
|
u"url_connection=%s" % self.url_connection,
|
||||||
u"line=%d" % self.line,
|
u"line=%s" % self.line,
|
||||||
u"column=%d" % self.column,
|
u"column=%s" % self.column,
|
||||||
u"page=%d" % self.page,
|
u"page=%d" % self.page,
|
||||||
u"name=%r" % self.name,
|
u"name=%r" % self.name,
|
||||||
u"anchor=%r" % self.anchor,
|
u"anchor=%r" % self.anchor,
|
||||||
|
|
@ -791,9 +791,9 @@ class UrlBase (object):
|
||||||
- url_data.info: list of unicode
|
- url_data.info: list of unicode
|
||||||
Additional information about this URL.
|
Additional information about this URL.
|
||||||
- url_data.line: int
|
- url_data.line: int
|
||||||
Line number of this URL at parent document, or -1
|
Line number of this URL at parent document, or None
|
||||||
- url_data.column: int
|
- url_data.column: int
|
||||||
Column number of this URL at parent document, or -1
|
Column number of this URL at parent document, or None
|
||||||
- url_data.page: int
|
- url_data.page: int
|
||||||
Page number of this URL at parent document, or -1
|
Page number of this URL at parent document, or -1
|
||||||
- url_data.cache_url: unicode
|
- url_data.cache_url: unicode
|
||||||
|
|
|
||||||
|
|
@ -181,7 +181,7 @@ class LinkFinder (TagFinder):
|
||||||
def start_element (self, tag, attrs, element_text=None):
|
def start_element (self, tag, attrs, element_text=None):
|
||||||
"""Search for links and store found URLs in a list."""
|
"""Search for links and store found URLs in a list."""
|
||||||
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
||||||
log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
|
log.debug(LOG_CHECK, "line %d col %d old line %s old col %s", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
|
||||||
if tag == "base" and not self.base_ref:
|
if tag == "base" and not self.base_ref:
|
||||||
self.base_ref = attrs.get_true("href", u'')
|
self.base_ref = attrs.get_true("href", u'')
|
||||||
tagattrs = self.tags.get(tag, self.universal_attrs)
|
tagattrs = self.tags.get(tag, self.universal_attrs)
|
||||||
|
|
@ -253,5 +253,5 @@ class LinkFinder (TagFinder):
|
||||||
def found_url(self, url, name, base):
|
def found_url(self, url, name, base):
|
||||||
"""Add newly found URL to queue."""
|
"""Add newly found URL to queue."""
|
||||||
assert isinstance(url, str_text) or url is None, repr(url)
|
assert isinstance(url, str_text) or url is None, repr(url)
|
||||||
self.callback(url, line=self.parser.last_lineno(),
|
self.callback(url, line=self.parser.lineno(),
|
||||||
column=self.parser.last_column(), name=name, base=base)
|
column=self.parser.column(), name=name, base=base)
|
||||||
|
|
|
||||||
|
|
@ -101,9 +101,9 @@ class CSVLogger (_Logger):
|
||||||
row.append(url_data.valid)
|
row.append(url_data.valid)
|
||||||
if self.has_part("url"):
|
if self.has_part("url"):
|
||||||
row.append(url_data.url)
|
row.append(url_data.url)
|
||||||
if self.has_part("line"):
|
if self.has_part("line") and url_data.line is not None:
|
||||||
row.append(url_data.line)
|
row.append(url_data.line)
|
||||||
if self.has_part("column"):
|
if self.has_part("column") and url_data.column is not None:
|
||||||
row.append(url_data.column)
|
row.append(url_data.column)
|
||||||
if self.has_part("name"):
|
if self.has_part("name"):
|
||||||
row.append(url_data.name)
|
row.append(url_data.name)
|
||||||
|
|
|
||||||
|
|
@ -54,8 +54,8 @@ class CustomXMLLogger (xmllog._XMLLogger):
|
||||||
self.xml_tag(u"name", str_text(url_data.name))
|
self.xml_tag(u"name", str_text(url_data.name))
|
||||||
if url_data.parent_url and self.has_part('parenturl'):
|
if url_data.parent_url and self.has_part('parenturl'):
|
||||||
attrs = {
|
attrs = {
|
||||||
u'line': u"%d" % url_data.line,
|
u'line': u"%s" % url_data.line,
|
||||||
u'column': u"%d" % url_data.column,
|
u'column': u"%s" % url_data.column,
|
||||||
}
|
}
|
||||||
self.xml_tag(u"parent", str_text(url_data.parent_url),
|
self.xml_tag(u"parent", str_text(url_data.parent_url),
|
||||||
attrs=attrs)
|
attrs=attrs)
|
||||||
|
|
|
||||||
|
|
@ -191,9 +191,9 @@ class HtmlLogger (_Logger):
|
||||||
u'</td><td><a target="top" href="'+
|
u'</td><td><a target="top" href="'+
|
||||||
url_data.parent_url+u'">'+
|
url_data.parent_url+u'">'+
|
||||||
html_escape(url_data.parent_url)+u"</a>")
|
html_escape(url_data.parent_url)+u"</a>")
|
||||||
if url_data.line > 0:
|
if url_data.line is not None:
|
||||||
self.write(_(", line %d") % url_data.line)
|
self.write(_(", line %d") % url_data.line)
|
||||||
if url_data.column > 0:
|
if url_data.column is not None:
|
||||||
self.write(_(", col %d") % url_data.column)
|
self.write(_(", col %d") % url_data.column)
|
||||||
if url_data.page > 0:
|
if url_data.page > 0:
|
||||||
self.write(_(", page %d") % url_data.page)
|
self.write(_(", page %d") % url_data.page)
|
||||||
|
|
|
||||||
|
|
@ -99,8 +99,8 @@ class SQLLogger (_Logger):
|
||||||
"%(warning)s,"
|
"%(warning)s,"
|
||||||
"%(info)s,"
|
"%(info)s,"
|
||||||
"%(url)s,"
|
"%(url)s,"
|
||||||
"%(line)d,"
|
"%(line)s,"
|
||||||
"%(column)d,"
|
"%(column)s,"
|
||||||
"%(name)s,"
|
"%(name)s,"
|
||||||
"%(checktime)d,"
|
"%(checktime)d,"
|
||||||
"%(dltime)d,"
|
"%(dltime)d,"
|
||||||
|
|
@ -118,8 +118,8 @@ class SQLLogger (_Logger):
|
||||||
'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)),
|
'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)),
|
||||||
'info': sqlify(os.linesep.join(url_data.info)),
|
'info': sqlify(os.linesep.join(url_data.info)),
|
||||||
'url': sqlify(urlutil.url_quote(url_data.url)),
|
'url': sqlify(urlutil.url_quote(url_data.url)),
|
||||||
'line': url_data.line,
|
'line': 'NULL' if url_data.line is None else url_data.line,
|
||||||
'column': url_data.column,
|
'column': 'NULL' if url_data.column is None else url_data.column,
|
||||||
'name': sqlify(url_data.name),
|
'name': sqlify(url_data.name),
|
||||||
'checktime': url_data.checktime,
|
'checktime': url_data.checktime,
|
||||||
'dltime': url_data.dltime,
|
'dltime': url_data.dltime,
|
||||||
|
|
|
||||||
|
|
@ -155,9 +155,9 @@ class TextLogger (_Logger):
|
||||||
"""Write url_data.parent_url."""
|
"""Write url_data.parent_url."""
|
||||||
self.write(self.part('parenturl') + self.spaces("parenturl"))
|
self.write(self.part('parenturl') + self.spaces("parenturl"))
|
||||||
txt = url_data.parent_url
|
txt = url_data.parent_url
|
||||||
if url_data.line > 0:
|
if url_data.line is not None:
|
||||||
txt += _(", line %d") % url_data.line
|
txt += _(", line %d") % url_data.line
|
||||||
if url_data.column > 0:
|
if url_data.column is not None:
|
||||||
txt += _(", col %d") % url_data.column
|
txt += _(", col %d") % url_data.column
|
||||||
if url_data.page > 0:
|
if url_data.page > 0:
|
||||||
txt += _(", page %d") % url_data.page
|
txt += _(", page %d") % url_data.page
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,8 @@ cache key file://%(curdir)s/%(datadir)s/all_parts.html
|
||||||
real url file://%(curdir)s/%(datadir)s/all_parts.html
|
real url file://%(curdir)s/%(datadir)s/all_parts.html
|
||||||
name %(datadir)s/all_parts.html
|
name %(datadir)s/all_parts.html
|
||||||
valid
|
valid
|
||||||
line 0
|
line None
|
||||||
col 0
|
col None
|
||||||
size 184
|
size 184
|
||||||
parent_url
|
parent_url
|
||||||
page 0
|
page 0
|
||||||
|
|
@ -14,8 +14,8 @@ url base2.html
|
||||||
cache key file://%(curdir)s/%(datadir)s/base2.html
|
cache key file://%(curdir)s/%(datadir)s/base2.html
|
||||||
real url file://%(curdir)s/%(datadir)s/base2.html
|
real url file://%(curdir)s/%(datadir)s/base2.html
|
||||||
valid
|
valid
|
||||||
line 4
|
line None
|
||||||
col 1
|
col None
|
||||||
size 64
|
size 64
|
||||||
parent_url file://%(curdir)s/%(datadir)s/all_parts.html
|
parent_url file://%(curdir)s/%(datadir)s/all_parts.html
|
||||||
page 0
|
page 0
|
||||||
|
|
@ -25,8 +25,8 @@ url file.html
|
||||||
cache key file://%(curdir)s/%(datadir)s/file.html
|
cache key file://%(curdir)s/%(datadir)s/file.html
|
||||||
real url file://%(curdir)s/%(datadir)s/file.html
|
real url file://%(curdir)s/%(datadir)s/file.html
|
||||||
valid
|
valid
|
||||||
line 6
|
line None
|
||||||
col 1
|
col None
|
||||||
size 115
|
size 115
|
||||||
parent_url file://%(curdir)s/%(datadir)s/all_parts.html
|
parent_url file://%(curdir)s/%(datadir)s/all_parts.html
|
||||||
page 0
|
page 0
|
||||||
|
|
|
||||||
8
tests/checker/data/all_parts_linenos.html
Normal file
8
tests/checker/data/all_parts_linenos.html
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
<!-- base without href -->
|
||||||
|
<base target="_top">
|
||||||
|
<!-- meta url -->
|
||||||
|
<META HTTP-equiv="refresh" content="0; url=base2.html">
|
||||||
|
<!-- spaces between key and value -->
|
||||||
|
<a href
|
||||||
|
=
|
||||||
|
"file.html">
|
||||||
33
tests/checker/data/all_parts_linenos.html.result
Normal file
33
tests/checker/data/all_parts_linenos.html.result
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||||
|
cache key file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||||
|
real url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||||
|
name %(datadir)s/all_parts_linenos.html
|
||||||
|
valid
|
||||||
|
line None
|
||||||
|
col None
|
||||||
|
size 184
|
||||||
|
parent_url
|
||||||
|
page 0
|
||||||
|
|
||||||
|
content_type text/html
|
||||||
|
url base2.html
|
||||||
|
cache key file://%(curdir)s/%(datadir)s/base2.html
|
||||||
|
real url file://%(curdir)s/%(datadir)s/base2.html
|
||||||
|
valid
|
||||||
|
line 4
|
||||||
|
col 1
|
||||||
|
size 64
|
||||||
|
parent_url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||||
|
page 0
|
||||||
|
content_type text/html
|
||||||
|
|
||||||
|
url file.html
|
||||||
|
cache key file://%(curdir)s/%(datadir)s/file.html
|
||||||
|
real url file://%(curdir)s/%(datadir)s/file.html
|
||||||
|
valid
|
||||||
|
line 6
|
||||||
|
col 1
|
||||||
|
size 115
|
||||||
|
parent_url file://%(curdir)s/%(datadir)s/all_parts_linenos.html
|
||||||
|
page 0
|
||||||
|
content_type text/html
|
||||||
|
|
@ -17,9 +17,14 @@
|
||||||
"""
|
"""
|
||||||
Test http checking.
|
Test http checking.
|
||||||
"""
|
"""
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import pytest
|
||||||
|
|
||||||
from . import LinkCheckTest
|
from . import LinkCheckTest
|
||||||
from . import TestLogger
|
from . import TestLogger
|
||||||
|
|
||||||
|
bs_has_linenos = BeautifulSoup("<a>", "html.parser").a.sourceline is not None
|
||||||
|
|
||||||
class AllPartsLogger(TestLogger):
|
class AllPartsLogger(TestLogger):
|
||||||
logparts = [
|
logparts = [
|
||||||
'cachekey',
|
'cachekey',
|
||||||
|
|
@ -45,5 +50,12 @@ class TestAllParts(LinkCheckTest):
|
||||||
"""
|
"""
|
||||||
logger = AllPartsLogger
|
logger = AllPartsLogger
|
||||||
|
|
||||||
|
@pytest.mark.skipif(bs_has_linenos,
|
||||||
|
reason="Beautiful Soup supports line numbers")
|
||||||
def test_all_parts(self):
|
def test_all_parts(self):
|
||||||
self.file_test("all_parts.html")
|
self.file_test("all_parts.html")
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not bs_has_linenos,
|
||||||
|
reason="Beautiful Soup does not support line numbers")
|
||||||
|
def test_all_parts_linenos(self):
|
||||||
|
self.file_test("all_parts_linenos.html")
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue