diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index a7ad30b5..70a02d9c 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -43,6 +43,10 @@ class Parser(object): def reset(self): self.html_doc = None + self.tag_lineno = None + self.tag_column = None + self.last_tag_lineno = None + self.last_tag_column = None def parse_contents(self, contents): for content in contents: @@ -57,6 +61,11 @@ class Parser(object): if v == b'': v = u'' attrs[k] = v + self.last_tag_lineno = self.tag_lineno + self.last_tag_column = self.tag_column + self.tag_lineno = content.sourceline + self.tag_column = None if content.sourcepos is None \ + else content.sourcepos + 1 if content.is_empty_element: self.handler.start_end_element( content.name, attrs, content.text.strip(), @@ -99,21 +108,16 @@ class Parser(object): raise NotImplementedError("debug is not implemented") def lineno(self): - # It seems, that getting line number of element is not - # implemented in BeautifulSoup, so this is faked - return 0 + return self.tag_lineno def last_lineno(self): - return 0 + return self.last_tag_lineno def column(self): - return 0 + return self.tag_column def last_column(self): - return 0 - - def pos(self, text): - return 0 + return self.last_tag_column def parser(handler=None): diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 75ca1820..e857a7de 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -68,8 +68,8 @@ def absolute_url (base_url, base_ref, parent_url): def get_url_from (base_url, recursion_level, aggregate, - parent_url=None, base_ref=None, line=0, column=0, page=0, - name=u"", parent_content_type=None, extern=None): + parent_url=None, base_ref=None, line=None, column=None, + page=0, name=u"", parent_content_type=None, extern=None): """ Get url data from given base data. 
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 0f50621e..b3a30512 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -704,8 +704,8 @@ class UrlBase (object): u"base_ref=%r" % self.base_ref, u"recursion_level=%d" % self.recursion_level, u"url_connection=%s" % self.url_connection, - u"line=%d" % self.line, - u"column=%d" % self.column, + u"line=%s" % self.line, + u"column=%s" % self.column, u"page=%d" % self.page, u"name=%r" % self.name, u"anchor=%r" % self.anchor, @@ -791,9 +791,9 @@ class UrlBase (object): - url_data.info: list of unicode Additional information about this URL. - url_data.line: int - Line number of this URL at parent document, or -1 + Line number of this URL at parent document, or None - url_data.column: int - Column number of this URL at parent document, or -1 + Column number of this URL at parent document, or None - url_data.page: int Page number of this URL at parent document, or -1 - url_data.cache_url: unicode diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index 20d45a38..f706e272 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -181,7 +181,7 @@ class LinkFinder (TagFinder): def start_element (self, tag, attrs, element_text=None): """Search for links and store found URLs in a list.""" log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) - log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column()) + log.debug(LOG_CHECK, "line %s col %s old line %s old col %s", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column()) if tag == "base" and not self.base_ref: self.base_ref = attrs.get_true("href", u'') tagattrs = self.tags.get(tag, self.universal_attrs) @@ -253,5 +253,5 @@ class LinkFinder (TagFinder): def found_url(self, url, name, base): """Add newly found URL to
queue.""" assert isinstance(url, str_text) or url is None, repr(url) - self.callback(url, line=self.parser.last_lineno(), - column=self.parser.last_column(), name=name, base=base) + self.callback(url, line=self.parser.lineno(), + column=self.parser.column(), name=name, base=base) diff --git a/linkcheck/logger/csvlog.py b/linkcheck/logger/csvlog.py index 13c1bc8e..57dddd71 100644 --- a/linkcheck/logger/csvlog.py +++ b/linkcheck/logger/csvlog.py @@ -101,9 +101,9 @@ class CSVLogger (_Logger): row.append(url_data.valid) if self.has_part("url"): row.append(url_data.url) if self.has_part("line"): - row.append(url_data.line) + row.append(u"" if url_data.line is None else url_data.line) if self.has_part("column"): - row.append(url_data.column) + row.append(u"" if url_data.column is None else url_data.column) if self.has_part("name"): row.append(url_data.name) diff --git a/linkcheck/logger/customxml.py b/linkcheck/logger/customxml.py index d576855f..8352445e 100644 --- a/linkcheck/logger/customxml.py +++ b/linkcheck/logger/customxml.py @@ -54,8 +54,8 @@ class CustomXMLLogger (xmllog._XMLLogger): self.xml_tag(u"name", str_text(url_data.name)) if url_data.parent_url and self.has_part('parenturl'): attrs = { - u'line': u"%d" % url_data.line, - u'column': u"%d" % url_data.column, + u'line': u"%s" % url_data.line, + u'column': u"%s" % url_data.column, } self.xml_tag(u"parent", str_text(url_data.parent_url), attrs=attrs) diff --git a/linkcheck/logger/html.py b/linkcheck/logger/html.py index 09ac28ff..cdcdab0a 100644 --- a/linkcheck/logger/html.py +++ b/linkcheck/logger/html.py @@ -191,9 +191,9 @@ class HtmlLogger (_Logger): u''+ html_escape(url_data.parent_url)+u"") - if url_data.line > 0: + if url_data.line is not None: self.write(_(", line %d") % url_data.line) - if url_data.column > 0: + if url_data.column is not None: self.write(_(", col %d") % url_data.column) if url_data.page > 0: self.write(_(", page %d") % url_data.page) diff --git a/linkcheck/logger/sql.py
b/linkcheck/logger/sql.py index 8412c892..d9ec0de0 100644 --- a/linkcheck/logger/sql.py +++ b/linkcheck/logger/sql.py @@ -99,8 +99,8 @@ class SQLLogger (_Logger): "%(warning)s," "%(info)s," "%(url)s," - "%(line)d," - "%(column)d," + "%(line)s," + "%(column)s," "%(name)s," "%(checktime)d," "%(dltime)d," @@ -118,8 +118,8 @@ class SQLLogger (_Logger): 'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)), 'info': sqlify(os.linesep.join(url_data.info)), 'url': sqlify(urlutil.url_quote(url_data.url)), - 'line': url_data.line, - 'column': url_data.column, + 'line': 'NULL' if url_data.line is None else url_data.line, + 'column': 'NULL' if url_data.column is None else url_data.column, 'name': sqlify(url_data.name), 'checktime': url_data.checktime, 'dltime': url_data.dltime, diff --git a/linkcheck/logger/text.py b/linkcheck/logger/text.py index 100d5b45..53571277 100644 --- a/linkcheck/logger/text.py +++ b/linkcheck/logger/text.py @@ -155,9 +155,9 @@ class TextLogger (_Logger): """Write url_data.parent_url.""" self.write(self.part('parenturl') + self.spaces("parenturl")) txt = url_data.parent_url - if url_data.line > 0: + if url_data.line is not None: txt += _(", line %d") % url_data.line - if url_data.column > 0: + if url_data.column is not None: txt += _(", col %d") % url_data.column if url_data.page > 0: txt += _(", page %d") % url_data.page diff --git a/tests/checker/data/all_parts.html.result b/tests/checker/data/all_parts.html.result index 650012d9..4a125a35 100644 --- a/tests/checker/data/all_parts.html.result +++ b/tests/checker/data/all_parts.html.result @@ -3,8 +3,8 @@ cache key file://%(curdir)s/%(datadir)s/all_parts.html real url file://%(curdir)s/%(datadir)s/all_parts.html name %(datadir)s/all_parts.html valid -line 0 -col 0 +line None +col None size 184 parent_url page 0 @@ -14,8 +14,8 @@ url base2.html cache key file://%(curdir)s/%(datadir)s/base2.html real url file://%(curdir)s/%(datadir)s/base2.html valid -line 4 -col 1 +line None +col None 
size 64 parent_url file://%(curdir)s/%(datadir)s/all_parts.html page 0 @@ -25,8 +25,8 @@ url file.html cache key file://%(curdir)s/%(datadir)s/file.html real url file://%(curdir)s/%(datadir)s/file.html valid -line 6 -col 1 +line None +col None size 115 parent_url file://%(curdir)s/%(datadir)s/all_parts.html page 0 diff --git a/tests/checker/data/all_parts_linenos.html b/tests/checker/data/all_parts_linenos.html new file mode 100644 index 00000000..ba64efc0 --- /dev/null +++ b/tests/checker/data/all_parts_linenos.html @@ -0,0 +1,8 @@ + + + + + + diff --git a/tests/checker/data/all_parts_linenos.html.result b/tests/checker/data/all_parts_linenos.html.result new file mode 100644 index 00000000..09b2375c --- /dev/null +++ b/tests/checker/data/all_parts_linenos.html.result @@ -0,0 +1,33 @@ +url file://%(curdir)s/%(datadir)s/all_parts_linenos.html +cache key file://%(curdir)s/%(datadir)s/all_parts_linenos.html +real url file://%(curdir)s/%(datadir)s/all_parts_linenos.html +name %(datadir)s/all_parts_linenos.html +valid +line None +col None +size 184 +parent_url +page 0 + +content_type text/html +url base2.html +cache key file://%(curdir)s/%(datadir)s/base2.html +real url file://%(curdir)s/%(datadir)s/base2.html +valid +line 4 +col 1 +size 64 +parent_url file://%(curdir)s/%(datadir)s/all_parts_linenos.html +page 0 +content_type text/html + +url file.html +cache key file://%(curdir)s/%(datadir)s/file.html +real url file://%(curdir)s/%(datadir)s/file.html +valid +line 6 +col 1 +size 115 +parent_url file://%(curdir)s/%(datadir)s/all_parts_linenos.html +page 0 +content_type text/html diff --git a/tests/checker/test_all_parts.py b/tests/checker/test_all_parts.py index fedaefd4..1a9da793 100644 --- a/tests/checker/test_all_parts.py +++ b/tests/checker/test_all_parts.py @@ -17,9 +17,14 @@ """ Test http checking. """ +from bs4 import BeautifulSoup +import pytest + from . import LinkCheckTest from . 
import TestLogger +bs_has_linenos = BeautifulSoup("<a></a>", "html.parser").a.sourceline is not None + class AllPartsLogger(TestLogger): logparts = [ 'cachekey', @@ -45,5 +50,12 @@ class TestAllParts(LinkCheckTest): """ logger = AllPartsLogger + @pytest.mark.skipif(bs_has_linenos, + reason="Beautiful Soup supports line numbers") def test_all_parts(self): self.file_test("all_parts.html") + + @pytest.mark.skipif(not bs_has_linenos, + reason="Beautiful Soup does not support line numbers") + def test_all_parts_linenos(self): + self.file_test("all_parts_linenos.html")