diff --git a/linkcheck/HtmlParser/htmlsax.py b/linkcheck/HtmlParser/htmlsax.py index a7ad30b5..70a02d9c 100644 --- a/linkcheck/HtmlParser/htmlsax.py +++ b/linkcheck/HtmlParser/htmlsax.py @@ -43,6 +43,10 @@ class Parser(object): def reset(self): self.html_doc = None + self.tag_lineno = None + self.tag_column = None + self.last_tag_lineno = None + self.last_tag_column = None def parse_contents(self, contents): for content in contents: @@ -57,6 +61,11 @@ class Parser(object): if v == b'': v = u'' attrs[k] = v + self.last_tag_lineno = self.tag_lineno + self.last_tag_column = self.tag_column + self.tag_lineno = content.sourceline + self.tag_column = None if content.sourcepos is None \ + else content.sourcepos + 1 if content.is_empty_element: self.handler.start_end_element( content.name, attrs, content.text.strip(), @@ -99,21 +108,16 @@ class Parser(object): raise NotImplementedError("debug is not implemented") def lineno(self): - # It seems, that getting line number of element is not - # implemented in BeautifulSoup, so this is faked - return 0 + return self.tag_lineno def last_lineno(self): - return 0 + return self.last_tag_lineno def column(self): - return 0 + return self.tag_column def last_column(self): - return 0 - - def pos(self, text): - return 0 + return self.last_tag_column def parser(handler=None): diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 75ca1820..e857a7de 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -68,8 +68,8 @@ def absolute_url (base_url, base_ref, parent_url): def get_url_from (base_url, recursion_level, aggregate, - parent_url=None, base_ref=None, line=0, column=0, page=0, - name=u"", parent_content_type=None, extern=None): + parent_url=None, base_ref=None, line=None, column=None, + page=0, name=u"", parent_content_type=None, extern=None): """ Get url data from given base data. 
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 0f50621e..b3a30512 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -704,8 +704,8 @@ class UrlBase (object): u"base_ref=%r" % self.base_ref, u"recursion_level=%d" % self.recursion_level, u"url_connection=%s" % self.url_connection, - u"line=%d" % self.line, - u"column=%d" % self.column, + u"line=%s" % self.line, + u"column=%s" % self.column, u"page=%d" % self.page, u"name=%r" % self.name, u"anchor=%r" % self.anchor, @@ -791,9 +791,9 @@ class UrlBase (object): - url_data.info: list of unicode Additional information about this URL. - url_data.line: int Line number of this URL at parent document, or -1 + Line number of this URL at parent document, or None - url_data.column: int Column number of this URL at parent document, or -1 + Column number of this URL at parent document, or None - url_data.page: int Page number of this URL at parent document, or -1 - url_data.cache_url: unicode diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index 20d45a38..f706e272 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -181,7 +181,7 @@ class LinkFinder (TagFinder): def start_element (self, tag, attrs, element_text=None): """Search for links and store found URLs in a list.""" log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs) - log.debug(LOG_CHECK, "line %d col %d old line %d old col %d", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column()) + log.debug(LOG_CHECK, "line %s col %s old line %s old col %s", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column()) if tag == "base" and not self.base_ref: self.base_ref = attrs.get_true("href", u'') tagattrs = self.tags.get(tag, self.universal_attrs) @@ -253,5 +253,5 @@ class LinkFinder (TagFinder): def found_url(self, url, name, base): """Add newly found URL to 
queue.""" assert isinstance(url, str_text) or url is None, repr(url) - self.callback(url, line=self.parser.last_lineno(), - column=self.parser.last_column(), name=name, base=base) + self.callback(url, line=self.parser.lineno(), + column=self.parser.column(), name=name, base=base) diff --git a/linkcheck/logger/csvlog.py b/linkcheck/logger/csvlog.py index 13c1bc8e..57dddd71 100644 --- a/linkcheck/logger/csvlog.py +++ b/linkcheck/logger/csvlog.py @@ -101,9 +101,9 @@ class CSVLogger (_Logger): row.append(url_data.valid) if self.has_part("url"): row.append(url_data.url) - if self.has_part("line"): + if self.has_part("line") and url_data.line is not None: row.append(url_data.line) - if self.has_part("column"): + if self.has_part("column") and url_data.column is not None: row.append(url_data.column) if self.has_part("name"): row.append(url_data.name) diff --git a/linkcheck/logger/customxml.py b/linkcheck/logger/customxml.py index d576855f..8352445e 100644 --- a/linkcheck/logger/customxml.py +++ b/linkcheck/logger/customxml.py @@ -54,8 +54,8 @@ class CustomXMLLogger (xmllog._XMLLogger): self.xml_tag(u"name", str_text(url_data.name)) if url_data.parent_url and self.has_part('parenturl'): attrs = { - u'line': u"%d" % url_data.line, - u'column': u"%d" % url_data.column, + u'line': u"%s" % url_data.line, + u'column': u"%s" % url_data.column, } self.xml_tag(u"parent", str_text(url_data.parent_url), attrs=attrs) diff --git a/linkcheck/logger/html.py b/linkcheck/logger/html.py index 09ac28ff..cdcdab0a 100644 --- a/linkcheck/logger/html.py +++ b/linkcheck/logger/html.py @@ -191,9 +191,9 @@ class HtmlLogger (_Logger): u'