diff --git a/linkcheck/htmlutil/__init__.py b/linkcheck/htmlutil/__init__.py new file mode 100644 index 00000000..93a2a97d --- /dev/null +++ b/linkcheck/htmlutil/__init__.py @@ -0,0 +1,19 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2008 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +""" +HTML utils +""" diff --git a/linkcheck/linkname.py b/linkcheck/htmlutil/linkname.py similarity index 72% rename from linkcheck/linkname.py rename to linkcheck/htmlutil/linkname.py index 98a36478..e08f219b 100644 --- a/linkcheck/linkname.py +++ b/linkcheck/htmlutil/linkname.py @@ -15,23 +15,29 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. """ -Parse name of common link types. +Parse names of title tags and link types. """ import re -from . import HtmlParser -from . import strformat +from .. 
import HtmlParser, strformat imgtag_re = re.compile(r"(?i)\s+alt\s*=\s*"+\ r"""(?P<name>("[^"\n]*"|'[^'\n]*'|[^\s>]+))""") img_re = re.compile(r"""(?i)<\s*img\s+("[^"\n]*"|'[^'\n]*'|[^>])+>""") -endtag_re = re.compile(r"""(?i)</a\s*>""") + +def endtag_re (tag): + """Return matcher for given end tag""" + return re.compile(r"(?i)</%s\s*>" % tag) + +a_end_search = endtag_re("a").search +title_end_search = endtag_re("title").search def _unquote (txt): - """Resolve entities and markup from txt.""" + """Resolve entities and remove markup from txt.""" return HtmlParser.resolve_entities(strformat.remove_markup(txt)) + def image_name (txt): """Return the alt part of the first <img alt=""> tag in txt.""" mo = imgtag_re.search(txt) @@ -44,10 +50,21 @@ def image_name (txt): def href_name (txt): """Return the name part of the first <a href="">name</a> link in txt.""" name = u"" - endtag = endtag_re.search(txt) + endtag = a_end_search(txt) if not endtag: return name name = txt[:endtag.start()] if img_re.search(name): return image_name(name) return _unquote(name) + + +def title_name (txt): + """Return the name part of the first <title>name</title> in txt.""" + name = u"" + endtag = title_end_search(txt) + if not endtag: + return name + name = txt[:endtag.start()] + return _unquote(name) + diff --git a/linkcheck/linkparse.py b/linkcheck/htmlutil/linkparse.py similarity index 99% rename from linkcheck/linkparse.py rename to linkcheck/htmlutil/linkparse.py index 1d74e4d7..72168c3e 100644 --- a/linkcheck/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -19,7 +19,8 @@ Find link tags in HTML text. """ import re -from . import strformat, log, LOG_CHECK, linkname, url as urlutil +from .. import strformat, log, LOG_CHECK, url as urlutil +from . 
import linkname MAX_NAMELEN = 256 unquote = strformat.unquote diff --git a/linkcheck/htmlutil/titleparse.py b/linkcheck/htmlutil/titleparse.py new file mode 100644 index 00000000..ec3727dd --- /dev/null +++ b/linkcheck/htmlutil/titleparse.py @@ -0,0 +1,42 @@ +# -*- coding: iso-8859-1 -*- +# Copyright (C) 2001-2008 Bastian Kleineidam +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +""" +Find title tags in HTML text. +""" + +from .. import log, LOG_CHECK +from . import linkname + +MAX_TITLELEN = 256 + + +class TitleFinder (object): + + def __init__ (self, content): + """Store the content to search and reset the found title.""" + super(TitleFinder, self).__init__() + log.debug(LOG_CHECK, "HTML title parser") + self.content = content + self.title = None + + def start_element (self, tag, attrs): + """Extract the title text when a title start tag is seen.""" + if tag == 'title': + pos = self.parser.pos() + data = self.content[pos:pos+MAX_TITLELEN] + data = data.decode(self.parser.encoding, "ignore") + self.title = linkname.title_name(data)