diff --git a/linkcheck/htmlutil/__init__.py b/linkcheck/htmlutil/__init__.py
new file mode 100644
index 00000000..93a2a97d
--- /dev/null
+++ b/linkcheck/htmlutil/__init__.py
@@ -0,0 +1,19 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2008 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+HTML utils
+"""
diff --git a/linkcheck/linkname.py b/linkcheck/htmlutil/linkname.py
similarity index 72%
rename from linkcheck/linkname.py
rename to linkcheck/htmlutil/linkname.py
index 98a36478..e08f219b 100644
--- a/linkcheck/linkname.py
+++ b/linkcheck/htmlutil/linkname.py
@@ -15,23 +15,29 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
-Parse name of common link types.
+Parse names of title tags and link types.
"""
import re
-from . import HtmlParser
-from . import strformat
+from .. import HtmlParser, strformat
imgtag_re = re.compile(r"(?i)\s+alt\s*=\s*"+\
r"""(?P<name>("[^"\n]*"|'[^'\n]*'|[^\s>]+))""")  # group "name": the alt attribute value, surrounding quotes included
img_re = re.compile(r"""(?i)<\s*img\s+("[^"\n]*"|'[^'\n]*'|[^>])+>""")
-endtag_re = re.compile(r"""(?i)""")
+
+def endtag_re (tag):
+ """Return a compiled, case-insensitive regex matching the closing tag of the given tag name (for example </a> or </title >)."""
+ return re.compile(r"(?i)</%s\s*>" % re.escape(tag))
+
+a_end_search = endtag_re("a").search
+title_end_search = endtag_re("title").search
def _unquote (txt):
- """Resolve entities and markup from txt."""
+ """Resolve entities and remove markup from txt."""
return HtmlParser.resolve_entities(strformat.remove_markup(txt))
+
def image_name (txt):
"""Return the alt part of the first
tag in txt."""
mo = imgtag_re.search(txt)
@@ -44,10 +50,21 @@ def image_name (txt):
def href_name (txt):
"""Return the name part of the first <a href="">name</a> link in txt."""
name = u""
-endtag = endtag_re.search(txt)
+ endtag = a_end_search(txt)
if not endtag:
return name
name = txt[:endtag.start()]
if img_re.search(name):
return image_name(name)
return _unquote(name)
+
+
+def title_name (txt):
+ """Return the name part of the first <title>name</title> in txt, or an empty string if no closing title tag is found."""
+ name = u""
+ endtag = title_end_search(txt)
+ if not endtag:
+ return name
+ name = txt[:endtag.start()]  # keep only the text preceding the closing title tag
+ return _unquote(name)  # resolve entities and remove markup
+
diff --git a/linkcheck/linkparse.py b/linkcheck/htmlutil/linkparse.py
similarity index 99%
rename from linkcheck/linkparse.py
rename to linkcheck/htmlutil/linkparse.py
index 1d74e4d7..72168c3e 100644
--- a/linkcheck/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -19,7 +19,8 @@ Find link tags in HTML text.
"""
import re
-from . import strformat, log, LOG_CHECK, linkname, url as urlutil
+from .. import strformat, log, LOG_CHECK, url as urlutil
+from . import linkname
MAX_NAMELEN = 256
unquote = strformat.unquote
diff --git a/linkcheck/htmlutil/titleparse.py b/linkcheck/htmlutil/titleparse.py
new file mode 100644
index 00000000..ec3727dd
--- /dev/null
+++ b/linkcheck/htmlutil/titleparse.py
@@ -0,0 +1,42 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2001-2008 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Find title tags in HTML text.
+"""
+
+from .. import log, LOG_CHECK
+from . import linkname
+
+MAX_TITLELEN = 256
+
+
+class TitleFinder (object):
+ """Record the title text found in the given content when start_element() sees a title tag."""
+ def __init__ (self, content):
+ """Store the content to search; no title has been found yet."""
+ super(TitleFinder, self).__init__()
+ log.debug(LOG_CHECK, "HTML title parser")
+ self.content = content
+ self.title = None
+
+ def start_element (self, tag, attrs):
+ """On a title start tag, extract the title text from the content at the current parser position."""
+ if tag == 'title':
+ pos = self.parser.pos()  # NOTE(review): self.parser is not set in __init__; presumably attached by the caller before parsing - confirm
+ data = self.content[pos:pos+MAX_TITLELEN]  # look at no more than MAX_TITLELEN items of content after pos
+ data = data.decode(self.parser.encoding, "ignore")  # input that cannot be decoded is dropped, not raised
+ self.title = linkname.title_name(data)