enable meta refresh url parsing again

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@489 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2002-08-02 08:56:19 +00:00
parent 35b807854b
commit 53264c1fc9
2 changed files with 8 additions and 4 deletions

View file

@ -13,7 +13,6 @@ dpkg --listfiles $PACKAGE |
xargs rm -f >&2
rmdir /usr/lib/$PYTHON/site-packages/linkcheck 2>/dev/null || true
rmdir /usr/lib/$PYTHON/site-packages/DNS 2>/dev/null || true
# for later use of python-central
#/usr/sbin/register-python-package module remove linkchecker ">=2.0"

View file

@ -90,9 +90,6 @@ _linkMatcher = r"""
"""
# disable meta tag for now, the modified linkmatcher does not allow it
# (['meta'], ['url']), # <meta http-equiv='refresh' content='x; url=...'>
# ripped mainly from HTML::Tagset.pm
LinkTags = (
(['a'], ['href']),
@ -118,8 +115,12 @@ LinkTags = (
(['script'], ['src', 'for']),
(['body', 'table', 'td', 'th', 'tr'], ['background']),
(['xmp'], ['href']),
(['meta'], ['content']),
)
# Matcher for the content attribute of <meta http-equiv=refresh> tags,
# e.g. "5; url=http://example.com/".  The named group "url" captures the
# redirect target.  Matching is case-insensitive, and optional whitespace
# is tolerated before the delay and around ';' and '=' because such
# values appear in real-world pages (" 5 ; URL = foo.html").
_refresh_re = re.compile(r"(?i)^\s*\d+\s*;\s*url\s*=\s*(?P<url>.+)$")
LinkPatterns = []
for _tags,_attrs in LinkTags:
_tag = '(%s)'%'|'.join(_tags)
@ -499,6 +500,10 @@ class UrlData:
if self.is_in_comment(match.start()): continue
# strip quotes
url = StringUtil.stripQuotes(match.group('value'))
if 'meta' in pattern['tags']:
    # Do not rebind `match` here: it is the tag match whose start() is
    # still used afterwards to compute the line number.  Rebinding it
    # made non-refresh <meta content=...> values crash on None.start()
    # and made refresh values always report offset 0.
    refresh = _refresh_re.match(url)
    if refresh:
        url = refresh.group("url")
# need to resolve HTML entities
url = StringUtil.unhtmlify(url)
lineno= StringUtil.getLineNumber(self.getContent(), match.start())