Parse Refresh: and Content-Location: header values for URLs.

This commit is contained in:
Bastian Kleineidam 2014-07-01 20:16:43 +02:00
parent d1ef9f7683
commit cde261c009
3 changed files with 51 additions and 2 deletions

View file

@ -2,6 +2,7 @@
Features:
- checking: Parse and check links in PDF files.
- checking: Parse Refresh: and Content-Location: HTTP headers for URLs.
Changes:
- plugins: PDF and Word checks are now parser plugins
@ -11,7 +12,7 @@ Changes:
import needed third party modules.
Fixes:
- checking: Catch XML parse errors in sitemap XML files and print them
as warnings. Patch by Mark-Hetherington.
Closes: GH bug #516
- checking: Fix internal URL match pattern. Patch by Mark-Hetherington.

View file

@ -297,11 +297,23 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
return buf.getvalue()
def parse_header_links(self):
    """Parse URLs out of HTTP response headers.

    Collects link URLs from the Link:, Refresh: and Content-Location:
    headers of the HTTP response and queues each of them for checking
    via self.add_url().
    """
    # Link: header entries come pre-parsed from the connection object
    # (a mapping of link type -> {"url": ...} info dicts).
    for linktype, linkinfo in self.url_connection.links.items():
        url = linkinfo["url"]
        name = u"Link: header %s" % linktype
        self.add_url(url, name=name)
    # Refresh: header, e.g. "5; url=http://example.com/" — the URL part
    # is extracted with the same regex used for <meta refresh> tags.
    if 'Refresh' in self.headers:
        from ..htmlutil.linkparse import refresh_re
        value = self.headers['Refresh'].strip()
        mo = refresh_re.match(value)
        if mo:
            url = unicode_safe(mo.group("url"))
            name = u"Refresh: header"
            self.add_url(url, name=name)
    # Content-Location: header holds a plain URL value.
    if 'Content-Location' in self.headers:
        url = self.headers['Content-Location'].strip()
        name = u"Content-Location: header"
        self.add_url(url, name=name)
def is_parseable (self):
"""

View file

@ -66,3 +66,39 @@ class TestHttpbin(LinkCheckTest):
u"valid",
]
self.direct(url, resultlines, confargs=confargs)
def test_http_refresh_header(self):
    """The URL embedded in a Refresh: response header must be
    discovered and checked as a link."""
    target = u"http://www.example.com"
    ntarget = self.norm(target)
    source = get_httpbin_url(u"/response-headers?Refresh=5;url=%s" % target)
    nsource = self.norm(source)
    expected = [u"url %s" % source]
    expected.append(u"cache key %s" % nsource)
    expected.append(u"real url %s" % nsource)
    expected.append(u"valid")
    expected.extend([
        u"url %s" % target,
        u"cache key %s" % ntarget,
        u"real url %s" % ntarget,
        u"name Refresh: header",
        u"valid",
    ])
    self.direct(source, expected, recursionlevel=1)
def test_http_content_location_header(self):
    """The URL given in a Content-Location: response header must be
    discovered and checked as a link."""
    target = u"http://www.example.com"
    ntarget = self.norm(target)
    source = get_httpbin_url(u"/response-headers?Content-Location=%s" % target)
    nsource = self.norm(source)
    expected = [u"url %s" % source]
    expected.append(u"cache key %s" % nsource)
    expected.append(u"real url %s" % nsource)
    expected.append(u"valid")
    expected.extend([
        u"url %s" % target,
        u"cache key %s" % ntarget,
        u"real url %s" % ntarget,
        u"name Content-Location: header",
        u"valid",
    ])
    self.direct(source, expected, recursionlevel=1)