mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-19 22:01:00 +00:00
Parse Refresh: and Content-Location: header values for URLs.
This commit is contained in:
parent
d1ef9f7683
commit
cde261c009
3 changed files with 51 additions and 2 deletions
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
Features:
|
||||
- checking: Parse and check links in PDF files.
|
||||
- checking: Parse Refresh: and Content-Location: HTTP headers for URLs.
|
||||
|
||||
Changes:
|
||||
- plugins: PDF and Word checks are now parser plugins
|
||||
|
|
@ -11,7 +12,7 @@ Changes:
|
|||
import needed third party modules.
|
||||
|
||||
Fixes:
|
||||
- checking: Catch XML parse errors in sitemap XML files and print them
as warnings. Patch by Mark-Hetherington.
|
||||
Closes: GH bug #516
|
||||
- checking: Fix internal URL match pattern. Patch by Mark-Hetherington.
|
||||
|
|
|
|||
|
|
@ -297,11 +297,23 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
return buf.getvalue()
|
||||
|
||||
def parse_header_links(self):
    """Add URLs found in HTTP response headers to the check queue.

    Handles three sources of header URLs:
    - parsed ``Link:`` headers exposed by the HTTP connection object,
    - a ``Refresh:`` header (the ``url=...`` part, if present),
    - a ``Content-Location:`` header.
    Each discovered URL is queued via ``self.add_url`` with a name
    identifying the header it came from.
    """
    # NOTE: the original had a second, dead docstring here
    # ("Parse URLs in HTTP headers Link:.") -- removed as a no-op
    # bare-string statement.
    # url_connection.links maps link type -> {"url": ...} (parsed
    # Link: headers as provided by the HTTP library).
    for linktype, linkinfo in self.url_connection.links.items():
        url = linkinfo["url"]
        name = u"Link: header %s" % linktype
        self.add_url(url, name=name)
    if 'Refresh' in self.headers:
        # Imported here rather than at module level -- presumably to
        # avoid a circular import; confirm before hoisting.
        from ..htmlutil.linkparse import refresh_re
        value = self.headers['Refresh'].strip()
        mo = refresh_re.match(value)
        if mo:
            url = unicode_safe(mo.group("url"))
            name = u"Refresh: header"
            self.add_url(url, name=name)
    if 'Content-Location' in self.headers:
        url = self.headers['Content-Location'].strip()
        name = u"Content-Location: header"
        self.add_url(url, name=name)
|
||||
|
||||
def is_parseable (self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -66,3 +66,39 @@ class TestHttpbin(LinkCheckTest):
|
|||
u"valid",
|
||||
]
|
||||
self.direct(url, resultlines, confargs=confargs)
|
||||
|
||||
def test_http_refresh_header(self):
    """A URL given in a Refresh: response header must be followed."""
    target = u"http://www.example.com"
    check_url = get_httpbin_url(u"/response-headers?Refresh=5;url=%s" % target)
    resultlines = [
        u"url %s" % check_url,
        u"cache key %s" % self.norm(check_url),
        u"real url %s" % self.norm(check_url),
        u"valid",
        u"url %s" % target,
        u"cache key %s" % self.norm(target),
        u"real url %s" % self.norm(target),
        u"name Refresh: header",
        u"valid",
    ]
    self.direct(check_url, resultlines, recursionlevel=1)
|
||||
|
||||
def test_http_content_location_header(self):
    """A URL given in a Content-Location: response header must be followed."""
    target = u"http://www.example.com"
    check_url = get_httpbin_url(u"/response-headers?Content-Location=%s" % target)
    resultlines = [
        u"url %s" % check_url,
        u"cache key %s" % self.norm(check_url),
        u"real url %s" % self.norm(check_url),
        u"valid",
        u"url %s" % target,
        u"cache key %s" % self.norm(target),
        u"real url %s" % self.norm(target),
        u"name Content-Location: header",
        u"valid",
    ]
    self.direct(check_url, resultlines, recursionlevel=1)
|
||||
|
|
|
|||
Loading…
Reference in a new issue