Add warning url-content-type-unparseable

This commit is contained in:
Chris Mayo 2022-09-02 19:29:11 +01:00
parent fb8df6f089
commit d6936ceb91
12 changed files with 34 additions and 21 deletions

View file

@ -569,6 +569,8 @@ file entry:
The URL content size is zero.
**url-content-too-large**
The URL content size is too large.
**url-content-type-unparseable**
The URL content type is not parseable.
**url-effective-url**
The effective URL is different from the original.
**url-error-getting-content**

View file

@ -85,6 +85,7 @@ WARN_URL_EFFECTIVE_URL = "url-effective-url"
WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content"
WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large"
WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero"
WARN_URL_CONTENT_TYPE_UNPARSEABLE = "url-content-type-unparseable"
WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip"
WARN_URL_RATE_LIMITED = "url-rate-limited"
WARN_URL_TOO_LONG = "url-too-long"
@ -106,6 +107,7 @@ Warnings = {
WARN_URL_ERROR_GETTING_CONTENT: _("Could not get the content of the URL."),
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
WARN_URL_CONTENT_TYPE_UNPARSEABLE: _("The URL content type is not parseable."),
WARN_URL_RATE_LIMITED: _(
"The URL request was rate limited so need reduce number of requests."
),

View file

@ -272,12 +272,7 @@ class FileUrl(urlbase.UrlBase):
return True
if firefox.has_sqlite and firefox.extension.search(self.url):
return True
if self.content_type in self.ContentMimetypes:
return True
log.debug(
LOG_CHECK, "File with content type %r is not parseable.", self.content_type
)
return False
return self.is_content_type_parseable()
def set_content_type(self):
"""Return URL content type, or an empty string if content

View file

@ -154,12 +154,7 @@ class FtpUrl(internpaturl.InternPatternUrl):
"""See if URL target is parseable for recursion."""
if self.is_directory():
return True
if self.content_type in self.ContentMimetypes:
return True
log.debug(
LOG_CHECK, "URL with content type %r is not parseable.", self.content_type
)
return False
return self.is_content_type_parseable()
def is_directory(self):
"""See if URL target is a directory."""

View file

@ -375,14 +375,7 @@ class HttpUrl(internpaturl.InternPatternUrl):
if rtype is not None:
# XXX side effect
self.content_type = rtype
if self.content_type not in self.ContentMimetypes:
log.debug(
LOG_CHECK,
"URL with content type %r is not parseable",
self.content_type,
)
return False
return True
return self.is_content_type_parseable()
def get_robots_txt_url(self):
"""

View file

@ -44,6 +44,7 @@ from .const import (
WARN_URL_OBFUSCATED_IP,
WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_TOO_LARGE,
WARN_URL_CONTENT_TYPE_UNPARSEABLE,
WARN_URL_WHITESPACE,
URL_MAX_LENGTH,
WARN_URL_TOO_LONG,
@ -306,6 +307,24 @@ class UrlBase:
self.title = title
return self.title
def is_content_type_parseable(self):
    """
    Tell whether the content type of this URL can be parsed.

    The check is a membership test of self.content_type in
    self.ContentMimetypes. When the type is not found, the fact is
    logged at debug level and, if self.recursion_level is 0, a warning
    tagged WARN_URL_CONTENT_TYPE_UNPARSEABLE is added to this URL.

    Returns True when the content type is known, False otherwise.
    """
    parseable = self.content_type in self.ContentMimetypes
    if not parseable:
        log.debug(
            LOG_CHECK,
            "URL with content type %r is not parseable",
            self.content_type,
        )
        # The warning is only attached when recursion_level is 0; any
        # other level gets the debug log entry alone.
        if self.recursion_level == 0:
            message = _("The URL with content type %r is not parseable.")
            self.add_warning(
                message % self.content_type,
                tag=WARN_URL_CONTENT_TYPE_UNPARSEABLE,
            )
    return parseable
def is_parseable(self):
"""
Return True iff content of this url is parseable.

View file

@ -2,4 +2,5 @@ url file://%(curdir)s/%(datadir)s/file.asc
cache key file://%(curdir)s/%(datadir)s/file.asc
real url file://%(curdir)s/%(datadir)s/file.asc
name %(datadir)s/file.asc
warning The URL with content type 'application/octet-stream' is not parseable.
valid

View file

@ -2,6 +2,7 @@ url file://%(curdir)s/%(datadir)s/file.markdown
cache key file://%(curdir)s/%(datadir)s/file.markdown
real url file://%(curdir)s/%(datadir)s/file.markdown
name %(datadir)s/file.markdown
warning The URL with content type 'application/octet-stream' is not parseable.
valid
url http://url.example.com

View file

@ -2,4 +2,5 @@ url file://%(curdir)s/%(datadir)s/file.txt
cache key file://%(curdir)s/%(datadir)s/file.txt
real url file://%(curdir)s/%(datadir)s/file.txt
name %(datadir)s/file.txt
warning The URL with content type 'text/plain' is not parseable.
valid

View file

@ -2,5 +2,5 @@ url file://%(curdir)s/%(datadir)s/Мошкова.bin
cache key file://%(curdir)s/%(datadir)s/%%D0%%9C%%D0%%BE%%D1%%88%%D0%%BA%%D0%%BE%%D0%%B2%%D0%%B0.bin
real url file://%(curdir)s/%(datadir)s/%%D0%%9C%%D0%%BE%%D1%%88%%D0%%BA%%D0%%BE%%D0%%B2%%D0%%B0.bin
name %(datadir)s/Мошкова.bin
warning The URL with content type 'application/octet-stream' is not parseable.
valid

View file

@ -32,6 +32,7 @@ class TestHttpRobots(HttpServerTest):
"url %s" % url,
"cache key %s" % url,
"real url %s" % url,
"warning The URL with content type 'text/plain' is not parseable.",
"valid",
]
self.direct(url, resultlines, recursionlevel=5)

View file

@ -40,6 +40,7 @@ class TestHttpbin(LinkCheckTest):
"url %s" % url,
"cache key %s" % nurl,
"real url %s" % nurl,
"warning The URL with content type 'application/json' is not parseable.",
"valid",
"url %s" % linkurl,
"cache key %s" % nlinkurl,
@ -75,6 +76,7 @@ class TestHttpbin(LinkCheckTest):
"url %s" % url,
"cache key %s" % nurl,
"real url %s" % nurl,
"warning The URL with content type 'application/json' is not parseable.",
"valid",
"url %s" % linkurl,
"cache key %s" % nlinkurl,
@ -94,6 +96,7 @@ class TestHttpbin(LinkCheckTest):
"url %s" % url,
"cache key %s" % nurl,
"real url %s" % nurl,
"warning The URL with content type 'application/json' is not parseable.",
"valid",
"url %s" % linkurl,
"cache key %s" % nlinkurl,