From d6936ceb916cf434c6ddd913786657bef170da28 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Fri, 2 Sep 2022 19:29:11 +0100 Subject: [PATCH] Add warning url-content-type-unparseable --- doc/src/man/linkcheckerrc.rst | 2 ++ linkcheck/checker/const.py | 2 ++ linkcheck/checker/fileurl.py | 7 +------ linkcheck/checker/ftpurl.py | 7 +------ linkcheck/checker/httpurl.py | 9 +-------- linkcheck/checker/urlbase.py | 19 +++++++++++++++++++ tests/checker/data/file.asc.result | 1 + tests/checker/data/file.markdown.result | 1 + tests/checker/data/file.txt.result | 1 + tests/checker/data/Мошкова.bin.result | 2 +- tests/checker/test_http_robots.py | 1 + tests/checker/test_httpbin.py | 3 +++ 12 files changed, 34 insertions(+), 21 deletions(-) diff --git a/doc/src/man/linkcheckerrc.rst b/doc/src/man/linkcheckerrc.rst index 73c8ddfe..370cf1ed 100644 --- a/doc/src/man/linkcheckerrc.rst +++ b/doc/src/man/linkcheckerrc.rst @@ -569,6 +569,8 @@ file entry: The URL content size is zero. **url-content-too-large** The URL content size is too large. +**url-content-type-unparseable** + The URL content type is not parseable. **url-effective-url** The effective URL is different from the original. **url-error-getting-content** diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index f473ad02..8dfef7d1 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -85,6 +85,7 @@ WARN_URL_EFFECTIVE_URL = "url-effective-url" WARN_URL_ERROR_GETTING_CONTENT = "url-error-getting-content" WARN_URL_CONTENT_SIZE_TOO_LARGE = "url-content-too-large" WARN_URL_CONTENT_SIZE_ZERO = "url-content-size-zero" +WARN_URL_CONTENT_TYPE_UNPARSEABLE = "url-content-type-unparseable" WARN_URL_OBFUSCATED_IP = "url-obfuscated-ip" WARN_URL_RATE_LIMITED = "url-rate-limited" WARN_URL_TOO_LONG = "url-too-long" @@ -106,6 +107,7 @@ Warnings = { WARN_URL_ERROR_GETTING_CONTENT: _("Could not get the content of the URL."), WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."), WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."), + WARN_URL_CONTENT_TYPE_UNPARSEABLE: _("The URL content type is not parseable."), WARN_URL_RATE_LIMITED: _( "The URL request was rate limited so need reduce number of requests." ), diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 19b075f5..b72a8bc5 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -272,12 +272,7 @@ class FileUrl(urlbase.UrlBase): return True if firefox.has_sqlite and firefox.extension.search(self.url): return True - if self.content_type in self.ContentMimetypes: - return True - log.debug( - LOG_CHECK, "File with content type %r is not parseable.", self.content_type - ) - return False + return self.is_content_type_parseable() def set_content_type(self): """Return URL content type, or an empty string if content diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py index c368ae94..2d56491c 100644 --- a/linkcheck/checker/ftpurl.py +++ b/linkcheck/checker/ftpurl.py @@ -154,12 +154,7 @@ class FtpUrl(internpaturl.InternPatternUrl): """See if URL target is parseable for recursion.""" if self.is_directory(): return True - if self.content_type in self.ContentMimetypes: - return True - log.debug( - LOG_CHECK, "URL with content type %r is not parseable.", self.content_type - ) - return False + return self.is_content_type_parseable() def is_directory(self): """See if URL target is a directory.""" diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index f662b978..c0cbdf19 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -375,14 +375,7 @@ class HttpUrl(internpaturl.InternPatternUrl): if rtype is not None: # XXX side effect self.content_type = rtype - if self.content_type not in self.ContentMimetypes: - log.debug( - LOG_CHECK, - "URL with content type %r is not parseable", - self.content_type, - ) - return False - return True + return self.is_content_type_parseable() def get_robots_txt_url(self): """ diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 0fadddb4..0f76e732 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -44,6 +44,7 @@ from .const import ( WARN_URL_OBFUSCATED_IP, WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE, + WARN_URL_CONTENT_TYPE_UNPARSEABLE, WARN_URL_WHITESPACE, URL_MAX_LENGTH, WARN_URL_TOO_LONG, @@ -306,6 +307,24 @@ class UrlBase: self.title = title return self.title + def is_content_type_parseable(self): + """ + Return True iff the content type of this url is parseable. + """ + if self.content_type in self.ContentMimetypes: + return True + log.debug( + LOG_CHECK, + "URL with content type %r is not parseable", + self.content_type, + ) + if self.recursion_level == 0: + self.add_warning( + _("The URL with content type %r is not parseable.") % self.content_type, + tag=WARN_URL_CONTENT_TYPE_UNPARSEABLE, + ) + return False + def is_parseable(self): """ Return True iff content of this url is parseable. diff --git a/tests/checker/data/file.asc.result b/tests/checker/data/file.asc.result index 287fbb25..3d679e5f 100644 --- a/tests/checker/data/file.asc.result +++ b/tests/checker/data/file.asc.result @@ -2,4 +2,5 @@ url file://%(curdir)s/%(datadir)s/file.asc cache key file://%(curdir)s/%(datadir)s/file.asc real url file://%(curdir)s/%(datadir)s/file.asc name %(datadir)s/file.asc +warning The URL with content type 'application/octet-stream' is not parseable. valid diff --git a/tests/checker/data/file.markdown.result b/tests/checker/data/file.markdown.result index a0423b48..3909cc4e 100644 --- a/tests/checker/data/file.markdown.result +++ b/tests/checker/data/file.markdown.result @@ -2,6 +2,7 @@ url file://%(curdir)s/%(datadir)s/file.markdown cache key file://%(curdir)s/%(datadir)s/file.markdown real url file://%(curdir)s/%(datadir)s/file.markdown name %(datadir)s/file.markdown +warning The URL with content type 'application/octet-stream' is not parseable. valid url http://url.example.com diff --git a/tests/checker/data/file.txt.result b/tests/checker/data/file.txt.result index 3a61aaf7..f0450826 100644 --- a/tests/checker/data/file.txt.result +++ b/tests/checker/data/file.txt.result @@ -2,4 +2,5 @@ url file://%(curdir)s/%(datadir)s/file.txt cache key file://%(curdir)s/%(datadir)s/file.txt real url file://%(curdir)s/%(datadir)s/file.txt name %(datadir)s/file.txt +warning The URL with content type 'text/plain' is not parseable. valid diff --git a/tests/checker/data/Мошкова.bin.result b/tests/checker/data/Мошкова.bin.result index a8091e80..0f5f5e5a 100644 --- a/tests/checker/data/Мошкова.bin.result +++ b/tests/checker/data/Мошкова.bin.result @@ -2,5 +2,5 @@ url file://%(curdir)s/%(datadir)s/Мошкова.bin cache key file://%(curdir)s/%(datadir)s/%%D0%%9C%%D0%%BE%%D1%%88%%D0%%BA%%D0%%BE%%D0%%B2%%D0%%B0.bin real url file://%(curdir)s/%(datadir)s/%%D0%%9C%%D0%%BE%%D1%%88%%D0%%BA%%D0%%BE%%D0%%B2%%D0%%B0.bin name %(datadir)s/Мошкова.bin +warning The URL with content type 'application/octet-stream' is not parseable. valid - \ No newline at end of file diff --git a/tests/checker/test_http_robots.py b/tests/checker/test_http_robots.py index f58e2aaa..abda0395 100644 --- a/tests/checker/test_http_robots.py +++ b/tests/checker/test_http_robots.py @@ -32,6 +32,7 @@ class TestHttpRobots(HttpServerTest): "url %s" % url, "cache key %s" % url, "real url %s" % url, + "warning The URL with content type 'text/plain' is not parseable.", "valid", ] self.direct(url, resultlines, recursionlevel=5) diff --git a/tests/checker/test_httpbin.py b/tests/checker/test_httpbin.py index c8df3046..7b449585 100644 --- a/tests/checker/test_httpbin.py +++ b/tests/checker/test_httpbin.py @@ -40,6 +40,7 @@ class TestHttpbin(LinkCheckTest): "url %s" % url, "cache key %s" % nurl, "real url %s" % nurl, + "warning The URL with content type 'application/json' is not parseable.", "valid", "url %s" % linkurl, "cache key %s" % nlinkurl, @@ -75,6 +76,7 @@ class TestHttpbin(LinkCheckTest): "url %s" % url, "cache key %s" % nurl, "real url %s" % nurl, + "warning The URL with content type 'application/json' is not parseable.", "valid", "url %s" % linkurl, "cache key %s" % nlinkurl, @@ -94,6 +96,7 @@ class TestHttpbin(LinkCheckTest): "url %s" % url, "cache key %s" % nurl, "real url %s" % nurl, + "warning The URL with content type 'application/json' is not parseable.", "valid", "url %s" % linkurl, "cache key %s" % nlinkurl,