diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 6838b933..8cc26ce5 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -127,6 +127,16 @@ class FileUrl (urlbase.UrlBase): self.urlparts[2] += '/' self.url = urlparse.urlunsplit(self.urlparts) + def add_size_info (self): + """Get size of file content from filename path.""" + if self.is_directory(): + # Directory size always differs from the custom index.html + # that is generated. So return without calculating any size. + return + self.size = fileutil.get_size(self.get_os_filename()) + if self.dlsize == -1: + self.dlsize = self.size + def check_connection (self): """ Try to open the local file. Under NT systems the case sensitivity diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 84debcad..917a0717 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -142,6 +142,19 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): return rb.allows_url(roboturl, url, self.proxy, user, password, callback=callback) + def add_size_info (self): + """Get size of URL content from HTTP header.""" + if self.headers and "Content-Length" in self.headers and \ + "Content-Encoding" not in self.headers: + # Note that content-encoding causes size differences since + # the content data is always decoded. + try: + self.size = int(self.headers["Content-Length"]) + if self.dlsize == -1: + self.dlsize = self.size + except (ValueError, OverflowError): + pass + def check_connection (self): """ Check a URL with HTTP protocol. 
diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 8c6bc7a8..3f18f7b5 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -38,8 +38,8 @@ from ..htmlutil import linkparse from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_UNICODE_DOMAIN, WARN_URL_UNNORMED, WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND, - WARN_URL_CONTENT_TOO_LARGE, WARN_URL_CONTENT_ZERO_SIZE, - ExcList, ExcSyntaxList, ExcNoCacheList) + WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO, + WARN_URL_CONTENT_SIZE_UNEQUAL, ExcList, ExcSyntaxList, ExcNoCacheList) # helper alias unicode_safe = strformat.unicode_safe @@ -143,6 +143,8 @@ class UrlBase (object): self.warnings = [] # list of infos self.info = [] + # content size + self.size = -1 # download time self.dltime = -1 # download size @@ -417,14 +419,17 @@ class UrlBase (object): self.close_connection() def add_country_info (self): - """ - Try to ask GeoIP database for country info. - """ + """Try to ask GeoIP database for country info.""" country = geoip.get_country(self.host) if country is not None: self.add_info(_("URL is located in %(country)s.") % {"country": _(country)}) + def add_size_info (self): + """Store size of URL content from meta info into self.size. + Must be implemented in subclasses.""" + pass + def local_check (self): """Local check function can be overridden in subclasses.""" log.debug(LOG_CHECK, "Checking %s", self) @@ -439,6 +444,7 @@ class UrlBase (object): log.debug(LOG_CHECK, "checking connection") try: self.check_connection() + self.add_size_info() self.add_country_info() self.check_content() except tuple(ExcList): @@ -518,7 +524,7 @@ class UrlBase (object): log.debug(LOG_CHECK, "... no, cannot get content.") return False rec_level = self.aggregate.config["recursionlevel"] - if rec_level >= 0 and self.recursion_level >= rec_level: + if rec_level >= 0 and self.recursion_level >= rec_level: log.debug(LOG_CHECK, "... 
no, maximum recursion level reached.") return False if self.extern[0]: @@ -677,7 +683,7 @@ class UrlBase (object): """ if self.dlsize == 0: self.add_warning(_("Content size is zero."), - tag=WARN_URL_CONTENT_ZERO_SIZE) + tag=WARN_URL_CONTENT_SIZE_ZERO) else: maxbytes = self.aggregate.config["warnsizebytes"] if maxbytes is not None and self.dlsize >= maxbytes: @@ -685,7 +691,13 @@ class UrlBase (object): _("Content size %(dlsize)s is larger than %(maxbytes)s.") % {"dlsize": strformat.strsize(self.dlsize), "maxbytes": strformat.strsize(maxbytes)}, - tag=WARN_URL_CONTENT_TOO_LARGE) + tag=WARN_URL_CONTENT_SIZE_TOO_LARGE) + if self.size != -1 and self.dlsize != -1 and self.dlsize != self.size: + self.add_warning(_("Download size (%(dlsize)d Byte) " + "does not equal content size (%(size)d Byte).") % + {"dlsize": self.dlsize, + "size": self.size}, + tag=WARN_URL_CONTENT_SIZE_UNEQUAL) def check_html (self): """Check HTML syntax of this page (which is supposed to be HTML)