diff --git a/ChangeLog.txt b/ChangeLog.txt index cc5a9bd6..67c7688e 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -9,6 +9,8 @@ * Improved progress dialog in GUI client. + * The content size of downloads is now shown again. + 5.0.2 "All the boys love Mandy Lane" (released 13.2.2009) * Properly detect location of the log configuration file in the Windows diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 5f03e41a..5cb3d09f 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -20,7 +20,6 @@ Handle local file: links. import re import os -import time import urlparse import urllib import urllib2 @@ -162,36 +161,16 @@ class FileUrl (urlbase.UrlBase): {"path": path, "realpath": realpath}, tag=WARN_FILE_SYSTEM_PATH) - def get_content (self): - """ - Return file content, or in case of directories a dummy HTML file - with links to the files. - """ - if not self.valid: - return "" - if self.data is not None: - return self.data - elif self.is_directory(): - return self.get_directory_content() + def read_content (self): + """Return file content, or in case of directories a dummy HTML file + with links to the files.""" + if self.is_directory(): + data = get_index_html(get_files(self.get_os_filename())) + if isinstance(data, unicode): + data = data.encode("iso8859-1", "ignore") else: - return super(FileUrl, self).get_content() - - def get_directory_content (self): - """ - Get dummy HTML data for the directory content. - - @return: HTML data - @rtype: string - """ - t = time.time() - files = get_files(self.get_os_filename()) - data = get_index_html(files) - if isinstance(data, unicode): - data = data.encode("iso8859-1", "ignore") - self.data = data - self.dltime = time.time() - t - self.dlsize = len(self.data) - return self.data + data = super(FileUrl, self).read_content() + return data def is_html (self): """ diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py index 8e0574d2..52312ad6 100644 --- a/linkcheck/checker/ftpurl.py +++ b/linkcheck/checker/ftpurl.py @@ -19,7 +19,6 @@ Handle FTP links. """ import ftplib -import time import urllib from cStringIO import StringIO @@ -194,20 +193,13 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): if ro.search(self.url): getattr(self, "parse_"+key)() - def get_content (self): - """ - Return URL target content, or in case of directories a dummy HTML - file with links to the files. - """ - if not self.valid: - return "" - if self.data is not None: - return self.data - t = time.time() + def read_content (self): + """Return URL target content, or in case of directories a dummy HTML + file with links to the files.""" if self.is_directory(): self.url_connection.cwd(self.filename) self.files = self.get_files() - self.data = get_index_html(self.files) + data = get_index_html(self.files) else: # download file in BINARY mode ftpcmd = "RETR %s" % self.filename @@ -216,11 +208,9 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): """Helper method storing given data""" buf.write(s) self.url_connection.retrbinary(ftpcmd, stor_data) - self.data = buf.getvalue() + data = buf.getvalue() buf.close() - self.dltime = time.time() - t - self.dlsize = len(self.data) - return self.data + return data def close_connection (self): """ diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 8078072d..f45f04e8 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -20,7 +20,6 @@ Handle http links. import urlparse import urllib -import time import re import zlib import socket @@ -124,6 +123,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.headers = None self.auth = None self.cookies = [] + # temporary data filled when reading redirections + self._data = None def allows_robots (self, url): """ @@ -548,26 +549,24 @@ Use URL `%(newurl)s' instead for checking.""") % { h.connect() return h - def get_content (self): - """ - Get content of the URL target. The content data is cached after + def read_content (self): + """Get content of the URL target. The content data is cached after the first call to this method. @return: URL content, decompressed and decoded @rtype: string """ - if self.data is None: - self.method = "GET" - response = self._get_http_response() - response = self.follow_redirections(response, set_result=False)[1] - self.headers = response.msg + self.method = "GET" + response = self._get_http_response() + response = self.follow_redirections(response, set_result=False)[1] + self.headers = response.msg + if self._data is None: self._read_content(response) - if self.data is None: - self.data = "" - return self.data + data = self._data + self._data = None + return data def _read_content (self, response): - t = time.time() data = response.read() encoding = headers.get_content_encoding(self.headers) if encoding in _supported_encodings: @@ -582,10 +581,8 @@ Use URL `%(newurl)s' instead for checking.""") % { tag=WARN_HTTP_DECOMPRESS_ERROR) f = StringIO(data) data = f.read() - if self.data is None and self.method == "GET" and \ - response.status not in [301, 302]: - self.data = data - self.dltime = time.time() - t + # store temporary data + self._data = data def encoding_supported (self): """Check if page encoding is supported.""" diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index b734f023..d51388cc 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -599,16 +599,19 @@ class UrlBase (object): return True def get_content (self): - """ - Precondition: url_connection is an opened URL. - """ + """Precondition: url_connection is an opened URL.""" if self.data is None: + log.debug(LOG_CHECK, "Get content of %r", self.url) t = time.time() - self.data = self.url_connection.read() + self.data = self.read_content() self.dltime = time.time() - t self.dlsize = len(self.data) return self.data + def read_content (self): + """Return data for this URL. Can be overridden in subclasses.""" + return self.url_connection.read() + def check_content (self): """Check content data for warnings, syntax errors, viruses etc.""" if not (self.can_get_content() and self.valid):