From 18a200d85fcb090bcd0ffcbd53868eb2884956df Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Wed, 19 Sep 2012 11:05:26 +0200 Subject: [PATCH] Fix tests. --- linkcheck/checker/httpurl.py | 3 ++- linkcheck/checker/urlbase.py | 6 ++--- linkcheck/logger/__init__.py | 11 +++++++++ linkcheck/logger/csvlog.py | 2 +- linkcheck/logger/customxml.py | 2 +- linkcheck/logger/html.py | 2 +- linkcheck/logger/sitemapxml.py | 41 +++++++++++++--------------------- linkcheck/logger/sql.py | 2 +- linkcheck/logger/text.py | 2 +- 9 files changed, 36 insertions(+), 35 deletions(-) diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 62d30594..244fd441 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -492,7 +492,8 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): else: self.set_result(_("OK")) modified = rfc822.parsedate(self.getheader('Last-Modified', u'')) - self.modified = datetime.utcfromtimestamp(time.mktime(modified)) + if modified: + self.modified = datetime.utcfromtimestamp(time.mktime(modified)) def _try_http_response (self): """Try to get a HTTP response object. For reused persistent diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 37ccf04c..ce11b592 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -185,7 +185,7 @@ class UrlBase (object): # content size self.size = -1 # last modification time of content in HTTP-date format as specified in RFC2616 chapter 3.3.1 - self.modified = u"" + self.modified = None # download time self.dltime = -1 # download size @@ -1198,8 +1198,8 @@ class UrlBase (object): MIME content type for URL content. - url_data.level: int Recursion level until reaching this URL from start URL - - url_data.last_modified: unicode - Last modification date of retrieved page (or empty). + - url_data.last_modified: datetime + Last modification date of retrieved page (or None). """ return dict(valid=self.valid, extern=self.extern[0], diff --git a/linkcheck/logger/__init__.py b/linkcheck/logger/__init__.py index 6102253c..df43b90c 100644 --- a/linkcheck/logger/__init__.py +++ b/linkcheck/logger/__init__.py @@ -437,6 +437,17 @@ class Logger (object): self.stats.addrinfo_stats = addrinfo_stats self.stats.downloaded_bytes = download_stats + def format_modified(self, modified, sep=" "): + """Format modification date if it's not None. + @param modified: modification date + @ptype modified: datetime or None + @return: formatted date or empty string + @rtype: unicode + """ + if modified is not None: + return modified.isoformat(sep) + return u"" + # the standard URL logger implementations from .text import TextLogger diff --git a/linkcheck/logger/csvlog.py b/linkcheck/logger/csvlog.py index 30387e65..b8cdbfec 100644 --- a/linkcheck/logger/csvlog.py +++ b/linkcheck/logger/csvlog.py @@ -116,7 +116,7 @@ class CSVLogger (Logger): if self.has_part("level"): row.append(url_data.level) if self.has_part("modified"): - row.append(url_data.modified) + row.append(self.format_modified(url_data.modified)) self.writerow(map(strformat.unicode_safe, row)) self.flush() diff --git a/linkcheck/logger/customxml.py b/linkcheck/logger/customxml.py index d6b7d43b..1be55735 100644 --- a/linkcheck/logger/customxml.py +++ b/linkcheck/logger/customxml.py @@ -72,7 +72,7 @@ class CustomXMLLogger (xmllog.XMLLogger): self.xml_tag(u"info", info) self.xml_endtag(u"infos") if url_data.modified and self.has_part('modified'): - self.xml_tag(u"modified", url_data.modified) + self.xml_tag(u"modified", self.format_modified(url_data.modified)) if url_data.warnings and self.has_part('warning'): self.xml_starttag(u"warnings") for tag, data in url_data.warnings: diff --git a/linkcheck/logger/html.py b/linkcheck/logger/html.py index d30b1bbd..45b280e7 100644 --- a/linkcheck/logger/html.py +++ b/linkcheck/logger/html.py @@ -221,7 +221,7 @@ class HtmlLogger (Logger): def write_modified(self, url_data): """Write url_data.modified.""" - text = cgi.escape(url_data.modified.isoformat(" ")) + text = cgi.escape(self.format_modified(url_data.modified)) self.writeln(u'' + self.part("modified") + u""+text+u"") diff --git a/linkcheck/logger/sitemapxml.py b/linkcheck/logger/sitemapxml.py index 9981ae52..f7f73814 100644 --- a/linkcheck/logger/sitemapxml.py +++ b/linkcheck/logger/sitemapxml.py @@ -67,34 +67,33 @@ class SitemapXmlLogger (xmllog.XMLLogger): Update accounting data and determine if URL should be included in the sitemap. """ self.stats.log_url(url_data, do_print) - # ignore the do_print flag and determine ourselves if we filter the url - if (url_data.valid and - url_data.url.startswith((u'http:', u'https:')) and - url_data.url.startswith(self.prefix) and - url_data.content_type in ('text/html', "application/xhtml+xml")): - self.log_url(url_data) - - def log_url (self, url_data): - """ - Log URL data in sitemap format. - """ + # initialize prefix and priority if self.prefix is None: - # first URL (ie. the homepage) gets priority 1.0 per default self.prefix = url_data.url + # first URL (ie. the homepage) gets priority 1.0 per default priority = 1.0 else: # all other pages get priority 0.5 per default priority = 0.5 if self.priority is not None: priority = self.priority + # ignore the do_print flag and determine ourselves if we filter the url + if (url_data.valid and + url_data.url.startswith((u'http:', u'https:')) and + url_data.url.startswith(self.prefix) and + url_data.content_type in ('text/html', "application/xhtml+xml")): + self.log_url(url_data, priority=priority) + + def log_url (self, url_data, priority=None): + """ + Log URL data in sitemap format. + """ self.xml_starttag(u'url') self.xml_tag(u'loc', url_data.url) if url_data.modified: - modified = get_sitemap_modified(url_data.modified) - if modified: - self.xml_tag(u'lastmod', modified) + self.xml_tag(u'lastmod', self.format_modified(url_data.modified, sep="T")) self.xml_tag(u'changefreq', self.frequency) - self.xml_tag(u'priority', "%.1f" % priority) + self.xml_tag(u'priority', "%.2f" % priority) self.xml_endtag(u'url') self.flush() @@ -106,13 +105,3 @@ class SitemapXmlLogger (xmllog.XMLLogger): self.xml_end_output() self.close_fileoutput() - -def get_sitemap_modified(modified): - """Reformat UrlData modified string into sitemap format specified at - http://www.w3.org/TR/NOTE-datetime. - @param modified: last modified time - @ptype modified: datetime object with timezone information - @return: formatted date - @rtype: string - """ - return modified.isoformat('T') diff --git a/linkcheck/logger/sql.py b/linkcheck/logger/sql.py index 6556c9bd..d352401f 100644 --- a/linkcheck/logger/sql.py +++ b/linkcheck/logger/sql.py @@ -120,7 +120,7 @@ class SQLLogger (Logger): 'cached': intify(url_data.cached), 'separator': self.separator, "level": url_data.level, - "modified": url_data.modified, + "modified": sqlify(self.format_modified(url_data.modified)), }) self.flush() diff --git a/linkcheck/logger/text.py b/linkcheck/logger/text.py index 79e316f0..7b9051ac 100644 --- a/linkcheck/logger/text.py +++ b/linkcheck/logger/text.py @@ -181,7 +181,7 @@ class TextLogger (Logger): def write_modified(self, url_data): """Write url_data.modified.""" self.write(self.part("modified") + self.spaces("modified")) - self.writeln(url_data.modified.isoformat(" ")) + self.writeln(self.format_modified(url_data.modified)) def write_warning (self, url_data): """Write url_data.warning."""