From 3a352631bab7a16e862e832c0cedfc85f783cfc8 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Tue, 18 Sep 2012 12:12:00 +0200 Subject: [PATCH] Add modified field to loggers. --- config/create.sql | 3 ++- config/linkcheckerrc | 5 +++++ doc/changelog.txt | 2 ++ doc/upgrading.txt | 7 +++++++ linkcheck/checker/fileurl.py | 8 +++++--- linkcheck/checker/httpurl.py | 4 +--- linkcheck/checker/urlbase.py | 6 ++++++ linkcheck/logger/__init__.py | 1 + linkcheck/logger/csvlog.py | 25 +++++++++---------------- linkcheck/logger/customxml.py | 4 +++- linkcheck/logger/html.py | 8 ++++++++ linkcheck/logger/sitemapxml.py | 14 +++++++++++--- linkcheck/logger/sql.py | 6 ++++-- linkcheck/logger/text.py | 7 +++++++ 14 files changed, 71 insertions(+), 29 deletions(-) diff --git a/config/create.sql b/config/create.sql index eeae1035..ca6ad8d2 100644 --- a/config/create.sql +++ b/config/create.sql @@ -19,5 +19,6 @@ create table linksdb ( dltime int, dlsize int, cached int, - level int not null + level int not null, + modified varchar(256) ); diff --git a/config/linkcheckerrc b/config/linkcheckerrc index 40aa5fc9..337c9ee5 100644 --- a/config/linkcheckerrc +++ b/config/linkcheckerrc @@ -119,6 +119,11 @@ [gxml] #encoding=iso-8859-1 +# Sitemap logger +[sitemap] +#priority=0.7 +#frequency=weekly + ##################### checking options ########################## [checking] diff --git a/doc/changelog.txt b/doc/changelog.txt index 4a7343ee..d90b519c 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -14,6 +14,8 @@ Changes: - logging: Print download and cache statistics in text output logger. - logging: Print warning tag in text output logger. Makes warning filtering more easy. +- logging: Make the last modification time a separate field in logging + output. See doc/upgrading.txt for compatibility changes. Fixes: - logging: Close logger properly on I/O errors. 
diff --git a/doc/upgrading.txt b/doc/upgrading.txt index 906bb068..fdcfcd5f 100644 --- a/doc/upgrading.txt +++ b/doc/upgrading.txt @@ -1,5 +1,12 @@ Upgrading ========= +Migrating from 8.0 to 8.1 +------------------------- +All loggers have an additional output field "modified". +If these loggers are not configured with specific output parts, +the output format will change. +For example, existing SQL tables can be altered with: +alter table linksdb add (modified varchar(256)); Migrating from 7.9 to 8.0 ------------------------- diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 8f68ec38..3d6d602a 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -25,7 +25,7 @@ import urllib import urllib2 from . import urlbase, get_index_html, get_url_from -from .. import log, LOG_CHECK, fileutil, LinkCheckerError, url as urlutil +from .. import log, LOG_CHECK, fileutil, strformat, LinkCheckerError, url as urlutil from ..bookmarks import firefox from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH @@ -148,14 +148,16 @@ class FileUrl (urlbase.UrlBase): self.url = urlutil.urlunsplit(self.urlparts) def add_size_info (self): - """Get size of file content from filename path.""" + """Get size of file content and modification time from filename path.""" if self.is_directory(): # Directory size always differs from the customer index.html # that is generated. So return without calculating any size. 
return - self.size = fileutil.get_size(self.get_os_filename()) + filename = self.get_os_filename() + self.size = fileutil.get_size(filename) if self.dlsize == -1: self.dlsize = self.size + self.modified = strformat.strtime(fileutil.get_mtime(filename)) def check_connection (self): """ diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 68d0637d..051bc172 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -488,9 +488,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.set_result(u"%r %s" % (response.status, response.reason)) else: self.set_result(_("OK")) - modified = self.getheader('Last-Modified', u'') - if modified: - self.add_info(_("Last modified %(date)s.") % {"date": modified}) + self.modified = self.getheader('Last-Modified', u'') def _try_http_response (self): """Try to get a HTTP response object. For reused persistent diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index d9003b0b..37ccf04c 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -184,6 +184,8 @@ class UrlBase (object): self.info = [] # content size self.size = -1 + # last modification time of content in HTTP-date format as specified in RFC2616 chapter 3.3.1 + self.modified = u"" # download time self.dltime = -1 # download size @@ -1196,6 +1198,8 @@ class UrlBase (object): MIME content type for URL content. - url_data.level: int Recursion level until reaching this URL from start URL + - url_data.modified: unicode + Last modification date of retrieved page (or empty). 
""" return dict(valid=self.valid, extern=self.extern[0], @@ -1218,6 +1222,7 @@ class UrlBase (object): cache_url_key=self.cache_url_key, content_type=self.get_content_type(), level=self.recursion_level, + modified=self.modified, ) def to_wire (self): @@ -1249,6 +1254,7 @@ urlDataAttr = [ 'dltime', 'dlsize', 'info', + 'modified', 'line', 'column', 'cache_url_key', diff --git a/linkcheck/logger/__init__.py b/linkcheck/logger/__init__.py index c9824ea9..6102253c 100644 --- a/linkcheck/logger/__init__.py +++ b/linkcheck/logger/__init__.py @@ -42,6 +42,7 @@ Fields = dict( checktime=_("Check time"), url=_("URL"), level=_("Level"), + modified=_("Modified"), ) del _ diff --git a/linkcheck/logger/csvlog.py b/linkcheck/logger/csvlog.py index b580f1e0..30387e65 100644 --- a/linkcheck/logger/csvlog.py +++ b/linkcheck/logger/csvlog.py @@ -23,6 +23,12 @@ import sys from . import Logger from .. import strformat +Columns = ( + u"urlname", u"parentname", u"baseref", u"result", u"warningstring", + u"infostring", u"valid", u"url", u"line", u"column", u"name", + u"dltime", u"dlsize", u"checktime", u"cached", u"level", u"modified", +) + class CSVLogger (Logger): """ @@ -68,22 +74,7 @@ class CSVLogger (Logger): self.writer = csv.writer(self.fd, dialect='excel', delimiter=self.separator, lineterminator=self.linesep, quotechar=self.quotechar) - for s in (u"urlname", - u"parentname", - u"baseref", - u"result", - u"warningstring", - u"infostring", - u"valid", - u"url", - u"line", - u"column", - u"name", - u"dltime", - u"dlsize", - u"checktime", - u"cached", - u"level"): + for s in Columns: if self.has_part(s): row.append(s) if row: @@ -124,6 +115,8 @@ class CSVLogger (Logger): row.append(url_data.cached) if self.has_part("level"): row.append(url_data.level) + if self.has_part("modified"): + row.append(url_data.modified) self.writerow(map(strformat.unicode_safe, row)) self.flush() diff --git a/linkcheck/logger/customxml.py b/linkcheck/logger/customxml.py index 66fcad59..d6b7d43b 100644 --- 
a/linkcheck/logger/customxml.py +++ b/linkcheck/logger/customxml.py @@ -1,5 +1,5 @@ # -*- coding: iso-8859-1 -*- -# Copyright (C) 2000-2011 Bastian Kleineidam +# Copyright (C) 2000-2012 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -71,6 +71,8 @@ class CustomXMLLogger (xmllog.XMLLogger): for info in url_data.info: self.xml_tag(u"info", info) self.xml_endtag(u"infos") + if url_data.modified and self.has_part('modified'): + self.xml_tag(u"modified", url_data.modified) if url_data.warnings and self.has_part('warning'): self.xml_starttag(u"warnings") for tag, data in url_data.warnings: diff --git a/linkcheck/logger/html.py b/linkcheck/logger/html.py index ef45d496..45e46cc3 100644 --- a/linkcheck/logger/html.py +++ b/linkcheck/logger/html.py @@ -119,6 +119,8 @@ class HtmlLogger (Logger): self.write_checktime(url_data) if url_data.info and self.has_part("info"): self.write_info(url_data) + if url_data.modified and self.has_part("modified"): + self.write_modified(url_data) if url_data.warnings and self.has_part("warning"): self.write_warning(url_data) if self.has_part("result"): @@ -217,6 +219,12 @@ class HtmlLogger (Logger): self.writeln(u'' + self.part("info")+ u""+text+u"") + def write_modified(self, url_data): + """Write url_data.modified.""" + text = cgi.escape(url_data.modified) + self.writeln(u'' + self.part("modified") + + u""+text+u"") + def write_warning (self, url_data): """Write url_data.warnings.""" sep = u"
"+os.linesep diff --git a/linkcheck/logger/sitemapxml.py b/linkcheck/logger/sitemapxml.py index ea2b88a4..8b4afd5f 100644 --- a/linkcheck/logger/sitemapxml.py +++ b/linkcheck/logger/sitemapxml.py @@ -91,9 +91,10 @@ class SitemapXmlLogger (xmllog.XMLLogger): priority = self.priority self.xml_starttag(u'url') self.xml_tag(u'loc', url_data.url) - # use it when last modified is part of official URL - #if url_data.last_modified: - # self.xml_tag(u'lastmod', url_data.last_modified) + if url_data.modified: + modified = get_sitemap_modified(url_data.modified) + if modified: + self.xml_tag(u'lastmod', modified) self.xml_tag(u'changefreq', self.frequency) self.xml_tag(u'priority', "%.1f" % priority) self.xml_endtag(u'url') @@ -106,3 +107,10 @@ class SitemapXmlLogger (xmllog.XMLLogger): self.xml_endtag(u"urlset") self.xml_end_output() self.close_fileoutput() + + +def get_sitemap_modified(s): + """Reformat UrlData modified string into sitemap format specified at + http://www.w3.org/TR/NOTE-datetime.""" + # XXX + return None diff --git a/linkcheck/logger/sql.py b/linkcheck/logger/sql.py index 0ec117dc..6556c9bd 100644 --- a/linkcheck/logger/sql.py +++ b/linkcheck/logger/sql.py @@ -1,5 +1,5 @@ # -*- coding: iso-8859-1 -*- -# Copyright (C) 2000-2011 Bastian Kleineidam +# Copyright (C) 2000-2012 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -99,7 +99,8 @@ class SQLLogger (Logger): "%(dltime)d," "%(dlsize)d," "%(cached)d," - "%(level)d" + "%(level)d," + "%(modified)s" ")%(separator)s" % {'table': self.dbname, 'base_url': sqlify(url_data.base_url), @@ -119,6 +120,7 @@ class SQLLogger (Logger): 'cached': intify(url_data.cached), 'separator': self.separator, "level": url_data.level, + "modified": url_data.modified, }) self.flush() diff --git a/linkcheck/logger/text.py b/linkcheck/logger/text.py index 6bdde9f3..cb381867 100644 --- a/linkcheck/logger/text.py +++ 
b/linkcheck/logger/text.py @@ -110,6 +110,8 @@ class TextLogger (Logger): self.write_dlsize(url_data) if url_data.info and self.has_part('info'): self.write_info(url_data) + if url_data.modified and self.has_part('modified'): + self.write_modified(url_data) if url_data.warnings and self.has_part('warning'): self.write_warning(url_data) if self.has_part('result'): @@ -176,6 +178,11 @@ class TextLogger (Logger): self.write(self.part("info") + self.spaces("info")) self.writeln(self.wrap(url_data.info, 65), color=self.colorinfo) + def write_modified(self, url_data): + """Write url_data.modified.""" + self.write(self.part("modified") + self.spaces("modified")) + self.writeln(url_data.modified) + def write_warning (self, url_data): """Write url_data.warning.""" self.write(self.part("warning") + self.spaces("warning"))