diff --git a/doc/changelog.txt b/doc/changelog.txt index c48d55ea..595e2259 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -24,6 +24,8 @@ Fixes: Features: - dns: Updated dnspython module from upstream version 1.8.1. +- checking: Check hyperlinks of Word documents. Needs pywin32 + installed. 5.1 "Let the right one in" (released 04.08.2009) diff --git a/doc/todo.txt b/doc/todo.txt index a456850d..5e3268ed 100644 --- a/doc/todo.txt +++ b/doc/todo.txt @@ -1,9 +1,3 @@ -- [CHECKING] Parse Word files for hyperlinks (needs win32com) - app = CreateObject("Word.Application") - doc = app.Documents.Open("c:\test.doc") - for link in doc.Hyperlinks: - url = link.Address - name = link.TextToDisplay - [HTTP] Do not fall back to GET when no recursion is requested on single pages. This would allow to check pages even if robots.txt disallows to get the page content. diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index 46a8cb83..6cc34dd1 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -1,5 +1,5 @@ # -*- coding: iso-8859-1 -*- -# Copyright (C) 2000-2009 Bastian Kleineidam +# Copyright (C) 2000-2010 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -141,6 +141,7 @@ PARSE_EXTENSIONS = { "opera": re.compile(r'/(?i)opera.adr$'), # opera bookmark file "css": re.compile(r'(?i)\.css$'), # CSS stylesheet "swf": re.compile(r'(?i)\.swf$'), # SWF file + "word": re.compile(r'(?i)\.docx?$'), # Word files } PARSE_MIMETYPES = ( @@ -148,6 +149,7 @@ PARSE_MIMETYPES = ( "application/xhtml+xml", "text/css", "application/x-shockwave-flash", + "application/msword", ) HTML_MIMETYPES = ( diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 7228ece0..789a678f 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -204,6 +204,10 @@ class FileUrl (urlbase.UrlBase): path = 
prepare_urlpath_for_nt(path) return fileutil.pathencode(urllib.url2pathname(path)) + def get_temp_filename (self): + """Return the OS filename of this file, used as the file to parse.""" + return self.get_os_filename() + def is_directory (self): """ Check if file is a directory. diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index f09902bd..f6db157f 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -671,6 +671,8 @@ Use URL `%(newurl)s' instead for checking.""") % { self.parse_css() elif ctype == "application/x-shockwave-flash": self.parse_swf() + elif ctype == "application/msword": + self.parse_word() def get_robots_txt_url (self): """ diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 5e04dccd..5c3976f7 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -1,5 +1,5 @@ # -*- coding: iso-8859-1 -*- -# Copyright (C) 2000-2009 Bastian Kleineidam +# Copyright (C) 2000-2010 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -27,11 +27,13 @@ import time import errno import socket import select +import tempfile from . import absolute_url, StoringHandler, get_url_from from ..cache import geoip from .. 
import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
-    strformat, LinkCheckerError, url as urlutil, trace, clamav, containers)
+    strformat, LinkCheckerError, url as urlutil, trace, clamav, containers,
+    winutil)
 from ..HtmlParser import htmlsax
 from ..htmlutil import linkparse, titleparse
 from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_UNICODE_DOMAIN,
@@ -909,6 +911,48 @@ class UrlBase (object):
                 parent_url=self.url)
             self.aggregate.urlqueue.put(url_data)
 
+    def parse_word (self):
+        """
+        Parse a Word document for hyperlinks and put each of them into
+        the URL queue. Returns early if Word is not available; errors
+        matching winutil.Error are logged as warnings.
+        """
+        if not winutil.has_word():
+            return
+        filename = self.get_temp_filename()
+        # open word file and parse hyperlinks
+        try:
+            app = winutil.get_word_app()
+            try:
+                doc = winutil.open_wordfile(app, filename)
+                try:
+                    for link in doc.Hyperlinks:
+                        url_data = get_url_from(link.Address,
+                            self.recursion_level+1, self.aggregate,
+                            parent_url=self.url, name=link.TextToDisplay)
+                        self.aggregate.urlqueue.put(url_data)
+                finally:
+                    winutil.close_wordfile(doc)
+            finally:
+                winutil.close_word_app(app)
+        except winutil.Error, msg:
+            log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
+
+    def get_temp_filename (self):
+        """
+        Store the content in a temporary file and return its filename.
+        The file is not deleted by this method.
+        """
+        fd, filename = tempfile.mkstemp(suffix='.doc', prefix='lc_')
+        # os.fdopen() opens read-only by default; write mode is needed,
+        # and binary mode keeps the document bytes unchanged on Windows
+        fp = os.fdopen(fd, 'wb')
+        try:
+            fp.write(self.get_content())
+        finally:
+            fp.close()
+        return filename
+
     def serialized (self):
         """
         Return serialized url check data as unicode string.
diff --git a/linkcheck/winutil.py b/linkcheck/winutil.py
new file mode 100644
index 00000000..8fac483d
--- /dev/null
+++ b/linkcheck/winutil.py
@@ -0,0 +1,96 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2010 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+Helper functions to access Microsoft Word through win32com.
+"""
+try:
+    import win32com
+    import pythoncom
+    has_win32com = True
+    # COM calls raise pythoncom.com_error itself, and a subclass would
+    # not be matched by "except Error" clauses, so alias it instead.
+    Error = pythoncom.com_error
+except ImportError:
+    has_win32com = False
+    class Error (StandardError):
+        """Raised on errors."""
+        pass
+
+
+def init_win32com ():
+    """Initialize the win32com.client cache."""
+    import win32com.client
+    if win32com.client.gencache.is_readonly:
+        #allow gencache to create the cached wrapper objects
+        win32com.client.gencache.is_readonly = False
+        # under py2exe the call in gencache to __init__() does not happen
+        # so we use Rebuild() to force the creation of the gen_py folder
+        win32com.client.gencache.Rebuild()
+
+
+def _init ():
+    """Initialize win32com at import time when it is available."""
+    if has_win32com:
+        init_win32com()
+_init()
+
+
+# NOTE(review): not used by any function in this module.
+_has_app_cache = {}
+
+
+def has_word ():
+    """Determine if Word is available on the current system."""
+    if not has_win32com:
+        return False
+    try:
+        import _winreg
+        # opening the key succeeds only if the Word.Application key exists
+        key = _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, "Word.Application")
+        _winreg.CloseKey(key)
+        return True
+    except (EnvironmentError, ImportError):
+        pass
+    return False
+
+
+def get_word_app ():
+    """
+    Return an invisible Word.Application handle, or None if Word is not
+    available. COM errors from starting Word are not caught here.
+    """
+    if not has_word():
+        return None
+    import win32com.client
+    app = win32com.client.gencache.EnsureDispatch("Word.Application")
+    app.Visible = False
+    return app
+
+
+def close_word_app (app):
+    """Quit the given Word application."""
+    app.Quit()
+
+
+def open_wordfile (app, filename):
+    """Open filename in the given Word application, return the document."""
+    return app.Documents.Open(filename)
+
+
+def close_wordfile (doc):
+    """Close the given Word document."""
+    doc.Close()