Parse links in Word files.

This commit is contained in:
Bastian Kleineidam 2010-03-07 19:20:51 +01:00
parent 34a2f4a15d
commit 6a2fcf8ae9
7 changed files with 129 additions and 9 deletions

View file

@ -24,6 +24,8 @@ Fixes:
Features:
- dns: Updated dnspython module from upstream version 1.8.1.
- checking: Check hyperlinks of Word documents. Needs pywin32
installed.
5.1 "Let the right one in" (released 04.08.2009)

View file

@ -1,9 +1,3 @@
- [CHECKING] Parse Word files for hyperlinks (needs win32com)
app = CreateObject("Word.Application")
doc = app.Documents.Open("c:\test.doc")
for link in doc.Hyperlinks:
url = link.Address
name = link.TextToDisplay
- [HTTP] Do not fall back to GET when no recursion is requested on
single pages. This would allow checking pages even if robots.txt
disallows fetching the page content.

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2009 Bastian Kleineidam
# Copyright (C) 2000-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -141,6 +141,7 @@ PARSE_EXTENSIONS = {
"opera": re.compile(r'/(?i)opera.adr$'), # opera bookmark file
"css": re.compile(r'(?i)\.css$'), # CSS stylesheet
"swf": re.compile(r'(?i)\.swf$'), # SWF file
"word": re.compile(r'(?i)\.docx?$'), # Word files
}
PARSE_MIMETYPES = (
@ -148,6 +149,7 @@ PARSE_MIMETYPES = (
"application/xhtml+xml",
"text/css",
"application/x-shockwave-flash",
"application/msword",
)
HTML_MIMETYPES = (

View file

@ -204,6 +204,10 @@ class FileUrl (urlbase.UrlBase):
path = prepare_urlpath_for_nt(path)
return fileutil.pathencode(urllib.url2pathname(path))
def get_temp_filename (self):
    """Return the name of the file whose content should be parsed.

    This is whatever get_os_filename() returns for this URL.
    """
    os_filename = self.get_os_filename()
    return os_filename
def is_directory (self):
"""
Check if file is a directory.

View file

@ -671,6 +671,8 @@ Use URL `%(newurl)s' instead for checking.""") % {
self.parse_css()
elif ctype == "application/x-shockwave-flash":
self.parse_swf()
elif ctype == "application/msword":
self.parse_word()
def get_robots_txt_url (self):
"""

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2009 Bastian Kleineidam
# Copyright (C) 2000-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -27,11 +27,13 @@ import time
import errno
import socket
import select
import tempfile
from . import absolute_url, StoringHandler, get_url_from
from ..cache import geoip
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
strformat, LinkCheckerError, url as urlutil, trace, clamav, containers)
strformat, LinkCheckerError, url as urlutil, trace, clamav, containers,
winutil)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse, titleparse
from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_UNICODE_DOMAIN,
@ -909,6 +911,37 @@ class UrlBase (object):
parent_url=self.url)
self.aggregate.urlqueue.put(url_data)
def parse_word (self):
"""Parse a word file for hyperlinks."""
if not winutil.has_word():
return
filename = self.get_temp_filename()
# open word file and parse hyperlinks
try:
app = winutil.get_word_app()
try:
doc = winutil.open_word(app, filename)
try:
for link in doc.Hyperlinks:
url_data = get_url_from(link.Address,
self.recursion_level+1, self.aggregate,
parent_url=self.url, name=link.TextToDisplay)
self.aggregate.urlqueue.put(url_data)
finally:
winutil.close_wordfile(doc)
finally:
winutil.close_word_app(app)
except winutil.Error, msg:
log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
def get_temp_filename (self):
    """Get temporary filename for content to parse.

    Writes the data returned by get_content() to a newly created
    temporary file (prefix 'lc_', suffix '.doc') and returns the name of
    that file. The file is not removed here.
    """
    # store content in temporary file
    fd, filename = tempfile.mkstemp(suffix='.doc', prefix='lc_')
    # mkstemp() hands back a raw descriptor; it has to be reopened for
    # writing, and in binary mode so the document bytes are stored
    # unchanged on every platform.
    fp = os.fdopen(fd, 'wb')
    try:
        fp.write(self.get_content())
    finally:
        fp.close()
    return filename
def serialized (self):
"""
Return serialized url check data as unicode string.

83
linkcheck/winutil.py Normal file
View file

@ -0,0 +1,83 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
try:
    # pywin32 is optional: without it both imports raise ImportError and
    # the fallback branch below is used.
    import win32com
    import pythoncom
    has_win32com = True
    # Alias instead of subclass: an "except Error" clause only matches
    # instances of Error or of its subclasses, so a subclass of
    # pythoncom.com_error would never match a com_error instance itself.
    Error = pythoncom.com_error
except ImportError:
    has_win32com = False
    class Error (StandardError):
        """Raised on errors."""
        pass
def init_win32com ():
    """Make the win32com.client wrapper cache writable.

    When the cache is flagged read-only, the flag is cleared and the
    cache is rebuilt; otherwise nothing is changed.
    """
    import win32com.client
    cache = win32com.client.gencache
    if not cache.is_readonly:
        return
    # allow gencache to create the cached wrapper objects
    cache.is_readonly = False
    # under py2exe the call in gencache to __init__() does not happen
    # so we use Rebuild() to force the creation of the gen_py folder
    cache.Rebuild()
def _init ():
    """Run init_win32com(), but only if the win32com import succeeded."""
    if not has_win32com:
        return
    init_win32com()
# Prepare the win32com cache once, when this module is imported.
_init()
# NOTE(review): nothing in the visible code reads or writes this dict;
# presumably it is meant to cache application lookups -- confirm.
_has_app_cache = {}
def has_word ():
    """Tell whether Word is available on the current system.

    True only if win32com could be imported and the registry key
    "Word.Application" can be opened under HKEY_CLASSES_ROOT. A missing
    _winreg module or a failing registry lookup yields False.
    """
    if not has_win32com:
        return False
    try:
        import _winreg
        handle = _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT,
                                 "Word.Application")
        _winreg.CloseKey(handle)
    except (EnvironmentError, ImportError):
        found = False
    else:
        found = True
    return found
def get_word_app ():
    """Return a Word.Application handle with its window hidden.

    Returns None when has_word() reports that Word is not available.
    """
    if has_word():
        import win32com.client
        word = win32com.client.gencache.EnsureDispatch("Word.Application")
        word.Visible = False
        return word
    return None
def close_word_app (app):
    """Quit the given Word application by calling its Quit() method."""
    app.Quit()
def open_wordfile (app, filename):
    """Open filename through app.Documents and return the result."""
    documents = app.Documents
    return documents.Open(filename)
def close_wordfile (doc):
    """Close the given Word document by calling its Close() method."""
    doc.Close()