mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Parse links in Word files.
This commit is contained in:
parent
34a2f4a15d
commit
6a2fcf8ae9
7 changed files with 129 additions and 9 deletions
|
|
@ -24,6 +24,8 @@ Fixes:
|
|||
|
||||
Features:
|
||||
- dns: Updated dnspython module from upstream version 1.8.1.
|
||||
- checking: Check hyperlinks of Word documents. Needs pywin32
|
||||
installed.
|
||||
|
||||
5.1 "Let the right one in" (released 04.08.2009)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,3 @@
|
|||
- [CHECKING] Parse Word files for hyperlinks (needs win32com)
|
||||
app = CreateObject("Word.Application")
|
||||
doc = app.Documents.Open("c:\test.doc")
|
||||
for link in doc.Hyperlinks:
|
||||
url = link.Address
|
||||
name = link.TextToDisplay
|
||||
- [HTTP] Do not fall back to GET when no recursion is requested on
|
||||
single pages. This would allow checking pages even if robots.txt
|
||||
disallows getting the page content.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2009 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2010 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -141,6 +141,7 @@ PARSE_EXTENSIONS = {
|
|||
"opera": re.compile(r'/(?i)opera.adr$'), # opera bookmark file
|
||||
"css": re.compile(r'(?i)\.css$'), # CSS stylesheet
|
||||
"swf": re.compile(r'(?i)\.swf$'), # SWF file
|
||||
"word": re.compile(r'(?i)\.docx?$'), # Word files
|
||||
}
|
||||
|
||||
PARSE_MIMETYPES = (
|
||||
|
|
@ -148,6 +149,7 @@ PARSE_MIMETYPES = (
|
|||
"application/xhtml+xml",
|
||||
"text/css",
|
||||
"application/x-shockwave-flash",
|
||||
"application/msword",
|
||||
)
|
||||
|
||||
HTML_MIMETYPES = (
|
||||
|
|
|
|||
|
|
@ -204,6 +204,10 @@ class FileUrl (urlbase.UrlBase):
|
|||
path = prepare_urlpath_for_nt(path)
|
||||
return fileutil.pathencode(urllib.url2pathname(path))
|
||||
|
||||
def get_temp_filename (self):
    """Return the name of the file whose content should be parsed.

    No copy is made here: the result of get_os_filename() is handed
    back unchanged.
    """
    os_filename = self.get_os_filename()
    return os_filename
|
||||
|
||||
def is_directory (self):
|
||||
"""
|
||||
Check if file is a directory.
|
||||
|
|
|
|||
|
|
@ -671,6 +671,8 @@ Use URL `%(newurl)s' instead for checking.""") % {
|
|||
self.parse_css()
|
||||
elif ctype == "application/x-shockwave-flash":
|
||||
self.parse_swf()
|
||||
elif ctype == "application/msword":
|
||||
self.parse_word()
|
||||
|
||||
def get_robots_txt_url (self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2000-2009 Bastian Kleineidam
|
||||
# Copyright (C) 2000-2010 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -27,11 +27,13 @@ import time
|
|||
import errno
|
||||
import socket
|
||||
import select
|
||||
import tempfile
|
||||
|
||||
from . import absolute_url, StoringHandler, get_url_from
|
||||
from ..cache import geoip
|
||||
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, clamav, containers)
|
||||
strformat, LinkCheckerError, url as urlutil, trace, clamav, containers,
|
||||
winutil)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import linkparse, titleparse
|
||||
from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_UNICODE_DOMAIN,
|
||||
|
|
@ -909,6 +911,37 @@ class UrlBase (object):
|
|||
parent_url=self.url)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
|
||||
def parse_word (self):
|
||||
"""Parse a word file for hyperlinks."""
|
||||
if not winutil.has_word():
|
||||
return
|
||||
filename = self.get_temp_filename()
|
||||
# open word file and parse hyperlinks
|
||||
try:
|
||||
app = winutil.get_word_app()
|
||||
try:
|
||||
doc = winutil.open_word(app, filename)
|
||||
try:
|
||||
for link in doc.Hyperlinks:
|
||||
url_data = get_url_from(link.Address,
|
||||
self.recursion_level+1, self.aggregate,
|
||||
parent_url=self.url, name=link.TextToDisplay)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
finally:
|
||||
winutil.close_wordfile(doc)
|
||||
finally:
|
||||
winutil.close_word_app(app)
|
||||
except winutil.Error, msg:
|
||||
log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
|
||||
|
||||
def get_temp_filename (self):
    """Store the content in a temporary file and return its filename.

    The file is created with tempfile.mkstemp() using the prefix 'lc_'
    and the suffix '.doc'. It is closed before returning, but it is
    not deleted by this method.
    """
    # store content in temporary file
    fd, filename = tempfile.mkstemp(suffix='.doc', prefix='lc_')
    # os.fdopen() defaults to read mode, so request binary write mode
    fp = os.fdopen(fd, 'wb')
    try:
        fp.write(self.get_content())
    finally:
        # close the descriptor even when get_content() or write() fails
        fp.close()
    return filename
|
||||
|
||||
def serialized (self):
|
||||
"""
|
||||
Return serialized url check data as unicode string.
|
||||
|
|
|
|||
83
linkcheck/winutil.py
Normal file
83
linkcheck/winutil.py
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2010 Bastian Kleineidam
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
# Detect pywin32 and pick the exception class callers should catch.
try:
    import win32com
    import pythoncom
    has_win32com = True
    # COM failures are raised as pythoncom.com_error itself. A subclass
    # would never match in "except Error", so alias the class instead.
    Error = pythoncom.com_error
except ImportError:
    has_win32com = False
    class Error (StandardError):
        """Raised on errors."""
        pass
|
||||
|
||||
|
||||
def init_win32com ():
    """Make the win32com.client generated-wrapper cache usable.

    When the cache is flagged read-only, the flag is cleared and the
    cache is rebuilt.
    """
    import win32com.client
    cache = win32com.client.gencache
    if not cache.is_readonly:
        return
    # let gencache create the cached wrapper objects
    cache.is_readonly = False
    # under py2exe the call in gencache to __init__() does not happen,
    # so Rebuild() is used to force the creation of the gen_py folder
    cache.Rebuild()
|
||||
|
||||
|
||||
def _init ():
    """Run init_win32com() if the win32com modules were importable."""
    if not has_win32com:
        return
    init_win32com()
_init()
|
||||
|
||||
|
||||
|
||||
# NOTE(review): has_word() below does not read or fill this dictionary.
_has_app_cache = {}
def has_word ():
    """Tell whether Word is available on the current system.

    Returns True only if win32com could be imported and the registry
    key "Word.Application" can be opened under HKEY_CLASSES_ROOT.
    A missing _winreg module or a failing lookup yields False.
    """
    if has_win32com:
        try:
            import _winreg
            handle = _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT,
                                     "Word.Application")
            _winreg.CloseKey(handle)
        except (EnvironmentError, ImportError):
            return False
        else:
            return True
    return False
|
||||
|
||||
|
||||
def get_word_app ():
    """Return a Word.Application handle, or None if has_word() is false.

    The handle is obtained with gencache.EnsureDispatch() and its
    Visible property is set to False before it is returned.
    """
    if has_word():
        import win32com.client
        word = win32com.client.gencache.EnsureDispatch("Word.Application")
        word.Visible = False
        return word
    return None
|
||||
|
||||
|
||||
def close_word_app (app):
    """Call Quit() on the given Word application handle."""
    app.Quit()
|
||||
|
||||
|
||||
def open_wordfile (app, filename):
    """Open filename in the given Word application and return the document.

    The document is only inspected, never modified, so it is opened
    read-only and kept out of Word's list of recently used files.
    """
    return app.Documents.Open(filename, ReadOnly=True,
                              AddToRecentFiles=False)
|
||||
|
||||
|
||||
def close_wordfile (doc):
    """Call Close() on the given Word document handle."""
    doc.Close()
|
||||
Loading…
Reference in a new issue