Add virus checking

New option --scan-virus to check the content of URLs for
viruses with ClamAV.


git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3753 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2008-05-20 08:57:37 +00:00
parent fbf99033d4
commit dbb498a395
18 changed files with 1922 additions and 1326 deletions

View file

@ -1,7 +1,5 @@
- [INTERFACE] make a nice GUI for linkchecker
- [FEATURE] Virus check
- [DOC] The ALLOWED_HOSTS and ALLOWED_SERVERS variables in lc.cgi
are not documented.

View file

@ -138,6 +138,10 @@
# check HTML or CSS syntax online with W3C validators
#checkhtmlw3=1
#checkcssw3=1
# scan URL content for viruses with ClamAV
#scanvirus=1
# ClamAV config file
#clamavconf=/etc/clamav/clamd.conf
# filtering options
[filtering]

589
doc/de.po

File diff suppressed because it is too large Load diff

View file

@ -113,6 +113,9 @@ Prüfe Syntax von CSS URLs mit lokaler Bibliothek (cssutils).
\fB\-\-check\-css\-w3\fP
Prüfe Syntax von CSS URLs mit dem W3C Online Validator.
.TP
\fB\-\-scan\-virus\fP
Prüfe Inhalt von URLs auf Viren mit ClamAV.
.TP
\fB\-q\fP, \fB\-\-quiet\fP
Keine Ausgabe, ein Alias für \fB\-o none\fP. Dies ist nur in Verbindung mit
\fB\-F\fP nützlich.

View file

@ -81,6 +81,16 @@ Kommandozeilenoption: \fB\-\-check\-html\fP
Prüfe Syntax von CSS URLs.
.br
Kommandozeilenoption: \fB\-\-check\-css\fP
.TP
\fBscanvirus=\fP[\fB0\fP|\fB1\fP]
Prüfe Inhalt von URLs auf Viren mit ClamAV.
.br
Kommandozeilenoption: \fB\-\-scan\-virus\fP
.TP
\fBclamavconf=\fP\fIDateiname\fP
Dateiname von \fBclamd.conf\fP Konfigurationsdatei.
.br
Kommandozeilenoption: keine
.SS [filtering]
.TP
\fBignore=\fP\fIREGEX\fP (MULTILINE)

View file

@ -109,6 +109,9 @@ Check syntax of CSS URLs with local library (cssutils).
\fB\-\-check\-css\-w3\fP
Check syntax of CSS URLs with W3C online validator.
.TP
\fB\-\-scan\-virus\fP
Scan content of URLs for viruses with ClamAV.
.TP
\fB\-q\fP, \fB\-\-quiet\fP
Quiet operation, an alias for \fB\-o none\fP.
This is only useful with \fB\-F\fP.

View file

@ -75,6 +75,16 @@ Command line option: \fB\-\-check\-html\fP
Check syntax of CSS URLs.
.br
Command line option: \fB\-\-check\-css\fP
.TP
\fBscanvirus=\fP[\fB0\fP|\fB1\fP]
Scan content of URLs for viruses with ClamAV.
.br
Command line option: \fB\-\-scan\-virus\fP
.TP
\fBclamavconf=\fP\fIfilename\fP
Filename of \fBclamd.conf\fP config file.
.br
Command line option: none
.SS \fB[filtering]\fP
.TP
\fBignore=\fP\fIREGEX\fP (MULTILINE)

File diff suppressed because it is too large Load diff

View file

@ -33,7 +33,7 @@ from . import absolute_url, StoringHandler, get_url_from
from ..cache import geoip
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
strformat, linkparse, containers, LinkCheckerError, url as urlutil,
trace)
trace, clamav)
from ..HtmlParser import htmlsax
from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_UNICODE_DOMAIN,
WARN_URL_UNNORMED, WARN_URL_ERROR_GETTING_CONTENT,
@ -407,26 +407,8 @@ class UrlBase (object):
if isinstance(value, httplib.BadStatusLine):
value = _('Bad HTTP response %r') % str(value)
self.set_result(unicode_safe(value), valid=False)
# check content
warningregex = self.aggregate.config["warningregex"]
if warningregex and self.valid:
log.debug(LOG_CHECK, "checking content")
try:
self.check_content(warningregex)
except tuple(ExcList):
value = self.handle_exception()
self.set_result(unicode_safe(value), valid=False)
# check HTML/CSS syntax
if self.valid and not self.extern[0]:
if self.aggregate.config["checkhtml"] and self.is_html():
self.check_html()
if self.aggregate.config["checkcss"] and self.is_css():
self.check_css()
if self.aggregate.config["checkhtmlw3"] and self.is_html():
self.check_html_w3()
if self.aggregate.config["checkcssw3"] and self.is_css():
self.check_css_w3()
if self.can_get_content():
self.check_content()
self.checktime = time.time() - check_start
# check recursion
try:
@ -596,17 +578,36 @@ class UrlBase (object):
self.dlsize = len(self.data)
return self.data
def check_content (self, warningregex):
"""
If a warning expression was given, call this function to check it
against the content of this url.
"""
if not self.can_get_content():
def check_content (self):
"""Check content data for warnings, syntax errors, viruses etc."""
if not (self.can_get_content() and self.valid):
# no data to check
return
match = warningregex.search(self.get_content())
if match:
self.add_warning(_("Found %r in link contents.") % match.group(),
tag=WARN_URL_WARNREGEX_FOUND)
warningregex = self.aggregate.config["warningregex"]
if warningregex:
log.debug(LOG_CHECK, "checking content")
try:
match = warningregex.search(self.get_content())
if match:
self.add_warning(_("Found %r in link contents.") %
match.group(), tag=WARN_URL_WARNREGEX_FOUND)
except tuple(ExcList):
value = self.handle_exception()
self.set_result(unicode_safe(value), valid=False)
# is it an intern URL?
if not self.extern[0]:
# check HTML/CSS syntax
if self.aggregate.config["checkhtml"] and self.is_html():
self.check_html()
if self.aggregate.config["checkcss"] and self.is_css():
self.check_css()
if self.aggregate.config["checkhtmlw3"] and self.is_html():
self.check_html_w3()
if self.aggregate.config["checkcssw3"] and self.is_css():
self.check_css_w3()
# check with clamav
if self.aggregate.config["scanvirus"]:
self.scan_virus()
def check_size (self):
"""
@ -743,6 +744,14 @@ class UrlBase (object):
log.warn(LOG_CHECK,
_("warning: CSS W3C validation caused error: %s ") % err)
def scan_virus (self):
    """Pass the URL content to the ClamAV scanner and add one warning
    for every reported infection and for every scanner error."""
    infected, errors = clamav.scan(self.get_content())
    reports = (
        (u"Virus scan infection: %s", infected),
        (u"Virus scan error: %s", errors),
    )
    for template, messages in reports:
        for text in messages:
            self.add_warning(template % text)
def parse_url (self):
"""
Parse url content and search for recursive links.

195
linkcheck/clamav.py Normal file
View file

@ -0,0 +1,195 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2008 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
from __future__ import with_statement
import socket
import os
from . import log, LOG_CHECK
from .socketutil import create_socket
class ClamavError (Exception):
    """Raised for unusable clamd configurations and when clamd is not
    ready for stream scanning."""
class ClamdScanner (object):
    """Virus scanner using a clamd daemon process.

    The STREAM command is sent over a control connection, clamd answers
    with a port number, the data to scan is written to a second
    connection on that port, and the verdict is read back from the
    control connection when the scanner is closed.
    """

    def __init__ (self, clamav_conf):
        """Initialize clamd daemon process sockets.

        @param clamav_conf: object whose new_connection() method returns
          a tuple (connected control socket, clamd host name)
        @raise ClamavError: if clamd does not announce a stream port
        @raise socket.error: on connection problems
        """
        self.infected = []
        self.errors = []
        self.sock, self.host = clamav_conf.new_connection()
        try:
            self.sock_rcvbuf = \
                self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF)
            self.wsock = self.new_scansock()
        except Exception:
            # do not leak the control connection when setup fails
            self.sock.close()
            raise

    def new_scansock (self):
        """Return a connected socket for sending scan data to it.

        The control socket is closed before any exception leaves this
        method, since the scanner is unusable without a data connection.

        @raise ClamavError: if clamd does not announce a stream port
        @raise socket.error: on connection problems
        """
        try:
            port = self._request_stream_port()
            if port is None:
                raise ClamavError(_("clamd is not ready for stream scanning"))
            sockinfo = get_sockinfo(self.host, port=port)
            wsock = create_socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                wsock.connect(sockinfo[0][4])
            except socket.error:
                wsock.close()
                raise
        except Exception:
            self.sock.close()
            raise
        return wsock

    def _request_stream_port (self):
        """Send the STREAM command and return the announced data port.

        @return: port number, or None if clamd announced no port
        """
        self.sock.sendall("STREAM")
        for attempt in xrange(60):
            data = self.sock.recv(self.sock_rcvbuf)
            if not data:
                # connection closed by clamd, no answer will follow
                break
            pos = data.find("PORT")
            if pos != -1:
                # the answer has the form "PORT <number>"
                return int(data[pos+5:])
        return None

    def scan (self, data):
        """Scan given data for viruses. Results are collected by close()."""
        self.wsock.sendall(data)

    def close (self):
        """Get results and close clamd daemon sockets.

        Closing the data socket tells clamd that all data was sent. The
        complete answer is read before it is split into lines, so a
        verdict that arrives in several recv() chunks is still found.
        Lines containing "FOUND\\n" are appended to self.infected, lines
        containing "ERROR\\n" to self.errors.
        """
        chunks = []
        try:
            self.wsock.close()
            data = self.sock.recv(self.sock_rcvbuf)
            while data:
                chunks.append(data)
                data = self.sock.recv(self.sock_rcvbuf)
        finally:
            # always release the control connection, even on recv errors
            self.sock.close()
        for line in "".join(chunks).splitlines(True):
            if "FOUND\n" in line:
                self.infected.append(line)
            if "ERROR\n" in line:
                self.errors.append(line)
def canonical_clamav_conf ():
    """Return the default clamd.conf location for the running platform."""
    defaults = {
        'posix': "/etc/clamav/clamd.conf",
        'nt': r"c:\clamav-devel\etc\clamd.conf",
    }
    # any other platform: look for the file in the current directory
    return defaults.get(os.name, "clamd.conf")
# module-wide ClamavConfig instance, set by init_clamav_conf()
_clamav_conf = None


def init_clamav_conf (conf):
    """Parse the clamd config file named by conf and remember the result.

    Nothing happens when conf is empty. When conf does not name an
    existing file a warning is logged and the stored configuration is
    left untouched.
    """
    global _clamav_conf
    if not conf:
        # clamav was not configured
        return
    if not os.path.isfile(conf):
        log.warn(LOG_CHECK, "No ClamAV config file found at %r.", conf)
        return
    _clamav_conf = ClamavConfig(conf)
def get_clamav_conf ():
    """Return the module-wide ClamavConfig instance; this is None as long
    as init_clamav_conf() has not stored one."""
    return _clamav_conf
def get_sockinfo (host, port=None):
    """Look up IPv4 stream socket addresses for host and port.

    @return: the result list of socket.getaddrinfo()
    """
    return socket.getaddrinfo(host, port, socket.AF_INET, socket.SOCK_STREAM)
class ClamavConfig (dict):
    """Dictionary of clamd.conf settings which can also open a
    connection to the configured clamd daemon."""

    def __init__ (self, filename):
        """Read the settings from filename and reject unsupported setups.

        @raise ClamavError: if ScannerDaemonOutputFormat is set, or if
          both TCPSocket and LocalSocket are set
        """
        super(ClamavConfig, self).__init__()
        self.parseconf(filename)
        if self.get('ScannerDaemonOutputFormat'):
            raise ClamavError(_("ScannerDaemonOutputFormat must be disabled"))
        if self.get('TCPSocket') and self.get('LocalSocket'):
            raise ClamavError(_("only one of TCPSocket and LocalSocket must be enabled"))

    def parseconf (self, filename):
        """Fill this dictionary from the clamav configuration file.

        Each line holds an option name, optionally followed by
        whitespace and a value. Options without a value are stored
        as True.
        """
        with open(filename) as conffile:
            for rawline in conffile:
                entry = rawline.strip()
                if entry and not entry.startswith("#"):
                    # blank lines and comments never get here
                    fields = entry.split(None, 1)
                    name = fields[0]
                    if len(fields) > 1:
                        self[name] = fields[1]
                    else:
                        self[name] = True

    def new_connection (self):
        """Connect to clamd for stream scanning.

        LocalSocket is tried before TCPSocket.

        @return: tuple (connected socket, host)
        @raise ClamavError: if neither socket option is set
        """
        if self.get('LocalSocket'):
            return self.create_local_socket(), 'localhost'
        if self.get('TCPSocket'):
            host = self.get('TCPAddr', 'localhost')
            return self.create_tcp_socket(host), host
        raise ClamavError(_("one of TCPSocket or LocalSocket must be enabled"))

    def create_local_socket (self):
        """Return a unix socket connected to the LocalSocket path."""
        sock = create_socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            sock.connect(self['LocalSocket'])
        except socket.error:
            # do not leak the socket on connection errors
            sock.close()
            raise
        return sock

    def create_tcp_socket (self, host):
        """Return a tcp socket connected to host at the TCPSocket port."""
        address = get_sockinfo(host, port=int(self['TCPSocket']))[0][4]
        sock = create_socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.connect(address)
        except socket.error:
            # do not leak the socket on connection errors
            sock.close()
            raise
        return sock
def scan (data):
    """Scan data for viruses.

    The configuration stored by init_clamav_conf() is used, so a
    configured clamd.conf location takes effect. If none was stored,
    the platform default file from canonical_clamav_conf() is parsed.

    @return (infection msgs, errors)
    @rtype ([], [])
    """
    clamconf = get_clamav_conf()
    if clamconf is None:
        clamconf = ClamavConfig(canonical_clamav_conf())
    scanner = ClamdScanner(clamconf)
    try:
        scanner.scan(data)
    finally:
        # closing the scanner also collects the scan results
        scanner.close()
    return scanner.infected, scanner.errors

View file

@ -23,7 +23,7 @@ import os
import logging.config
import urllib
import _linkchecker_configdata
from .. import log, LOG_CHECK, LOG, ansicolor, lognames
from .. import log, LOG_CHECK, LOG, ansicolor, lognames, clamav
from . import confparse
Version = _linkchecker_configdata.version
@ -148,6 +148,8 @@ class Configuration (dict):
self["checkcss"] = False
self["checkhtmlw3"] = False
self["checkcssw3"] = False
self["scanvirus"] = False
self["clamavconf"] = clamav.canonical_clamav_conf()
def init_logging (self, debug=None):
"""
@ -235,3 +237,24 @@ class Configuration (dict):
if 'url-anchor-not-found' in self["ignorewarnings"]:
self["ignorewarnings"].remove('url-anchor-not-found')
self['logger'] = self.logger_new(self['output'])
if self['checkhtml']:
try:
import tidy
except ImportError:
log.warn(LOG_CHECK,
_("warning: tidy module is not available; " \
"download from http://utidylib.berlios.de/"))
self['checkhtml'] = False
if self['checkcss']:
try:
import cssutils
except ImportError:
log.warn(LOG_CHECK,
_("warning: cssutils module is not available; " \
"download from http://cthedot.de/cssutils/"))
self['checkcss'] = False
if self['scanvirus']:
try:
clamav.init_clamav_conf(self['clamavconf'])
except clamav.ClamavError, msg:
self['scanvirus'] = False

View file

@ -131,33 +131,18 @@ class LCConfigParser (ConfigParser.RawConfigParser, object):
def read_check_options (self, section):
    """Read check* and virus scan options.

    Options missing from the given section keep their current value
    in self.config.
    """
    # on/off switches
    for option in ("checkhtml", "checkcss", "checkhtmlw3", "checkcssw3",
                   "scanvirus"):
        if self.has_option(section, option):
            self.config[option] = self.getboolean(section, option)
    # clamavconf is a file name and not a switch; reading it with
    # getboolean() raises ValueError for any path
    if self.has_option(section, "clamavconf"):
        self.config["clamavconf"] = self.get(section, "clamavconf")
def read_authentication_config (self):
"""Read configuration options in section "authentication"."""

52
linkcheck/socketutil.py Normal file
View file

@ -0,0 +1,52 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2008 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import socket
# test for IPv6, both in Python build and in kernel build
has_ipv6 = False
if socket.has_ipv6:
# python has ipv6 compiled in, but the operating system also
# has to support it.
try:
socket.socket(socket.AF_INET6, socket.SOCK_STREAM).close()
has_ipv6 = True
except socket.error, msg:
# only catch these one:
# socket.error: (97, 'Address family not supported by protocol')
# socket.error: (10047, 'Address family not supported by protocol')
if msg[0] not in (97, 10047):
raise
def create_socket (family, socktype, proto=0, timeout=60):
    """
    Create a socket with given family, type and protocol and set its
    timeout. Stream sockets of an internet address family additionally
    get the TCP_NODELAY option set.
    """
    sock = socket.socket(family, socktype, proto=proto)
    sock.settimeout(timeout)
    # AF_INET6 only counts as internet family if IPv6 is usable
    is_inet = family == socket.AF_INET or \
        (has_ipv6 and family == socket.AF_INET6)
    if is_inet and socktype == socket.SOCK_STREAM:
        # switch off the NAGLE algorithm: pending data is sent
        # immediately, which may waste bandwidth but improves
        # responsiveness on fast networks
        sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
    return sock

View file

@ -0,0 +1,50 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2008 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Test virus filter.
"""
import unittest
from linkcheck import clamav
class TestClamav (unittest.TestCase):
needed_resources = ['clamav']
def testClean (self):
data = ""
infected, errors = clamav.scan(data)
self.assertFalse(infected)
self.assertFalse(errors)
def testInfected (self):
data = '<object data="&#109;s-its:mhtml:file://'+ \
'C:\\foo.mht!${PATH}/' + \
'EXPLOIT.CHM::' + \
'/exploit.htm">'
infected, errors = clamav.scan(data)
msg = 'stream: Exploit.HTML.MHTRedir.2n FOUND\n'
self.assert_(msg in infected)
self.assertFalse(errors)
def test_suite ():
    """Return a test suite holding all tests of TestClamav."""
    return unittest.TestLoader().loadTestsFromTestCase(TestClamav)


if __name__ == '__main__':
    # run the tests when this file is executed as a script
    unittest.main()

View file

@ -379,6 +379,9 @@ group.add_option("--check-css", action="store_true", dest="checkcss",
group.add_option("--check-css-w3", action="store_true", dest="checkcssw3",
help=_(
"""Check syntax of CSS URLs with W3C online validator."""))
group.add_option("--scan-virus", action="store_true", dest="scanvirus",
help=_(
"""Scan content of URLs with ClamAV virus scanner."""))
group.add_option("-q", "--quiet", action="store_true", dest="quiet",
help=_(
"""Quiet operation, an alias for '-o none'.
@ -677,6 +680,8 @@ if constructauth:
config["authentication"].append({'pattern': try_compile_re(".+"),
'user': _username,
'password': _password})
if options.scanvirus is not None:
config["scanvirus"] = options.scanvirus
# boolean options for syntaxcheck
for prefix in ("checkhtml", "checkcss"):
for suffix in ("", "w3"):

866
po/de.po

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -12,3 +12,10 @@ os=`python -c "import os; print os.name"`
if [ "x$os" = "xposix" ]; then
echo "--resource=posix"
fi
# Announce the clamav test resource only if clamd.conf names a local
# socket and that socket accepts connections. Only an active LocalSocket
# line counts; commented lines and LocalSocketGroup/LocalSocketMode
# entries are ignored.
SOCK=`awk '$1 == "LocalSocket" {print $2; exit}' /etc/clamav/clamd.conf 2>/dev/null`
# quote the variable: "test -n" without an argument is always true
if test -n "$SOCK"; then
    if waitfor -w 1 unix:"$SOCK"; then
        echo "--resource=clamav"
    fi
fi