linkchecker/linkcheck/checker/urlbase.py
2010-10-27 19:52:48 +02:00

1146 lines
42 KiB
Python

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Base URL handler.
"""
import sys
import os
import logging
import urlparse
import urllib2
import urllib
import time
import errno
import socket
import select
import tempfile
from . import absolute_url, StoringHandler, get_url_from
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
strformat, LinkCheckerError, url as urlutil, trace, clamav, winutil, geoip)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_UNICODE_DOMAIN,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND,
WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_UNEQUAL, ExcList, ExcSyntaxList, ExcNoCacheList)
# helper alias: convert byte strings to unicode without raising (strformat)
unicode_safe = strformat.unicode_safe
def urljoin (parent, url, scheme):
    """Resolve url against parent unless it is already absolute.

    @param parent: base URL to join against
    @param url: possibly relative URL
    @param scheme: URL scheme; urls already starting with it are
      returned unchanged
    @return: joined url
    """
    if url.startswith(scheme + ":"):
        return url
    # Python 2.6/3.1 cut off characters when the joined URL begins
    # with a semicolon; prefixing "./" works around that bug.
    if url.startswith(';'):
        url = "./" + url
    return urlparse.urljoin(parent, url)
def url_norm (url, encoding=None):
    """Normalize the given URL, mapping UnicodeError (raised by IDNA
    encoding of the domain) to LinkCheckerError."""
    try:
        return urlutil.url_norm(url, encoding=encoding)
    except UnicodeError:
        value = sys.exc_info()[1]
        msg = _("URL has unparsable domain name: %(name)s") % {"name": value}
        raise LinkCheckerError(msg)
class UrlBase (object):
    """An URL with additional information like validity etc."""

    # MIME types whose content can be parsed for recursive links,
    # mapped to the internal parser name (see parse_url()).
    ContentMimetypes = {
        "text/html": "html",
        "application/xhtml+xml": "html",
        "text/css": "css",
        "application/x-shockwave-flash": "swf",
        "application/msword": "word",
        "text/plain+linkchecker": "text",
        "text/plain+opera": "opera",
    }
    def __init__ (self, base_url, recursion_level, aggregate,
                  parent_url=None, base_ref=None, line=-1, column=-1,
                  name=u"", url_encoding=None):
        """
        Initialize check data, and store given variables.

        @param base_url: unquoted and possibly unnormed url
        @param recursion_level: on what check level lies the base url
        @param aggregate: aggregate instance
        @param parent_url: quoted and normed url of parent or None
        @param base_ref: quoted and normed url of <base href=""> or None
        @param line: line number of url in parent content
        @param column: column number of url in parent content
        @param name: name of url or empty
        @param url_encoding: encoding of URL or None
        """
        self.init(base_ref, base_url, parent_url, recursion_level,
                  aggregate, line, column, name, url_encoding)
        self.reset()
        # the syntax check runs immediately; on failure it stores a
        # result and the URL is not checked further
        self.check_syntax()
    def init (self, base_ref, base_url, parent_url, recursion_level,
              aggregate, line, column, name, url_encoding):
        """
        Initialize internal data.
        """
        self.base_ref = base_ref
        # note that self.base_url must not be modified
        self.base_url = base_url
        self.parent_url = parent_url
        self.recursion_level = recursion_level
        self.aggregate = aggregate
        self.line = line
        self.column = column
        self.name = name
        self.encoding = url_encoding
        # base_ref and parent_url are contract-wise already quoted
        # and normed (see __init__ docstring); enforce that here
        if self.base_ref:
            assert not urlutil.url_needs_quoting(self.base_ref), \
                "unquoted base reference URL %r" % self.base_ref
        if self.parent_url:
            assert not urlutil.url_needs_quoting(self.parent_url), \
                "unquoted parent URL %r" % self.parent_url
        url = absolute_url(base_url, base_ref, parent_url)
        # assume file link if no scheme is found
        self.scheme = url.split(":", 1)[0] or "file"
    def reset (self):
        """
        Reset all variables to default values.
        """
        # self.url is constructed by self.build_url() out of base_url
        # and (base_ref or parent) as absolute and normed url.
        # This is the real url we use when checking, so it is also
        # referred to as the 'real url'
        self.url = None
        # a splitted version of url for convenience
        self.urlparts = None
        # the anchor part of url
        self.anchor = None
        # list of parsed anchors
        self.anchors = []
        # the result message string and flag
        self.result = u""
        self.has_result = False
        # cached or not
        self.cached = False
        # valid or not
        self.valid = True
        # list of (tag, message) warnings without duplicates
        self.warnings = []
        # list of infos
        self.info = []
        # content size as given by meta info (-1 means unknown)
        self.size = -1
        # download time (-1 means not downloaded)
        self.dltime = -1
        # download size (-1 means not downloaded)
        self.dlsize = -1
        # check time in seconds
        self.checktime = 0
        # connection object
        self.url_connection = None
        # data of url content, (data == None) means no data is available
        self.data = None
        # cache keys, are set by build_url() calling set_cache_keys()
        self.cache_url_key = None
        self.cache_content_key = None
        # extern flags (is_extern, is_strict), both enabled as default
        self.extern = (1, 1)
        # flag if the result should be cached
        self.caching = True
        # title is either the URL or parsed from content
        self.title = None
        # flag if content should be checked or not
        self.do_check_content = True
def set_result (self, msg, valid=True, overwrite=False):
"""
Set result string and validity.
"""
if self.has_result and not overwrite:
log.warn(LOG_CHECK,
"Double result %r (previous %r) for %s", msg, self.result, self)
else:
self.has_result = True
if not isinstance(msg, unicode):
log.warn(LOG_CHECK, "Non-unicode result for %s: %r", self, msg)
elif not msg:
log.warn(LOG_CHECK, "Empty result for %s", self)
self.result = msg
self.valid = valid
def get_title (self):
"""Return title of page the URL refers to.
This is per default the filename or the URL."""
if self.title is None:
url = u""
if self.base_url:
url = self.base_url
elif self.url:
url = self.url
self.title = url
if "/" in url:
title = url.rsplit("/", 1)[1]
if title:
self.title = title
return self.title
    def set_title_from_content (self):
        """Set title of the page this URL refers to from page content."""
        if self.valid:
            # constructing the handler can raise check errors (ExcList);
            # treat that as "no title available"
            try:
                handler = linkparse.TitleFinder()
            except tuple(ExcList):
                return
            parser = htmlsax.parser(handler)
            handler.parser = parser
            # parse; TitleFinder stops parsing once the title is found
            try:
                parser.feed(self.get_content())
                parser.flush()
            except linkparse.StopParse, msg:
                log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
            # break cyclic dependencies
            handler.parser = None
            parser.handler = None
            if handler.title:
                self.title = handler.title
def is_parseable (self):
"""
Return True iff content of this url is parseable.
"""
return False
def is_html (self):
"""
Return True iff content of this url is HTML formatted.
"""
return False
def is_css (self):
"""Return True iff content of this url is CSS stylesheet."""
return False
def is_http (self):
"""
Return True for http:// URLs.
"""
return False
def is_file (self):
"""
Return True for file:// URLs.
"""
return False
def add_warning (self, s, tag=None):
"""
Add a warning string.
"""
item = (tag, s)
if item not in self.warnings:
self.warnings.append(item)
def add_info (self, s):
"""
Add an info string.
"""
if s not in self.info:
self.info.append(s)
    def copy_from_cache (self, cache_data):
        """
        Fill attributes from cache data (see get_cache_data() for the
        dict layout).
        """
        self.result = cache_data["result"]
        self.has_result = True
        for tag, msg in cache_data["warnings"]:
            # do not copy anchor warnings, since the current anchor
            # might have changed
            if tag != WARN_URL_ANCHOR_NOT_FOUND:
                self.add_warning(msg, tag=tag)
        for info in cache_data["info"]:
            self.add_info(info)
        self.valid = cache_data["valid"]
        self.dltime = cache_data["dltime"]
        self.dlsize = cache_data["dlsize"]
        self.anchors = cache_data["anchors"]
        self.cached = True
        # recheck this URL's own anchor against the cached anchor list
        if self.valid and self.anchor:
            self.check_anchor()
def get_cache_data (self):
"""Return all data values that should be put in the cache."""
return {"result": self.result,
"warnings": self.warnings,
"info": self.info,
"valid": self.valid,
"dltime": self.dltime,
"dlsize": self.dlsize,
"anchors": self.anchors,
}
def get_alias_cache_data (self):
"""Return all data values that should be put in the cache.
Intended to be overridden by subclasses that handle aliases.
"""
return self.get_cache_data()
    def set_cache_keys (self):
        """
        Set keys for URL checking and content recursion.
        """
        # remove anchor from content cache key since we assume
        # URLs with different anchors to have the same content
        self.cache_content_key = urlparse.urlunsplit(self.urlparts[:4]+[u''])
        assert isinstance(self.cache_content_key, unicode), self
        log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key)
        # construct cache key; the base implementation uses the same
        # key for the URL and its content
        self.cache_url_key = self.cache_content_key
        assert isinstance(self.cache_url_key, unicode), self
        log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key)
    def check_syntax (self):
        """
        Called before self.check(), this function inspects the
        url syntax. Success enables further checking, failure
        immediately logs this url. Syntax checks must not
        use any network resources.
        """
        log.debug(LOG_CHECK, "checking syntax")
        if self.base_url is None:
            self.set_result(_("URL is missing"), valid=False)
            return
        if not (self.base_url or self.parent_url):
            self.set_result(_("URL is empty"), valid=False)
            return
        try:
            self.build_url()
            # warn if the constructed URL differs from its re-unsplit
            # form, and continue checking with the effective URL
            effectiveurl = urlparse.urlunsplit(self.urlparts)
            if self.url != effectiveurl:
                self.add_warning(_("Effective URL %(url)r.") %
                                 {"url": effectiveurl},
                                 tag=WARN_URL_EFFECTIVE_URL)
                self.url = effectiveurl
        except tuple(ExcSyntaxList), msg:
            self.set_result(unicode_safe(msg), valid=False)
            return
        self.set_cache_keys()
    def build_url (self):
        """
        Construct self.url and self.urlparts out of the given base
        url information self.base_url, self.parent_url and self.base_ref.
        Also fills self.userinfo, self.anchor, self.host and self.port.
        """
        # norm base url - can raise UnicodeError from url.idna_encode()
        base_url, is_idn = url_norm(self.base_url, self.encoding)
        if is_idn:
            self.add_warning(_("""URL %(url)r has a unicode domain name which
                          is not yet widely supported. You should use
                          the URL %(idna_url)r instead.""") % \
                             {"url": self.base_url, "idna_url": base_url},
                             tag=WARN_URL_UNICODE_DOMAIN)
        # make url absolute
        if self.base_ref:
            # use base reference as parent url
            if ":" not in self.base_ref:
                # some websites have a relative base reference
                self.base_ref = urljoin(self.parent_url, self.base_ref,
                                        self.scheme)
            self.url = urljoin(self.base_ref, base_url, self.scheme)
        elif self.parent_url:
            # strip the parent url query and anchor
            urlparts = list(urlparse.urlsplit(self.parent_url))
            urlparts[3] = urlparts[4] = ""
            parent_url = urlparse.urlunsplit(urlparts)
            self.url = urljoin(parent_url, base_url, self.scheme)
        else:
            self.url = base_url
        # note: urljoin can unnorm the url path, so norm it again
        urlparts = list(urlparse.urlsplit(self.url))
        if urlparts[2]:
            urlparts[2] = urlutil.collapse_segments(urlparts[2])
        self.url = urlparse.urlunsplit(urlparts)
        # split into (modifiable) list
        self.urlparts = strformat.url_unicode_split(self.url)
        # and unsplit again
        self.url = urlparse.urlunsplit(self.urlparts)
        # check userinfo@host:port syntax
        self.userinfo, host = urllib.splituser(self.urlparts[1])
        # set host lowercase, keeping userinfo case untouched
        if self.userinfo:
            self.urlparts[1] = "%s@%s" % (self.userinfo, host.lower())
        else:
            self.urlparts[1] = host.lower()
        # save anchor for later checking
        self.anchor = self.urlparts[4]
        self.host, self.port = urllib.splitport(host)
        if self.port is not None:
            if not urlutil.is_numeric_port(self.port):
                raise LinkCheckerError(_("URL has invalid port %(port)r") %
                                       {"port": str(self.port)})
            self.port = int(self.port)
        self.check_obfuscated_ip()
    def check_obfuscated_ip (self):
        """Warn if the host of this URL is an obfuscated IP address."""
        # only schemes with a real network host part can have an IP host
        if self.scheme not in ("ftp", "http", "mailto", "news", "nntp", "telnet"):
            return
        # check for obfuscated IP address; only warn if it actually
        # resolves, since otherwise the connect check will fail anyway
        if iputil.is_obfuscated_ip(self.host):
            ips = iputil.resolve_host(self.host)
            if ips:
                self.add_warning(
                   _("URL %(url)s has obfuscated IP address %(ip)s") % \
                   {"url": self.base_url, "ip": ips.pop()},
                   tag=WARN_URL_OBFUSCATED_IP)
def check (self):
"""Main check function for checking this URL."""
if self.aggregate.config["trace"]:
trace.trace_on()
try:
self.local_check()
except (socket.error, select.error):
# on Unix, ctrl-c can raise
# error: (4, 'Interrupted system call')
etype, value = sys.exc_info()[:2]
if etype == errno.EINTR:
raise KeyboardInterrupt(value)
else:
raise
finally:
# close/release possible open connection
self.close_connection()
def add_country_info (self):
"""Try to ask GeoIP database for country info."""
if self.host:
country = geoip.get_country(self.host)
if country is not None:
self.add_info(_("URL is located in %(country)s.") %
{"country": _(country)})
def add_size_info (self):
"""Store size of URL content from meta info into self.size.
Must be implemented in subclasses."""
pass
    def local_check (self):
        """Local check function can be overridden in subclasses.
        Performs the connection check, then (if enabled) the content,
        recursion and size checks, recording the total check time."""
        log.debug(LOG_CHECK, "Checking %s", self)
        # start time for check
        check_start = time.time()
        self.set_extern(self.url)
        # strict extern URLs are only syntax-checked
        if self.extern[0] and self.extern[1]:
            self.add_info(_("Outside of domain filter, checked only syntax."))
            return
        # check connection
        log.debug(LOG_CHECK, "checking connection")
        try:
            self.check_connection()
            self.add_size_info()
            self.add_country_info()
        except tuple(ExcList):
            value = self.handle_exception()
            # make nicer error msg for unknown hosts
            if isinstance(value, socket.error) and value.args[0] == -2:
                value = _('Hostname not found')
            # make nicer error msg for bad status line
            if isinstance(value, httplib.BadStatusLine):
                value = _('Bad HTTP response %(line)r') % {"line": str(value)}
            self.set_result(unicode_safe(value), valid=False)
        self.checktime = time.time() - check_start
        if self.do_check_content:
            # check content and recursion
            try:
                self.check_content()
                if self.allows_recursion():
                    self.parse_url()
                # check content size
                self.check_size()
            except tuple(ExcList):
                # content errors only produce a warning, not invalidity
                value = self.handle_exception()
                self.add_warning(_("could not get content: %(msg)r") %
                             {"msg": str(value)}, tag=WARN_URL_ERROR_GETTING_CONTENT)
def close_connection (self):
"""
Close an opened url connection.
"""
if self.url_connection is None:
# no connection is open
return
try:
self.url_connection.close()
except Exception:
# ignore close errors
pass
self.url_connection = None
    def handle_exception (self):
        """
        An exception occurred. Log it and set the cache flag.

        @return: the error message, limited to 240 characters
        """
        etype, value = sys.exc_info()[:2]
        log.debug(LOG_CHECK, "Error in %s: %s %s", self.url, etype, value, exception=True)
        # note: etype must be the exact class, not a subclass
        if (etype in ExcNoCacheList) or \
           (etype == socket.error and value.args[0]==errno.EBADF) or \
            not value:
            # EBADF occurs when operating on an already closed socket
            self.caching = False
        # use the exception class name as the base error message
        errmsg = etype.__name__
        if str(value):
            errmsg += ": %s" % str(value)
        # limit length to 240
        return strformat.limit(errmsg, length=240)
def check_connection (self):
"""
The basic connection check uses urllib2.urlopen to initialize
a connection object.
"""
self.url_connection = urllib2.urlopen(self.url)
    def allows_recursion (self):
        """
        Return True iff we can recurse into the url's content.
        Tests, in order: validity, parseability, content availability,
        recursion level, extern status and robots directives.
        """
        log.debug(LOG_CHECK, "checking recursion of %r ...", self.url)
        # Test self.valid before self.is_parseable().
        if not self.valid:
            log.debug(LOG_CHECK, "... no, invalid.")
            return False
        if not self.is_parseable():
            log.debug(LOG_CHECK, "... no, not parseable.")
            return False
        if not self.can_get_content():
            log.debug(LOG_CHECK, "... no, cannot get content.")
            return False
        # a negative configured recursion level means unlimited depth
        rec_level = self.aggregate.config["recursionlevel"]
        if rec_level >= 0 and self.recursion_level >= rec_level:
            log.debug(LOG_CHECK, "... no, maximum recursion level reached.")
            return False
        if self.extern[0]:
            log.debug(LOG_CHECK, "... no, extern.")
            return False
        if not self.content_allows_robots():
            log.debug(LOG_CHECK, "... no, robots.")
            return False
        log.debug(LOG_CHECK, "... yes, recursion.")
        return True
    def content_allows_robots (self):
        """
        Return False if the content of this URL forbids robots to
        search for recursive links (meta robots "nofollow").
        """
        # only HTML content can carry a meta robots directive
        if not self.is_html():
            return True
        if not (self.is_http() or self.is_file()):
            return True
        # construct parser object
        handler = linkparse.MetaRobotsFinder()
        parser = htmlsax.parser(handler)
        handler.parser = parser
        # parse; the finder stops parsing after the HTML head
        try:
            parser.feed(self.get_content())
            parser.flush()
        except linkparse.StopParse, msg:
            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
        # break cyclic dependencies
        handler.parser = None
        parser.handler = None
        return handler.follow
    def get_anchors (self):
        """Store the list of anchors of this URL's content in
        self.anchors (via add_anchor). Precondition: this URL is
        an HTML resource."""
        log.debug(LOG_CHECK, "Getting HTML anchors %s", self)
        # anchors are <a name="..."> and any tag's id="..." attribute
        handler = linkparse.LinkFinder(self.add_anchor,
                                   tags={'a': [u'name'], None: [u'id']})
        parser = htmlsax.parser(handler)
        handler.parser = parser
        # parse
        try:
            parser.feed(self.get_content())
            parser.flush()
        except linkparse.StopParse, msg:
            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
        # break cyclic dependencies
        handler.parser = None
        parser.handler = None
def add_anchor (self, url, line, column, name, base):
"""Add anchor URL."""
self.anchors.append((url, line, column, name, base))
def check_anchor (self):
"""If URL was valid and has an anchor, check it. A warning is
logged if the anchor is not found.
"""
if not self.aggregate.config["anchors"]:
return
log.debug(LOG_CHECK, "checking anchor %r", self.anchor)
if any(x for x in self.anchors if x[0] == self.anchor):
return
anchors = u",".join(u"`%s'" % x[0] for x in self.anchors)
args = {"name": self.anchor, "anchors": anchors}
msg = u"%s %s" % (_("Anchor `%(name)s' not found.") % args,
_("Available anchors: %(anchors)s.") % args)
self.add_warning(msg, tag=WARN_URL_ANCHOR_NOT_FOUND)
    def set_extern (self, url):
        """
        Match URL against extern and intern link patterns. If no pattern
        matches the URL is extern. Sets self.extern to a tuple (bool,
        bool) with content (is_extern, is_strict).

        @return: None
        """
        # extern patterns take precedence over intern patterns
        for entry in self.aggregate.config["externlinks"]:
            match = entry['pattern'].search(url)
            if (entry['negate'] and not match) or \
               (match and not entry['negate']):
                log.debug(LOG_CHECK, "Extern URL %r", url)
                self.extern = (1, entry['strict'])
                return
        for entry in self.aggregate.config["internlinks"]:
            match = entry['pattern'].search(url)
            if (entry['negate'] and not match) or \
               (match and not entry['negate']):
                log.debug(LOG_CHECK, "Intern URL %r", url)
                self.extern = (0, 0)
                return
        # no pattern matched: extern, but not strict
        log.debug(LOG_CHECK, "Explicit extern URL %r", url)
        self.extern = (1, 0)
        return
def can_get_content (self):
"""Indicate wether url get_content() can be called."""
return True
def get_content (self):
"""Precondition: url_connection is an opened URL."""
if self.data is None:
log.debug(LOG_CHECK, "Get content of %r", self.url)
t = time.time()
self.data, self.dlsize = self.read_content()
self.dltime = time.time() - t
return self.data
def read_content (self):
"""Return data and data size for this URL.
Can be overridden in subclasses."""
data = self.url_connection.read()
return data, len(data)
    def check_content (self):
        """Check content data for warnings, syntax errors, viruses etc.
        Only runs for valid URLs whose content can be retrieved."""
        if not (self.valid and self.can_get_content()):
            return
        if self.is_html():
            self.set_title_from_content()
            if self.aggregate.config["anchors"]:
                self.get_anchors()
        # the anchor check needs the anchors parsed above
        if self.anchor:
            self.check_anchor()
        self.check_warningregex()
        # syntax and virus checks run only for intern URLs
        if not self.extern[0]:
            # check HTML/CSS syntax
            if self.aggregate.config["checkhtml"] and self.is_html():
                self.check_html()
            if self.aggregate.config["checkcss"] and self.is_css():
                self.check_css()
            if self.aggregate.config["checkhtmlw3"] and self.is_html():
                self.check_html_w3()
            if self.aggregate.config["checkcssw3"] and self.is_css():
                self.check_css_w3()
            # check with clamav
            if self.aggregate.config["scanvirus"]:
                self.scan_virus()
    def check_warningregex (self):
        """Search the content for the configured warning regex and add
        a warning for the first match. Errors while getting the content
        invalidate the URL."""
        warningregex = self.aggregate.config["warningregex"]
        if warningregex:
            log.debug(LOG_CHECK, "checking content")
            try:
                match = warningregex.search(self.get_content())
                if match:
                    self.add_warning(_("Found %(match)r in link contents.") %
                           {"match": match.group()}, tag=WARN_URL_WARNREGEX_FOUND)
            except tuple(ExcList):
                value = self.handle_exception()
                self.set_result(unicode_safe(value), valid=False)
    def check_size (self):
        """Check content size: warn if it is zero, larger than the
        configured maximum, or different from the size announced in
        the meta info.
        """
        if self.dlsize == 0:
            self.add_warning(_("Content size is zero."),
                             tag=WARN_URL_CONTENT_SIZE_ZERO)
        else:
            maxbytes = self.aggregate.config["warnsizebytes"]
            if maxbytes is not None and self.dlsize >= maxbytes:
                self.add_warning(
                     _("Content size %(dlsize)s is larger than %(maxbytes)s.") %
                         {"dlsize": strformat.strsize(self.dlsize),
                          "maxbytes": strformat.strsize(maxbytes)},
                          tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
        # -1 means "unknown" for both sizes; only compare known values
        if self.size != -1 and self.dlsize != -1 and self.dlsize != self.size:
            self.add_warning(_("Download size (%(dlsize)d Byte) "
                               "does not equal content size (%(size)d Byte).") %
                               {"dlsize": self.dlsize,
                                "size": self.size},
                                tag=WARN_URL_CONTENT_SIZE_UNEQUAL)
    def check_html (self):
        """Check HTML syntax of this page (which is supposed to be HTML)
        with the local HTML tidy module. Missing tidy module is only
        logged, not an error."""
        try:
            import tidy
        except ImportError:
            log.warn(LOG_CHECK, _("warning: tidy module is not available; " \
                            "download from http://utidylib.berlios.de/"))
            return
        options = dict(output_html=0, show_warnings=1, quiet=True,
            input_encoding='utf8', output_encoding='utf8', tidy_mark=0)
        try:
            doc = tidy.parseString(self.get_content(), **options)
            # known-harmless tidy messages are filtered out
            errors = filter_tidy_errors(doc.errors)
            if errors:
                for err in errors:
                    self.add_warning(u"HTMLTidy: %s" % err)
            else:
                self.add_info(u"HTMLTidy: %s" % _("valid HTML syntax"))
        except Exception:
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library
            err = str(sys.exc_info()[1])
            log.warn(LOG_CHECK,
                _("warning: tidy HTML parsing caused error: %(msg)s ") %
                {"msg": err})
    def check_css (self):
        """Check CSS syntax of this page (which is supposed to be CSS)
        with the local cssutils module. Missing cssutils module is only
        logged, not an error."""
        try:
            import cssutils
        except ImportError:
            log.warn(LOG_CHECK,
                 _("warning: cssutils module is not available; " \
                   "download from http://cthedot.de/cssutils/"))
            return
        try:
            # route cssutils log records into a list via StoringHandler
            csslog = logging.getLogger('cssutils')
            csslog.propagate = 0
            del csslog.handlers[:]
            handler = StoringHandler()
            csslog.addHandler(handler)
            csslog.setLevel(logging.WARN)
            cssparser = cssutils.CSSParser(log=csslog)
            cssparser.parseString(self.get_content(), href=self.url)
            if handler.storage:
                for record in handler.storage:
                    self.add_warning(u"cssutils: %s" % record.getMessage())
            else:
                self.add_info(u"cssutils: %s" % _("valid CSS syntax"))
        except Exception:
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library
            err = str(sys.exc_info()[1])
            log.warn(LOG_CHECK,
                _("warning: cssutils parsing caused error: %(msg)s") %
                {"msg": err})
    def check_html_w3 (self):
        """Check HTML syntax of this page (which is supposed to be HTML)
        with the online W3C HTML validator documented at
        http://validator.w3.org/docs/api.html
        """
        # rate-limit requests to the W3C service
        self.aggregate.check_w3_time()
        try:
            u = urllib2.urlopen('http://validator.w3.org/check',
                urllib.urlencode({
                    'fragment': self.get_content(),
                    'output': 'xml',
                }))
            # the validator reports the overall result in a header
            if u.headers.get('x-w3c-validator-status', 'Invalid') == 'Valid':
                self.add_info(u"W3C Validator: %s" % _("valid HTML syntax"))
                return
            from xml.dom.minidom import parseString
            dom = parseString(u.read())
            elements = dom.getElementsByTagName('messages')[0].getElementsByTagName('msg')
            for msg in [e.firstChild.wholeText for e in elements]:
                self.add_warning(u"W3C HTML validation: %s" % msg)
        except Exception:
            # catch _all_ exceptions since we dont want third party module
            # errors to propagate into this library
            err = str(sys.exc_info()[1])
            log.warn(LOG_CHECK,
                _("warning: HTML W3C validation caused error: %(msg)s ") %
                {"msg": err})
def check_css_w3 (self):
"""Check CSS syntax of this page (which is supposed to be CSS)
with the online W3C CSS validator documented at
http://jigsaw.w3.org/css-validator/manual.html#expert
"""
self.aggregate.check_w3_time()
try:
host = 'jigsaw.w3.org'
path = '/css-validator/validator'
params = {
'text': "div {}",
'warning': '2',
'output': 'soap12',
}
fields = params.items()
content_type, body = httputil.encode_multipart_formdata(fields)
h = httplib.HTTPConnection(host)
h.putrequest('POST', path)
h.putheader('Content-Type', content_type)
h.putheader('Content-Length', str(len(body)))
h.endheaders()
h.send(body)
r = h.getresponse(True)
if r.getheader('X-W3C-Validator-Status', 'Invalid') == 'Valid':
self.add_info(u"W3C Validator: %s" % _("valid CSS syntax"))
return
from xml.dom.minidom import parseString
dom = parseString(r.read())
elements = dom.getElementsByTagName('m:errors')[0].getElementsByTagName('m:error')
for msg in [e.firstChild.wholeText for e in elements]:
self.add_warning(u"W3C HTML validation: %s" % msg)
except Exception:
# catch _all_ exceptions since we dont want third party module
# errors to propagate into this library
err = str(sys.exc_info()[1])
log.warn(LOG_CHECK,
_("warning: CSS W3C validation caused error: %(msg)s ") %
{"msg": err})
def scan_virus (self):
"""Scan content for viruses."""
infected, errors = clamav.scan(self.get_content())
for msg in infected:
self.add_warning(u"Virus scan infection: %s" % msg)
for msg in errors:
self.add_warning(u"Virus scan error: %s" % msg)
def parse_url (self):
"""
Parse url content and search for recursive links.
Default parse type is html.
"""
self.parse_html()
def get_user_password (self):
"""Get tuple (user, password) from configured authentication.
Both user and password can be None.
"""
return self.aggregate.config.get_user_password(self.url)
    def parse_html (self):
        """Parse HTML content and search for URLs to check.
        Found URLs are added to the URL queue via add_url().
        """
        log.debug(LOG_CHECK, "Parsing HTML %s", self)
        # construct parser object
        handler = linkparse.LinkFinder(self.add_url)
        parser = htmlsax.parser(handler)
        handler.parser = parser
        # parse
        try:
            parser.feed(self.get_content())
            parser.flush()
        except linkparse.StopParse, msg:
            log.debug(LOG_CHECK, "Stopped parsing: %s", msg)
        # break cyclic dependencies
        handler.parser = None
        parser.handler = None
def add_url (self, url, line, column, name, base):
"""Queue URL data for checking."""
base_ref = urlutil.url_norm(base)[0]
url_data = get_url_from(url, self.recursion_level+1, self.aggregate,
parent_url=self.url, base_ref=base_ref, line=line, column=column,
name=name)
self.aggregate.urlqueue.put(url_data)
    def parse_opera (self):
        """Parse an opera bookmark file.
        A NAME= line supplies the title for the URL= line that
        follows it."""
        log.debug(LOG_CHECK, "Parsing Opera bookmarks %s", self)
        name = None
        lineno = 0
        for line in self.get_content().splitlines():
            lineno += 1
            line = line.strip()
            if line.startswith("NAME="):
                name = line[5:]
            elif line.startswith("URL="):
                url = line[4:]
                if url and name is not None:
                    url_data = get_url_from(url, self.recursion_level+1,
                             self.aggregate, parent_url=self.url,
                             line=lineno, name=name)
                    self.aggregate.urlqueue.put(url_data)
            else:
                # any other line terminates the current NAME=/URL= pair
                name = None
def parse_text (self):
"""
Parse a text file with on url per line; comment and blank
lines are ignored.
"""
log.debug(LOG_CHECK, "Parsing text %s", self)
lineno = 0
for line in self.get_content().splitlines():
lineno += 1
line = line.strip()
if not line or line.startswith('#'):
continue
url_data = get_url_from(line,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno)
self.aggregate.urlqueue.put(url_data)
def parse_css (self):
"""
Parse a CSS file for url() patterns.
"""
log.debug(LOG_CHECK, "Parsing CSS %s", self)
lineno = 0
linkfinder = linkparse.css_url_re.finditer
strip_comments = linkparse.strip_c_comments
for line in strip_comments(self.get_content()).splitlines():
lineno += 1
for mo in linkfinder(line):
column = mo.start("url")
url = strformat.unquote(mo.group("url").strip())
url_data = get_url_from(url,
self.recursion_level+1, self.aggregate,
parent_url=self.url, line=lineno, column=column)
self.aggregate.urlqueue.put(url_data)
def parse_swf (self):
"""Parse a SWF file for URLs."""
linkfinder = linkparse.swf_url_re.finditer
for mo in linkfinder(self.get_content()):
url = mo.group()
url_data = get_url_from(url,
self.recursion_level+1, self.aggregate,
parent_url=self.url)
self.aggregate.urlqueue.put(url_data)
    def parse_word (self):
        """Parse a word file for hyperlinks using the MS Word COM
        automation interface (Windows only)."""
        if not winutil.has_word():
            return
        # Word needs the content in a real file on disk
        filename = self.get_temp_filename()
        # open word file and parse hyperlinks
        try:
            app = winutil.get_word_app()
            try:
                doc = winutil.open_wordfile(app, filename)
                try:
                    for link in doc.Hyperlinks:
                        url_data = get_url_from(link.Address,
                              self.recursion_level+1, self.aggregate,
                              parent_url=self.url, name=link.TextToDisplay)
                        self.aggregate.urlqueue.put(url_data)
                finally:
                    winutil.close_wordfile(doc)
            finally:
                winutil.close_word_app(app)
        except winutil.Error, msg:
            log.warn(LOG_CHECK, "Error parsing word file: %s", msg)
def get_temp_filename (self):
"""Get temporary filename for content to parse."""
# store content in temporary file
fd, filename = tempfile.mkstemp(suffix='.doc', prefix='lc_')
fp = os.fdopen(fd)
fp.write(self.get_content())
fp.close()
    def serialized (self):
        """
        Return serialized url check data as unicode string,
        one field per line (joined with the OS line separator).
        """
        sep = unicode_safe(os.linesep)
        # all URL fields must already be unicode at this point
        if self.base_url is not None:
            assert isinstance(self.base_url, unicode), self
        if self.parent_url is not None:
            assert isinstance(self.parent_url, unicode), self
        if self.base_ref is not None:
            assert isinstance(self.base_ref, unicode), self
        assert isinstance(self.name, unicode), self
        return sep.join([
            u"%s link" % self.scheme,
            u"base_url=%r" % self.base_url,
            u"parent_url=%r" % self.parent_url,
            u"base_ref=%r" % self.base_ref,
            u"recursion_level=%s" % self.recursion_level,
            u"url_connection=%s" % self.url_connection,
            u"line=%d" % self.line,
            u"column=%d" % self.column,
            u"name=%r" % self.name,
        ])
def get_intern_pattern (self):
"""
Get pattern for intern URL matching.
@return non-empty regex pattern or None
@rtype String or None
"""
return None
def __str__ (self):
"""
Get URL info.
@return: URL info, encoded with the output logger encoding
@rtype: string
"""
s = self.serialized()
return self.aggregate.config['logger'].encode(s)
def __repr__ (self):
"""
Get URL info.
@return: URL info
@rtype: unicode
"""
return u"<%s >" % self.serialized()
    def to_wire_dict (self):
        """Return a simplified transport object for logging.

        The transport object must contain these attributes:

        - url_data.valid: bool
          Indicates if URL is valid
        - url_data.cached: bool
          Indicates if URL data has been loaded from cache.
        - url_data.result: unicode
          Result string
        - url_data.warnings: list of unicode
          List of tagged warnings for this URL.
        - url_data.name: unicode string or None
          name of URL (eg. filename or link name)
        - url_data.parent_url: unicode or None
          Parent URL
        - url_data.base_ref: unicode or None
          HTML base reference URL of parent
        - url_data.url: unicode or None
          Fully qualified URL.
        - url_data.checktime: int
          Number of seconds needed to check this link, default: zero.
        - url_data.dltime: int
          Number of seconds needed to download URL content, default: -1
        - url_data.dlsize: int
          Size of downloaded URL content, default: -1
        - url_data.info: list of unicode
          Additional information about this URL.
        - url_data.line: int
          Line number of this URL at parent document, or -1
        - url_data.column: int
          Column number of this URL at parent document, or -1
        """
        # None values are mapped to empty strings; warning tags are
        # dropped, only the warning texts are transported
        return dict(valid=self.valid,
          extern=self.extern[0],
          cached=self.cached,
          result=self.result,
          warnings=[x[1] for x in self.warnings],
          name=self.name or u"",
          title=self.get_title(),
          parent_url=self.parent_url or u"",
          base_ref=self.base_ref or u"",
          base_url=self.base_url or u"",
          url=self.url or u"",
          checktime=self.checktime,
          dltime=self.dltime,
          dlsize=self.dlsize,
          info=self.info,
          line=self.line,
          column=self.column,
          cache_url_key=self.cache_url_key,
        )
    def to_wire (self):
        """Return a CompactUrlData instance built from to_wire_dict()."""
        return CompactUrlData(self.to_wire_dict())
def filter_tidy_errors (errors):
    """Filter out the harmless missing-table-summary warning from an
    HTML tidy error list."""
    ignored = '<table> lacks "summary" attribute'
    return [e for e in errors
            if not (e.severity == 'W' and e.message == ignored)]
# attribute names transported for every checked URL
# (see UrlBase.to_wire_dict() for their meaning)
urlDataAttr = [
    'valid',
    'extern',
    'cached',
    'result',
    'warnings',
    'name',
    'title',
    'parent_url',
    'base_ref',
    'base_url',
    'url',
    'checktime',
    'dltime',
    'dlsize',
    'info',
    'line',
    'column',
    'cache_url_key',
]


class CompactUrlData (object):
    """Lightweight transport object holding the wire attributes of a
    checked URL (see UrlBase.to_wire_dict())."""

    __slots__ = urlDataAttr

    def __init__(self, wired_url_data):
        """Copy every known attribute out of the wired_url_data dict."""
        for attrname in urlDataAttr:
            setattr(self, attrname, wired_url_data[attrname])