# -*- coding: iso-8859-1 -*- # Copyright (C) 2000-2010 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ Base URL handler. """ import sys import os import logging import urlparse import urllib2 import urllib import time import errno import socket import select import tempfile from . import absolute_url, StoringHandler, get_url_from from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib, strformat, LinkCheckerError, url as urlutil, trace, clamav, winutil, geoip) from ..HtmlParser import htmlsax from ..htmlutil import linkparse from ..network import iputil from .const import (WARN_URL_EFFECTIVE_URL, WARN_URL_UNICODE_DOMAIN, WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP, WARN_URL_ANCHOR_NOT_FOUND, WARN_URL_WARNREGEX_FOUND, WARN_URL_CONTENT_SIZE_TOO_LARGE, WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_UNEQUAL, ExcList, ExcSyntaxList, ExcNoCacheList) # helper alias unicode_safe = strformat.unicode_safe def urljoin (parent, url, scheme): """ If url is relative, join parent and url. Else leave url as-is. @return joined url """ if url.startswith(scheme+":"): return url # work around a Python 2.6/3.1 bug cutting off characters when the URL # begins with semicolon if url.startswith(';'): url = "./%s" % url return urlparse.urljoin(parent, url) def url_norm (url, encoding=None): """Wrapper for url.url_norm() to convert UnicodeError in LinkCheckerError.""" try: return urlutil.url_norm(url, encoding=encoding) except UnicodeError: msg = _("URL has unparsable domain name: %(name)s") % \ {"name": sys.exc_info()[1]} raise LinkCheckerError(msg) class UrlBase (object): """An URL with additional information like validity etc.""" # file types that can be parsed recursively ContentMimetypes = { "text/html": "html", "application/xhtml+xml": "html", "text/css": "css", "application/x-shockwave-flash": "swf", "application/msword": "word", "text/plain+linkchecker": "text", "text/plain+opera": "opera", } def __init__ (self, base_url, recursion_level, aggregate, parent_url=None, base_ref=None, line=-1, column=-1, name=u"", url_encoding=None): """ Initialize check data, and store given variables. @param base_url: unquoted and possibly unnormed url @param recursion_level: on what check level lies the base url @param aggregate: aggregate instance @param parent_url: quoted and normed url of parent or None @param base_ref: quoted and normed url of or None @param line: line number of url in parent content @param column: column number of url in parent content @param name: name of url or empty @param url_encoding: encoding of URL or None """ self.init(base_ref, base_url, parent_url, recursion_level, aggregate, line, column, name, url_encoding) self.reset() self.check_syntax() def init (self, base_ref, base_url, parent_url, recursion_level, aggregate, line, column, name, url_encoding): """ Initialize internal data. """ self.base_ref = base_ref # note that self.base_url must not be modified self.base_url = base_url self.parent_url = parent_url self.recursion_level = recursion_level self.aggregate = aggregate self.line = line self.column = column self.name = name self.encoding = url_encoding if self.base_ref: assert not urlutil.url_needs_quoting(self.base_ref), \ "unquoted base reference URL %r" % self.base_ref if self.parent_url: assert not urlutil.url_needs_quoting(self.parent_url), \ "unquoted parent URL %r" % self.parent_url url = absolute_url(base_url, base_ref, parent_url) # assume file link if no scheme is found self.scheme = url.split(":", 1)[0] or "file" def reset (self): """ Reset all variables to default values. """ # self.url is constructed by self.build_url() out of base_url # and (base_ref or parent) as absolute and normed url. # This the real url we use when checking so it also referred to # as 'real url' self.url = None # a splitted version of url for convenience self.urlparts = None # the anchor part of url self.anchor = None # list of parsed anchors self.anchors = [] # the result message string and flag self.result = u"" self.has_result = False # cached or not self.cached = False # valid or not self.valid = True # list of warnings (without duplicates) self.warnings = [] # list of infos self.info = [] # content size self.size = -1 # download time self.dltime = -1 # download size self.dlsize = -1 # check time self.checktime = 0 # connection object self.url_connection = None # data of url content, (data == None) means no data is available self.data = None # cache keys, are set by build_url() calling set_cache_keys() self.cache_url_key = None self.cache_content_key = None # extern flags (is_extern, is_strict), both enabled as default self.extern = (1, 1) # flag if the result should be cached self.caching = True # title is either the URL or parsed from content self.title = None # flag if content should be checked or not self.do_check_content = True def set_result (self, msg, valid=True, overwrite=False): """ Set result string and validity. """ if self.has_result and not overwrite: log.warn(LOG_CHECK, "Double result %r (previous %r) for %s", msg, self.result, self) else: self.has_result = True if not isinstance(msg, unicode): log.warn(LOG_CHECK, "Non-unicode result for %s: %r", self, msg) elif not msg: log.warn(LOG_CHECK, "Empty result for %s", self) self.result = msg self.valid = valid def get_title (self): """Return title of page the URL refers to. This is per default the filename or the URL.""" if self.title is None: url = u"" if self.base_url: url = self.base_url elif self.url: url = self.url self.title = url if "/" in url: title = url.rsplit("/", 1)[1] if title: self.title = title return self.title def set_title_from_content (self): """Set title of page the URL refers to.from page content.""" if self.valid: try: handler = linkparse.TitleFinder() except tuple(ExcList): return parser = htmlsax.parser(handler) handler.parser = parser # parse try: parser.feed(self.get_content()) parser.flush() except linkparse.StopParse, msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) # break cyclic dependencies handler.parser = None parser.handler = None if handler.title: self.title = handler.title def is_parseable (self): """ Return True iff content of this url is parseable. """ return False def is_html (self): """ Return True iff content of this url is HTML formatted. """ return False def is_css (self): """Return True iff content of this url is CSS stylesheet.""" return False def is_http (self): """ Return True for http:// URLs. """ return False def is_file (self): """ Return True for file:// URLs. """ return False def add_warning (self, s, tag=None): """ Add a warning string. """ item = (tag, s) if item not in self.warnings: self.warnings.append(item) def add_info (self, s): """ Add an info string. """ if s not in self.info: self.info.append(s) def copy_from_cache (self, cache_data): """ Fill attributes from cache data. """ self.result = cache_data["result"] self.has_result = True for tag, msg in cache_data["warnings"]: # do not copy anchor warnings, since the current anchor # might have changed if tag != WARN_URL_ANCHOR_NOT_FOUND: self.add_warning(msg, tag=tag) for info in cache_data["info"]: self.add_info(info) self.valid = cache_data["valid"] self.dltime = cache_data["dltime"] self.dlsize = cache_data["dlsize"] self.anchors = cache_data["anchors"] self.cached = True # recheck anchor if self.valid and self.anchor: self.check_anchor() def get_cache_data (self): """Return all data values that should be put in the cache.""" return {"result": self.result, "warnings": self.warnings, "info": self.info, "valid": self.valid, "dltime": self.dltime, "dlsize": self.dlsize, "anchors": self.anchors, } def get_alias_cache_data (self): """Return all data values that should be put in the cache. Intended to be overridden by subclasses that handle aliases. """ return self.get_cache_data() def set_cache_keys (self): """ Set keys for URL checking and content recursion. """ # remove anchor from content cache key since we assume # URLs with different anchors to have the same content self.cache_content_key = urlparse.urlunsplit(self.urlparts[:4]+[u'']) assert isinstance(self.cache_content_key, unicode), self log.debug(LOG_CACHE, "Content cache key %r", self.cache_content_key) # construct cache key self.cache_url_key = self.cache_content_key assert isinstance(self.cache_url_key, unicode), self log.debug(LOG_CACHE, "URL cache key %r", self.cache_url_key) def check_syntax (self): """ Called before self.check(), this function inspects the url syntax. Success enables further checking, failure immediately logs this url. Syntax checks must not use any network resources. """ log.debug(LOG_CHECK, "checking syntax") if self.base_url is None: self.set_result(_("URL is missing"), valid=False) return if not (self.base_url or self.parent_url): self.set_result(_("URL is empty"), valid=False) return try: self.build_url() # check url warnings effectiveurl = urlparse.urlunsplit(self.urlparts) if self.url != effectiveurl: self.add_warning(_("Effective URL %(url)r.") % {"url": effectiveurl}, tag=WARN_URL_EFFECTIVE_URL) self.url = effectiveurl except tuple(ExcSyntaxList), msg: self.set_result(unicode_safe(msg), valid=False) return self.set_cache_keys() def build_url (self): """ Construct self.url and self.urlparts out of the given base url information self.base_url, self.parent_url and self.base_ref. """ # norm base url - can raise UnicodeError from url.idna_encode() base_url, is_idn = url_norm(self.base_url, self.encoding) if is_idn: self.add_warning(_("""URL %(url)r has a unicode domain name which is not yet widely supported. You should use the URL %(idna_url)r instead.""") % \ {"url": self.base_url, "idna_url": base_url}, tag=WARN_URL_UNICODE_DOMAIN) # make url absolute if self.base_ref: # use base reference as parent url if ":" not in self.base_ref: # some websites have a relative base reference self.base_ref = urljoin(self.parent_url, self.base_ref, self.scheme) self.url = urljoin(self.base_ref, base_url, self.scheme) elif self.parent_url: # strip the parent url query and anchor urlparts = list(urlparse.urlsplit(self.parent_url)) urlparts[3] = urlparts[4] = "" parent_url = urlparse.urlunsplit(urlparts) self.url = urljoin(parent_url, base_url, self.scheme) else: self.url = base_url # note: urljoin can unnorm the url path, so norm it again urlparts = list(urlparse.urlsplit(self.url)) if urlparts[2]: urlparts[2] = urlutil.collapse_segments(urlparts[2]) self.url = urlparse.urlunsplit(urlparts) # split into (modifiable) list self.urlparts = strformat.url_unicode_split(self.url) # and unsplit again self.url = urlparse.urlunsplit(self.urlparts) # check userinfo@host:port syntax self.userinfo, host = urllib.splituser(self.urlparts[1]) # set host lowercase if self.userinfo: self.urlparts[1] = "%s@%s" % (self.userinfo, host.lower()) else: self.urlparts[1] = host.lower() # safe anchor for later checking self.anchor = self.urlparts[4] self.host, self.port = urllib.splitport(host) if self.port is not None: if not urlutil.is_numeric_port(self.port): raise LinkCheckerError(_("URL has invalid port %(port)r") % {"port": str(self.port)}) self.port = int(self.port) self.check_obfuscated_ip() def check_obfuscated_ip (self): """Warn if host of this URL is obfuscated IP address.""" # check if self.host can be an IP address if self.scheme not in ("ftp", "http", "mailto", "news", "nntp", "telnet"): return # check for obfuscated IP address if iputil.is_obfuscated_ip(self.host): ips = iputil.resolve_host(self.host) if ips: self.add_warning( _("URL %(url)s has obfuscated IP address %(ip)s") % \ {"url": self.base_url, "ip": ips.pop()}, tag=WARN_URL_OBFUSCATED_IP) def check (self): """Main check function for checking this URL.""" if self.aggregate.config["trace"]: trace.trace_on() try: self.local_check() except (socket.error, select.error): # on Unix, ctrl-c can raise # error: (4, 'Interrupted system call') etype, value = sys.exc_info()[:2] if etype == errno.EINTR: raise KeyboardInterrupt(value) else: raise finally: # close/release possible open connection self.close_connection() def add_country_info (self): """Try to ask GeoIP database for country info.""" if self.host: country = geoip.get_country(self.host) if country is not None: self.add_info(_("URL is located in %(country)s.") % {"country": _(country)}) def add_size_info (self): """Store size of URL content from meta info into self.size. Must be implemented in subclasses.""" pass def local_check (self): """Local check function can be overridden in subclasses.""" log.debug(LOG_CHECK, "Checking %s", self) # start time for check check_start = time.time() self.set_extern(self.url) if self.extern[0] and self.extern[1]: self.add_info(_("Outside of domain filter, checked only syntax.")) return # check connection log.debug(LOG_CHECK, "checking connection") try: self.check_connection() self.add_size_info() self.add_country_info() except tuple(ExcList): value = self.handle_exception() # make nicer error msg for unknown hosts if isinstance(value, socket.error) and value.args[0] == -2: value = _('Hostname not found') # make nicer error msg for bad status line if isinstance(value, httplib.BadStatusLine): value = _('Bad HTTP response %(line)r') % {"line": str(value)} self.set_result(unicode_safe(value), valid=False) self.checktime = time.time() - check_start if self.do_check_content: # check content and recursion try: self.check_content() if self.allows_recursion(): self.parse_url() # check content size self.check_size() except tuple(ExcList): value = self.handle_exception() self.add_warning(_("could not get content: %(msg)r") % {"msg": str(value)}, tag=WARN_URL_ERROR_GETTING_CONTENT) def close_connection (self): """ Close an opened url connection. """ if self.url_connection is None: # no connection is open return try: self.url_connection.close() except Exception: # ignore close errors pass self.url_connection = None def handle_exception (self): """ An exception occurred. Log it and set the cache flag. """ etype, value = sys.exc_info()[:2] log.debug(LOG_CHECK, "Error in %s: %s %s", self.url, etype, value, exception=True) # note: etype must be the exact class, not a subclass if (etype in ExcNoCacheList) or \ (etype == socket.error and value.args[0]==errno.EBADF) or \ not value: # EBADF occurs when operating on an already socket self.caching = False errmsg = etype.__name__ if str(value): # use Exception class name errmsg += ": %s" % str(value) # limit length to 240 return strformat.limit(errmsg, length=240) def check_connection (self): """ The basic connection check uses urllib2.urlopen to initialize a connection object. """ self.url_connection = urllib2.urlopen(self.url) def allows_recursion (self): """ Return True iff we can recurse into the url's content. """ log.debug(LOG_CHECK, "checking recursion of %r ...", self.url) # Test self.valid before self.is_parseable(). if not self.valid: log.debug(LOG_CHECK, "... no, invalid.") return False if not self.is_parseable(): log.debug(LOG_CHECK, "... no, not parseable.") return False if not self.can_get_content(): log.debug(LOG_CHECK, "... no, cannot get content.") return False rec_level = self.aggregate.config["recursionlevel"] if rec_level >= 0 and self.recursion_level >= rec_level: log.debug(LOG_CHECK, "... no, maximum recursion level reached.") return False if self.extern[0]: log.debug(LOG_CHECK, "... no, extern.") return False if not self.content_allows_robots(): log.debug(LOG_CHECK, "... no, robots.") return False log.debug(LOG_CHECK, "... yes, recursion.") return True def content_allows_robots (self): """ Return False if the content of this URL forbids robots to search for recursive links. """ if not self.is_html(): return True if not (self.is_http() or self.is_file()): return True # construct parser object handler = linkparse.MetaRobotsFinder() parser = htmlsax.parser(handler) handler.parser = parser # parse try: parser.feed(self.get_content()) parser.flush() except linkparse.StopParse, msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) # break cyclic dependencies handler.parser = None parser.handler = None return handler.follow def get_anchors (self): """Store list of anchors for this URL. Precondition: this URL is an HTML resource.""" log.debug(LOG_CHECK, "Getting HTML anchors %s", self) handler = linkparse.LinkFinder(self.add_anchor, tags={'a': [u'name'], None: [u'id']}) parser = htmlsax.parser(handler) handler.parser = parser # parse try: parser.feed(self.get_content()) parser.flush() except linkparse.StopParse, msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) # break cyclic dependencies handler.parser = None parser.handler = None def add_anchor (self, url, line, column, name, base): """Add anchor URL.""" self.anchors.append((url, line, column, name, base)) def check_anchor (self): """If URL was valid and has an anchor, check it. A warning is logged if the anchor is not found. """ if not self.aggregate.config["anchors"]: return log.debug(LOG_CHECK, "checking anchor %r", self.anchor) if any(x for x in self.anchors if x[0] == self.anchor): return anchors = u",".join(u"`%s'" % x[0] for x in self.anchors) args = {"name": self.anchor, "anchors": anchors} msg = u"%s %s" % (_("Anchor `%(name)s' not found.") % args, _("Available anchors: %(anchors)s.") % args) self.add_warning(msg, tag=WARN_URL_ANCHOR_NOT_FOUND) def set_extern (self, url): """ Match URL against extern and intern link patterns. If no pattern matches the URL is extern. Sets self.extern to a tuple (bool, bool) with content (is_extern, is_strict). @return: None """ for entry in self.aggregate.config["externlinks"]: match = entry['pattern'].search(url) if (entry['negate'] and not match) or \ (match and not entry['negate']): log.debug(LOG_CHECK, "Extern URL %r", url) self.extern = (1, entry['strict']) return for entry in self.aggregate.config["internlinks"]: match = entry['pattern'].search(url) if (entry['negate'] and not match) or \ (match and not entry['negate']): log.debug(LOG_CHECK, "Intern URL %r", url) self.extern = (0, 0) return log.debug(LOG_CHECK, "Explicit extern URL %r", url) self.extern = (1, 0) return def can_get_content (self): """Indicate wether url get_content() can be called.""" return True def get_content (self): """Precondition: url_connection is an opened URL.""" if self.data is None: log.debug(LOG_CHECK, "Get content of %r", self.url) t = time.time() self.data, self.dlsize = self.read_content() self.dltime = time.time() - t return self.data def read_content (self): """Return data and data size for this URL. Can be overridden in subclasses.""" data = self.url_connection.read() return data, len(data) def check_content (self): """Check content data for warnings, syntax errors, viruses etc.""" if not (self.valid and self.can_get_content()): return if self.is_html(): self.set_title_from_content() if self.aggregate.config["anchors"]: self.get_anchors() if self.anchor: self.check_anchor() self.check_warningregex() # is it an intern URL? if not self.extern[0]: # check HTML/CSS syntax if self.aggregate.config["checkhtml"] and self.is_html(): self.check_html() if self.aggregate.config["checkcss"] and self.is_css(): self.check_css() if self.aggregate.config["checkhtmlw3"] and self.is_html(): self.check_html_w3() if self.aggregate.config["checkcssw3"] and self.is_css(): self.check_css_w3() # check with clamav if self.aggregate.config["scanvirus"]: self.scan_virus() def check_warningregex (self): warningregex = self.aggregate.config["warningregex"] if warningregex: log.debug(LOG_CHECK, "checking content") try: match = warningregex.search(self.get_content()) if match: self.add_warning(_("Found %(match)r in link contents.") % {"match": match.group()}, tag=WARN_URL_WARNREGEX_FOUND) except tuple(ExcList): value = self.handle_exception() self.set_result(unicode_safe(value), valid=False) def check_size (self): """Check content size if it is zero or larger than a given maximum size. """ if self.dlsize == 0: self.add_warning(_("Content size is zero."), tag=WARN_URL_CONTENT_SIZE_ZERO) else: maxbytes = self.aggregate.config["warnsizebytes"] if maxbytes is not None and self.dlsize >= maxbytes: self.add_warning( _("Content size %(dlsize)s is larger than %(maxbytes)s.") % {"dlsize": strformat.strsize(self.dlsize), "maxbytes": strformat.strsize(maxbytes)}, tag=WARN_URL_CONTENT_SIZE_TOO_LARGE) if self.size != -1 and self.dlsize != -1 and self.dlsize != self.size: self.add_warning(_("Download size (%(dlsize)d Byte) " "does not equal content size (%(size)d Byte).") % {"dlsize": self.dlsize, "size": self.size}, tag=WARN_URL_CONTENT_SIZE_UNEQUAL) def check_html (self): """Check HTML syntax of this page (which is supposed to be HTML) with the local HTML tidy module.""" try: import tidy except ImportError: log.warn(LOG_CHECK, _("warning: tidy module is not available; " \ "download from http://utidylib.berlios.de/")) return options = dict(output_html=0, show_warnings=1, quiet=True, input_encoding='utf8', output_encoding='utf8', tidy_mark=0) try: doc = tidy.parseString(self.get_content(), **options) errors = filter_tidy_errors(doc.errors) if errors: for err in errors: self.add_warning(u"HTMLTidy: %s" % err) else: self.add_info(u"HTMLTidy: %s" % _("valid HTML syntax")) except Exception: # catch _all_ exceptions since we dont want third party module # errors to propagate into this library err = str(sys.exc_info()[1]) log.warn(LOG_CHECK, _("warning: tidy HTML parsing caused error: %(msg)s ") % {"msg": err}) def check_css (self): """Check CSS syntax of this page (which is supposed to be CSS) with the local cssutils module.""" try: import cssutils except ImportError: log.warn(LOG_CHECK, _("warning: cssutils module is not available; " \ "download from http://cthedot.de/cssutils/")) return try: csslog = logging.getLogger('cssutils') csslog.propagate = 0 del csslog.handlers[:] handler = StoringHandler() csslog.addHandler(handler) csslog.setLevel(logging.WARN) cssparser = cssutils.CSSParser(log=csslog) cssparser.parseString(self.get_content(), href=self.url) if handler.storage: for record in handler.storage: self.add_warning(u"cssutils: %s" % record.getMessage()) else: self.add_info(u"cssutils: %s" % _("valid CSS syntax")) except Exception: # catch _all_ exceptions since we dont want third party module # errors to propagate into this library err = str(sys.exc_info()[1]) log.warn(LOG_CHECK, _("warning: cssutils parsing caused error: %(msg)s") % {"msg": err}) def check_html_w3 (self): """Check HTML syntax of this page (which is supposed to be HTML) with the online W3C HTML validator documented at http://validator.w3.org/docs/api.html """ self.aggregate.check_w3_time() try: u = urllib2.urlopen('http://validator.w3.org/check', urllib.urlencode({ 'fragment': self.get_content(), 'output': 'xml', })) if u.headers.get('x-w3c-validator-status', 'Invalid') == 'Valid': self.add_info(u"W3C Validator: %s" % _("valid HTML syntax")) return from xml.dom.minidom import parseString dom = parseString(u.read()) elements = dom.getElementsByTagName('messages')[0].getElementsByTagName('msg') for msg in [e.firstChild.wholeText for e in elements]: self.add_warning(u"W3C HTML validation: %s" % msg) except Exception: # catch _all_ exceptions since we dont want third party module # errors to propagate into this library err = str(sys.exc_info()[1]) log.warn(LOG_CHECK, _("warning: HTML W3C validation caused error: %(msg)s ") % {"msg": err}) def check_css_w3 (self): """Check CSS syntax of this page (which is supposed to be CSS) with the online W3C CSS validator documented at http://jigsaw.w3.org/css-validator/manual.html#expert """ self.aggregate.check_w3_time() try: host = 'jigsaw.w3.org' path = '/css-validator/validator' params = { 'text': "div {}", 'warning': '2', 'output': 'soap12', } fields = params.items() content_type, body = httputil.encode_multipart_formdata(fields) h = httplib.HTTPConnection(host) h.putrequest('POST', path) h.putheader('Content-Type', content_type) h.putheader('Content-Length', str(len(body))) h.endheaders() h.send(body) r = h.getresponse(True) if r.getheader('X-W3C-Validator-Status', 'Invalid') == 'Valid': self.add_info(u"W3C Validator: %s" % _("valid CSS syntax")) return from xml.dom.minidom import parseString dom = parseString(r.read()) elements = dom.getElementsByTagName('m:errors')[0].getElementsByTagName('m:error') for msg in [e.firstChild.wholeText for e in elements]: self.add_warning(u"W3C HTML validation: %s" % msg) except Exception: # catch _all_ exceptions since we dont want third party module # errors to propagate into this library err = str(sys.exc_info()[1]) log.warn(LOG_CHECK, _("warning: CSS W3C validation caused error: %(msg)s ") % {"msg": err}) def scan_virus (self): """Scan content for viruses.""" infected, errors = clamav.scan(self.get_content()) for msg in infected: self.add_warning(u"Virus scan infection: %s" % msg) for msg in errors: self.add_warning(u"Virus scan error: %s" % msg) def parse_url (self): """ Parse url content and search for recursive links. Default parse type is html. """ self.parse_html() def get_user_password (self): """Get tuple (user, password) from configured authentication. Both user and password can be None. """ return self.aggregate.config.get_user_password(self.url) def parse_html (self): """Parse into HTML content and search for URLs to check. Found URLs are added to the URL queue. """ log.debug(LOG_CHECK, "Parsing HTML %s", self) # construct parser object handler = linkparse.LinkFinder(self.add_url) parser = htmlsax.parser(handler) handler.parser = parser # parse try: parser.feed(self.get_content()) parser.flush() except linkparse.StopParse, msg: log.debug(LOG_CHECK, "Stopped parsing: %s", msg) # break cyclic dependencies handler.parser = None parser.handler = None def add_url (self, url, line, column, name, base): """Queue URL data for checking.""" base_ref = urlutil.url_norm(base)[0] url_data = get_url_from(url, self.recursion_level+1, self.aggregate, parent_url=self.url, base_ref=base_ref, line=line, column=column, name=name) self.aggregate.urlqueue.put(url_data) def parse_opera (self): """Parse an opera bookmark file.""" log.debug(LOG_CHECK, "Parsing Opera bookmarks %s", self) name = None lineno = 0 for line in self.get_content().splitlines(): lineno += 1 line = line.strip() if line.startswith("NAME="): name = line[5:] elif line.startswith("URL="): url = line[4:] if url and name is not None: url_data = get_url_from(url, self.recursion_level+1, self.aggregate, parent_url=self.url, line=lineno, name=name) self.aggregate.urlqueue.put(url_data) else: name = None def parse_text (self): """ Parse a text file with on url per line; comment and blank lines are ignored. """ log.debug(LOG_CHECK, "Parsing text %s", self) lineno = 0 for line in self.get_content().splitlines(): lineno += 1 line = line.strip() if not line or line.startswith('#'): continue url_data = get_url_from(line, self.recursion_level+1, self.aggregate, parent_url=self.url, line=lineno) self.aggregate.urlqueue.put(url_data) def parse_css (self): """ Parse a CSS file for url() patterns. """ log.debug(LOG_CHECK, "Parsing CSS %s", self) lineno = 0 linkfinder = linkparse.css_url_re.finditer strip_comments = linkparse.strip_c_comments for line in strip_comments(self.get_content()).splitlines(): lineno += 1 for mo in linkfinder(line): column = mo.start("url") url = strformat.unquote(mo.group("url").strip()) url_data = get_url_from(url, self.recursion_level+1, self.aggregate, parent_url=self.url, line=lineno, column=column) self.aggregate.urlqueue.put(url_data) def parse_swf (self): """Parse a SWF file for URLs.""" linkfinder = linkparse.swf_url_re.finditer for mo in linkfinder(self.get_content()): url = mo.group() url_data = get_url_from(url, self.recursion_level+1, self.aggregate, parent_url=self.url) self.aggregate.urlqueue.put(url_data) def parse_word (self): """Parse a word file for hyperlinks.""" if not winutil.has_word(): return filename = self.get_temp_filename() # open word file and parse hyperlinks try: app = winutil.get_word_app() try: doc = winutil.open_wordfile(app, filename) try: for link in doc.Hyperlinks: url_data = get_url_from(link.Address, self.recursion_level+1, self.aggregate, parent_url=self.url, name=link.TextToDisplay) self.aggregate.urlqueue.put(url_data) finally: winutil.close_wordfile(doc) finally: winutil.close_word_app(app) except winutil.Error, msg: log.warn(LOG_CHECK, "Error parsing word file: %s", msg) def get_temp_filename (self): """Get temporary filename for content to parse.""" # store content in temporary file fd, filename = tempfile.mkstemp(suffix='.doc', prefix='lc_') fp = os.fdopen(fd) fp.write(self.get_content()) fp.close() def serialized (self): """ Return serialized url check data as unicode string. """ sep = unicode_safe(os.linesep) if self.base_url is not None: assert isinstance(self.base_url, unicode), self if self.parent_url is not None: assert isinstance(self.parent_url, unicode), self if self.base_ref is not None: assert isinstance(self.base_ref, unicode), self assert isinstance(self.name, unicode), self return sep.join([ u"%s link" % self.scheme, u"base_url=%r" % self.base_url, u"parent_url=%r" % self.parent_url, u"base_ref=%r" % self.base_ref, u"recursion_level=%s" % self.recursion_level, u"url_connection=%s" % self.url_connection, u"line=%d" % self.line, u"column=%d" % self.column, u"name=%r" % self.name, ]) def get_intern_pattern (self): """ Get pattern for intern URL matching. @return non-empty regex pattern or None @rtype String or None """ return None def __str__ (self): """ Get URL info. @return: URL info, encoded with the output logger encoding @rtype: string """ s = self.serialized() return self.aggregate.config['logger'].encode(s) def __repr__ (self): """ Get URL info. @return: URL info @rtype: unicode """ return u"<%s >" % self.serialized() def to_wire_dict (self): """Return a simplified transport object for logging. The transport object must contain these attributes: - url_data.valid: bool Indicates if URL is valid - url_data.cached: bool Indicates if URL data has been loaded from cache. - url_data.result: unicode Result string - url_data.warnings: list of unicode List of tagged warnings for this URL. - url_data.name: unicode string or None name of URL (eg. filename or link name) - url_data.parent_url: unicode or None Parent URL - url_data.base_ref: unicode or None HTML base reference URL of parent - url_data.url: unicode or None Fully qualified URL. - url_data.checktime: int Number of seconds needed to check this link, default: zero. - url_data.dltime: int Number of seconds needed to download URL content, default: -1 - url_data.dlsize: int Size of downloaded URL content, default: -1 - url_data.info: list of unicode Additional information about this URL. - url_data.line: int Line number of this URL at parent document, or -1 - url_data.column: int Column number of this URL at parent document, or -1 """ return dict(valid=self.valid, extern=self.extern[0], cached=self.cached, result=self.result, warnings=[x[1] for x in self.warnings], name=self.name or u"", title=self.get_title(), parent_url=self.parent_url or u"", base_ref=self.base_ref or u"", base_url=self.base_url or u"", url=self.url or u"", checktime=self.checktime, dltime=self.dltime, dlsize=self.dlsize, info=self.info, line=self.line, column=self.column, cache_url_key=self.cache_url_key, ) def to_wire (self): return CompactUrlData(self.to_wire_dict()) def filter_tidy_errors (errors): """Filter certain errors from HTML tidy run.""" return [x for x in errors if not \ (x.severity=='W' and x.message==' lacks "summary" attribute')] urlDataAttr = [ 'valid', 'extern', 'cached', 'result', 'warnings', 'name', 'title', 'parent_url', 'base_ref', 'base_url', 'url', 'checktime', 'dltime', 'dlsize', 'info', 'line', 'column', 'cache_url_key', ] class CompactUrlData (object): __slots__ = urlDataAttr def __init__(self, wired_url_data): '''Set all attributes according to the dictionnary wired_url_data''' for attr in urlDataAttr: setattr(self, attr, wired_url_data[attr])