linkchecker/linkcheck/checker/httpurl.py

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2010 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Handle http links.
"""

import urlparse
import urllib
import re
import errno
import zlib
import socket
from cStringIO import StringIO
import Cookie

from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
    httplib2 as httplib, LinkCheckerError, configuration)
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
    get_url_from)
# import warnings
from .const import WARN_HTTP_ROBOTS_DENIED, \
    WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \
    WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
    WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
    WARN_HTTP_AUTH_UNKNOWN

# helper alias
unicode_safe = strformat.unicode_safe

# True if linkcheck's httplib module offers an HTTPSConnection class;
# get_http_object() refuses https URLs when this is False
supportHttps = hasattr(httplib, "HTTPSConnection")

# Content-Encoding values that HttpUrl._read_content() tries to decompress
_supported_encodings = ('gzip', 'x-gzip', 'deflate')

# Amazon blocks all HEAD requests
# (matcher is applied to the network location part of the URL)
_is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search


class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
    """
    Url link with http scheme.
    """

    def reset (self):
        """
        Reset the base class state, then set every HTTP specific
        attribute of this URL object to its default value.
        """
        super(HttpUrl, self).reset()
        # upper bound for the number of redirections that are followed
        self.max_redirects = 5
        # boolean state flags, all initially off:
        #  - has301status: the "moved permanent" warning was issued
        #  - fallback_get: check had to fall back from HEAD to GET
        #  - persistent: the current connection is persistent
        #  - reused_connection: the connection came from the pool
        self.has301status = self.fallback_get = False
        self.persistent = self.reused_connection = False
        # GET is allowed until robots.txt says otherwise
        self.method_get_allowed = True
        # URLs seen through 301/302 redirections
        self.aliases = []
        # cookies sent with the request
        self.cookies = []
        # response headers, authorization header value and the
        # temporary content buffer are unset until a request is made
        self.headers = self.auth = self._data = None

    def allows_robots (self, url):
        """
        Ask the shared robots.txt cache whether LinkChecker may fetch
        the content of the given url. The robots.txt file is the one
        returned by get_robots_txt_url(). HEAD requests are still allowed
        even when this returns False.

        @param url: the url to be requested
        @type url: string
        @return: True if access is granted, otherwise False
        @rtype: bool
        """
        robots_txt_url = self.get_robots_txt_url()
        username, secret = self.get_user_password()
        return self.aggregate.robots_txt.allows_url(
            robots_txt_url, url, self.proxy, username, secret,
            callback=self.aggregate.connections.host_wait)

    def add_size_info (self):
        """Get size of URL content from HTTP header."""
        if self.headers and "Content-Length" in self.headers and \
           "Transfer-Encoding" not in self.headers:
            # Note that content-encoding causes size differences since
            # the content data is always decoded.
            try:
                self.size = int(self.headers["Content-Length"])
                if self.dlsize == -1:
                    self.dlsize = self.size
            except (ValueError, OverflowError):
                pass
        else:
            self.size = -1

    def check_connection (self):
        """
        Check a URL with HTTP protocol.
        Here is an excerpt from RFC 1945 with common response codes:
        The first digit of the Status-Code defines the class of response. The
        last two digits do not have any categorization role. There are 5
        values for the first digit:
          - 1xx: Informational - Not used, but reserved for future use
          - 2xx: Success - The action was successfully received,
            understood, and accepted.
          - 3xx: Redirection - Further action must be taken in order to
            complete the request
          - 4xx: Client Error - The request contains bad syntax or cannot
            be fulfilled
          - 5xx: Server Error - The server failed to fulfill an apparently
            valid request

        The request is first tried with HEAD (GET for Amazon hosts when
        allowed); check_http_connection() handles all further fallbacks.
        The response is always closed, even if evaluating it fails.
        """
        # set the proxy, so a 407 status after this is an error
        self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
        # check robots.txt
        if not self.allows_robots(self.url):
            # content must not be fetched; only HEAD requests are used
            self.add_warning(
                 _("Access denied by robots.txt, skipping content checks."),
                 tag=WARN_HTTP_ROBOTS_DENIED)
            self.method_get_allowed = False
        # first try with HEAD
        self.method = "HEAD"
        # check for amazon server quirk
        if _is_amazon(self.urlparts[1]):
            self.add_info(_("Amazon servers block HTTP HEAD requests."))
            if self.method_get_allowed:
                self.add_info(_("Using GET method for Amazon server."))
                self.method = "GET"
        # check the http connection
        response = self.check_http_connection()
        try:
            if self.headers and "Server" in self.headers:
                server = self.headers['Server']
            else:
                server = _("unknown")
            if self.fallback_get:
                self.add_info(
                    _("Server `%(name)s' did not support HEAD request; "
                      "a GET request was used instead.") %
                      {"name": server})
            # redirections might have changed the URL
            self.url = urlparse.urlunsplit(self.urlparts)
            # check response
            if response:
                self.check_response(response)
        finally:
            # do not leak the response when one of the steps above raises
            if response:
                response.close()

    def check_http_connection (self):
        """
        Check HTTP connection and return get response and a flag
        if the check algorithm had to fall back to the GET method.

        @return: response or None if url is already handled
        @rtype: HttpResponse or None
        """
        response = None
        while True:
            if response is not None:
                response.close()
            try:
                response = self._try_http_response()
            except httplib.BadStatusLine, msg:
                # some servers send empty HEAD replies
                if self.method == "HEAD" and self.method_get_allowed:
                    log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                raise
            except socket.error, msg:
                # some servers reset the connection on HEAD requests
                if self.method == "HEAD" and self.method_get_allowed and \
                   msg[0] == errno.ECONNRESET:
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                raise
            if response.reason:
                response.reason = unicode_safe(response.reason)
            log.debug(LOG_CHECK,
                "Response: %s %s", response.status, response.reason)
            log.debug(LOG_CHECK, "Headers: %s", self.headers)
            # proxy enforcement (overrides standard proxy)
            if response.status == 305 and self.headers:
                oldproxy = (self.proxy, self.proxyauth)
                newproxy = self.headers.getheader("Location")
                self.add_info(_("Enforced proxy `%(name)s'.") %
                              {"name": newproxy})
                self.set_proxy(newproxy)
                if not self.proxy:
                    self.set_result(
                         _("Enforced proxy `%(name)s' ignored, aborting.") %
                         {"name": newproxy},
                         valid=False)
                    return response
                response.close()
                response = self._try_http_response()
                # restore old proxy settings
                self.proxy, self.proxyauth = oldproxy
            try:
                tries, response = self.follow_redirections(response)
            except httplib.BadStatusLine, msg:
                # some servers send empty HEAD replies
                if self.method == "HEAD" and self.method_get_allowed:
                    log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                raise
            if tries == -1:
                log.debug(LOG_CHECK, "already handled")
                response.close()
                self.do_check_content = False
                return None
            if tries >= self.max_redirects:
                if self.method == "HEAD" and self.method_get_allowed:
                    # Microsoft servers tend to recurse HEAD requests
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                self.set_result(_("more than %d redirections, aborting") %
                                self.max_redirects, valid=False)
                return response
            # user authentication
            if response.status == 401:
                authenticate = self.headers.get('WWW-Authenticate')
                if not authenticate or not authenticate.startswith("Basic"):
                    # LinkChecker only supports Basic authorization
                    args = {"auth": authenticate}
                    self.add_warning(
                       _("Unsupported HTTP authentication `%(auth)s', " \
                         "only `Basic' authentication is supported.") % args,
                       tag=WARN_HTTP_AUTH_UNKNOWN)
                    return
                if not self.auth:
                    import base64
                    _user, _password = self.get_user_password()
                    self.auth = "Basic " + \
                        base64.encodestring("%s:%s" % (_user, _password))
                    log.debug(LOG_CHECK,
                        "Authentication %s/%s", _user, _password)
                    continue
            elif response.status >= 400:
                # retry with GET (but do not set fallback flag)
                if self.method == "HEAD" and self.method_get_allowed:
                    self.method = "GET"
                    self.aliases = []
                    continue
            elif self.headers and self.method == "HEAD" and self.method_get_allowed:
                # test for HEAD support
                mime = headers.get_content_type(self.headers)
                poweredby = self.headers.get('X-Powered-By', '')
                server = self.headers.get('Server', '')
                if mime in ('application/octet-stream', 'text/plain') and \
                  (poweredby.startswith('Zope') or server.startswith('Zope')):
                    # Zope server could not get Content-Type with HEAD
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
            break
        return response

    def follow_redirections (self, response, set_result=True):
        """
        Follow all 301 and 302 redirections of the given http response,
        at most self.max_redirects times.

        @param response: the response to start with
        @param set_result: if True, store warnings, infos and results of
          the redirection handling in this URL object
        @type set_result: bool
        @return: tuple (tries, response). tries is the number of followed
          redirections, self.max_redirects if a recursive redirection
          was detected for a HEAD request that may be retried with GET,
          or -1 if the URL needs no further checking.
        @rtype: tuple (int, HttpResponse)
        """
        log.debug(LOG_CHECK, "follow all redirections")
        redirected = self.url
        tries = 0
        while response.status in [301, 302] and self.headers and \
              tries < self.max_redirects:
            newurl = self.headers.getheader("Location",
                         self.headers.getheader("Uri", ""))
            # make new url absolute and unicode
            newurl = unicode_safe(newurl)
            newurl = urlparse.urljoin(redirected, newurl)
            log.debug(LOG_CHECK, "Redirected to %r", newurl)
            self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
            # norm base url - can raise UnicodeError from url.idna_encode()
            redirected, is_idn = urlbase.url_norm(newurl)
            if is_idn:
                pass # XXX warn about idn use
            log.debug(LOG_CHECK, "Norm redirected to %r", redirected)
            urlparts = strformat.url_unicode_split(redirected)
            host_changed = urlparts[1] != self.urlparts[1]
            if host_changed:
                # check extern filter again
                self.set_extern(redirected)
                if self.extern[0] and self.extern[1]:
                    if set_result:
                        self.check301status(response)
                        self.add_info(
                             _("The redirected URL is outside of the domain "
                               "filter, checked only syntax."))
                        self.set_result(u"filtered")
                    return -1, response
            # check robots.txt allowance again
            if not self.allows_robots(redirected):
                if set_result:
                    self.add_warning(
                       _("Access to redirected URL denied by robots.txt, "
                         "checked only syntax."),
                       tag=WARN_HTTP_ROBOTS_DENIED)
                    self.set_result(u"syntax OK")
                return -1, response
            # see about recursive redirect
            all_seen = [self.cache_url_key] + self.aliases
            if redirected in all_seen:
                if self.method == "HEAD" and self.method_get_allowed:
                    # Microsoft servers tend to recurse HEAD requests
                    # fall back to the original url and use GET
                    return self.max_redirects, response
                recursion = all_seen + [redirected]
                if set_result:
                    self.set_result(
                          _("recursive redirection encountered:\n %(urls)s") %
                            {"urls": "\n  => ".join(recursion)}, valid=False)
                return -1, response
            if urlparts[0] == self.scheme or urlparts[0] in ('http', 'https'):
                # remember redirected url as alias
                self.aliases.append(redirected)
            else:
                # in case of changed scheme make new URL object
                newobj = get_url_from(
                          redirected, self.recursion_level, self.aggregate,
                          parent_url=self.parent_url, base_ref=self.base_ref,
                          line=self.line, column=self.column, name=self.name)
                if set_result:
                    self.add_warning(
                     _("Redirection to URL `%(newurl)s' with different scheme"
                       " found; the original URL was `%(url)s'.") %
                     {"url": self.url, "newurl": newobj.url},
                     tag=WARN_HTTP_WRONG_REDIRECT)
                    self.set_result(u"syntax OK")
                # append new object to queue
                self.aggregate.urlqueue.put(newobj)
                # pretend to be finished and logged
                return -1, response
            if host_changed:
                # close_connection() builds the connection pool key from
                # self.urlparts. Release the connection while urlparts
                # still names the host it was opened for; otherwise it
                # would be pooled under the redirection target host and
                # get reused for requests to that host.
                response.close()
                self.close_connection()
            # note: urlparts has to be a list
            self.urlparts = urlparts
            if set_result:
                self.check301status(response)
            # check cache again on the changed URL
            if self.aggregate.urlqueue.checked_redirect(redirected, self):
                return -1, response
            # new response data
            response.close()
            response = self._try_http_response()
            tries += 1
        return tries, response

    def check301status (self, response):
        """If response page has been permanently moved add a warning."""
        if response.status == 301 and not self.has301status:
            self.add_warning(_("HTTP 301 (moved permanent) encountered: you"
                               " should update this link."),
                             tag=WARN_HTTP_MOVED_PERMANENT)
            self.has301status = True

    def get_alias_cache_data (self):
        """
        Return all data values that should be put in the cache,
        minus redirection warnings.
        """
        data = self.get_cache_data()
        data["warnings"] = [
            x for x in self.warnings if x[0] != "http-moved-permanent"]
        data["info"] = self.info
        return data

    def check_response (self, response):
        """Check final result and log it."""
        if response.status >= 400:
            self.set_result(u"%r %s" % (response.status, response.reason),
                            valid=False)
        else:
            if response.status == 204:
                # no content
                self.add_warning(unicode_safe(response.reason),
                                 tag=WARN_HTTP_EMPTY_CONTENT)
            # store cookies for valid links
            if self.aggregate.config['storecookies']:
                for c in self.cookies:
                    self.add_info(_("Sent cookie: %(cookie)s.") %
                                  {"cookie": c})
                try:
                    out = self.aggregate.cookies.add(self.headers,
                                                     self.urlparts[0],
                                                     self.urlparts[1],
                                                     self.urlparts[2])
                except Cookie.CookieError, msg:
                    self.add_warning(_("Could not store cookies: %(msg)s.") %
                                     {'msg': str(msg)},
                                     tag=WARN_HTTP_COOKIE_STORE_ERROR)
            if response.status >= 200:
                self.set_result(u"%r %s" % (response.status, response.reason))
            else:
                self.set_result(u"OK")
        modified = self.headers.get('Last-Modified', '')
        if modified:
            self.add_info(_("Last modified %(date)s.") % {"date": modified})

    def _try_http_response (self):
        """Try to get a HTTP response object. For reused persistent
        connections that the server closed unexpectedly (broken pipe on
        sending, or an empty status line on reading), the request is
        repeated once after the connection has been marked non-persistent.

        @return: the response of _get_http_response()
        @raise: socket.error or httplib.BadStatusLine for all failures
          that are not caused by a stale reused connection
        """
        try:
            return self._get_http_response()
        except socket.error, msg:
            if self.reused_connection and msg.args and \
               msg.args[0] == errno.EPIPE:
                # server closed persistent connection - retry
                log.debug(LOG_CHECK, "Server closed connection: retry")
                self.persistent = False
                return self._get_http_response()
            raise
        except httplib.BadStatusLine, msg:
            # An exception instance is always true, so the status line
            # itself has to be inspected. It is stored in the `line'
            # attribute and/or as first argument; some httplib versions
            # store repr('') for an empty line.
            line = getattr(msg, "line", None)
            if line is None and msg.args:
                line = msg.args[0]
            if self.reused_connection and (not line or line == repr("")):
                # server closed connection - retry
                log.debug(LOG_CHECK, "Empty status line: retry")
                self.persistent = False
                return self._get_http_response()
            raise

    def _get_http_response (self):
        """
        Send HTTP request and get response object.

        Any previous connection of this URL object is closed or released
        first. Afterwards a connection to the proxy (if one is set) or
        to the URL host is obtained, the request with all headers is
        sent and the response is read. As side effects self.timeout,
        self.headers, self.persistent and, when content is read here,
        self._data and self._size are updated.

        @return: the response object
        @rtype: HttpResponse
        """
        if self.proxy:
            host = self.proxy
            scheme = self.proxytype
        else:
            host = self.urlparts[1]
            scheme = self.urlparts[0]
        log.debug(LOG_CHECK, "Connecting to %r", host)
        # close/release a previous connection
        self.close_connection()
        self.url_connection = self.get_http_object(host, scheme)
        # the anchor fragment is not part of a HTTP URL, see
        # http://tools.ietf.org/html/rfc2616#section-3.2.2
        anchor = ''
        if self.proxy:
            # proxies get the full URL in the request line
            path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
                                 self.urlparts[2], self.urlparts[3], anchor))
        else:
            # direct requests only send path and query
            path = urlparse.urlunsplit(('', '', self.urlparts[2],
                                        self.urlparts[3], anchor))
        # Host and Accept-Encoding headers are skipped here since they
        # are written explicitly below
        self.url_connection.putrequest(self.method, path, skip_host=True,
                                       skip_accept_encoding=True)
        # be sure to use the original host as header even for proxies
        self.url_connection.putheader("Host", self.urlparts[1])
        # userinfo is from http://user:pass@host/
        if self.userinfo:
            self.url_connection.putheader("Authorization", self.userinfo)
        # auth is the -u and -p configuration options
        elif self.auth:
            self.url_connection.putheader("Authorization", self.auth)
        if self.proxyauth:
            self.url_connection.putheader("Proxy-Authorization",
                                         self.proxyauth)
        # only send http(s) parent URLs as referer
        if (self.parent_url and
            self.parent_url.startswith(('http://', 'https://'))):
            self.url_connection.putheader("Referer", self.parent_url)
        self.url_connection.putheader("User-Agent", configuration.UserAgent)
        self.url_connection.putheader("Accept-Encoding",
                                  "gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
        if self.aggregate.config['sendcookies']:
            # cookies are looked up for the URL host, not for the proxy
            scheme = self.urlparts[0]
            host = self.urlparts[1]
            port = urlutil.default_ports.get(scheme, 80)
            host, port = urllib.splitnport(host, port)
            path = self.urlparts[2]
            self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
            for c in self.cookies:
                name = c.client_header_name()
                value = c.client_header_value()
                self.url_connection.putheader(name, value)
        self.url_connection.endheaders()
        # NOTE(review): the True argument is presumably the buffering flag
        # of linkcheck's httplib getresponse() - confirm
        response = self.url_connection.getresponse(True)
        self.timeout = headers.http_timeout(response)
        self.headers = response.msg
        self.persistent = not response.will_close
        if self.persistent and self.method == "HEAD":
            # Some servers send page content after a HEAD request,
            # but only after making the *next* request. This breaks
            # protocol synchronisation. Workaround here is to close
            # the connection after HEAD.
            # Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk)
            self.persistent = False
        if self.persistent and (self.method == "GET" or
           self.headers.getheader("Content-Length") != "0"):
            # always read content from persistent connections
            self._read_content(response)
            assert not response.will_close
        # If possible, use official W3C HTTP response name
        if response.status in httplib.responses:
            response.reason = httplib.responses[response.status]
        return response

    def get_http_object (self, host, scheme):
        """
        Get a HTTP connection: either a cached one from the connection
        pool, or a newly opened one. self.reused_connection tells
        afterwards which of both was the case.

        @param host: the host to connect to
        @type host: string of the form <host>[:<port>]
        @param scheme: 'http' or 'https'
        @type scheme: string
        @return: open HTTP(S) connection
        @rtype: httplib.HTTP(S)Connection
        @raise: LinkCheckerError if the scheme is not supported
        """
        _user, _password = self.get_user_password()
        key = (scheme, self.urlparts[1], _user, _password)
        conn = self.aggregate.connections.get(key)
        if conn is not None:
            log.debug(LOG_CHECK, "reuse cached HTTP(S) connection %s", conn)
            self.reused_connection = True
            return conn
        # A new connection is opened: clear the flag of a former reuse,
        # else errors on the fresh connection would be mistaken for a
        # stale pooled connection and retried.
        self.reused_connection = False
        self.aggregate.connections.wait_for_host(host)
        if scheme == "http":
            h = httplib.HTTPConnection(host)
        elif scheme == "https" and supportHttps:
            h = httplib.HTTPSConnection(host)
        else:
            msg = _("Unsupported HTTP url scheme `%(scheme)s'") % {"scheme": scheme}
            raise LinkCheckerError(msg)
        if log.is_debug(LOG_CHECK):
            h.set_debuglevel(1)
        h.connect()
        return h

    def read_content (self):
        """Get content of the URL target with a GET request, following
        redirections. The temporary content buffer of this object is
        cleared afterwards.

        @return: tuple (data, size) with the URL content, decompressed
          if the content encoding is supported, and the size of the data
          as read from the connection
        @rtype: tuple (string, int)
        """
        assert self.method_get_allowed, 'unallowed content read'
        self.method = "GET"
        response = self._try_http_response()
        response = self.follow_redirections(response, set_result=False)[1]
        try:
            self.headers = response.msg
            # Re-read size info, since the GET request result could be
            # different than a former HEAD request.
            self.add_size_info()
            if self._data is None:
                self._read_content(response)
        finally:
            # the response is not handed out, so close it here
            response.close()
        data, size = self._data, self._size
        self._data = self._size = None
        return data, size

    def _read_content (self, response):
        """Read URL contents and store then in self._data.
        This way, the method can be called by other functions than
        read_content()"""
        data = response.read()
        self._size = len(data)
        encoding = headers.get_content_encoding(self.headers)
        if encoding in _supported_encodings:
            try:
                if encoding == 'deflate':
                    f = StringIO(zlib.decompress(data))
                else:
                    f = gzip.GzipFile('', 'rb', 9, StringIO(data))
            except zlib.error, msg:
                self.add_warning(_("Decompress error %(err)s") %
                                 {"err": str(msg)},
                                 tag=WARN_HTTP_DECOMPRESS_ERROR)
                f = StringIO(data)
            try:
                data = f.read()
            finally:
                f.close()
        # store temporary data
        self._data = data

    def encoding_supported (self):
        """Check the content encoding of the response headers. A missing
        encoding, 'identity' and the encodings that can be decompressed
        are supported; for everything else a warning is added.

        @return: True if the content encoding is supported
        @rtype: bool
        """
        content_encoding = headers.get_content_encoding(self.headers)
        if (not content_encoding or
            content_encoding in _supported_encodings or
            content_encoding == 'identity'):
            return True
        self.add_warning(_("Unsupported content encoding `%(encoding)s'.") %
                         {"encoding": content_encoding},
                         tag=WARN_HTTP_UNSUPPORTED_ENCODING)
        return False

    def set_title_from_content (self):
        """Check if it's allowed to read content before execution."""
        if self.method_get_allowed:
            super(HttpUrl, self).set_title_from_content()

    def get_anchors (self):
        """Check if it's allowed to read content before execution."""
        if self.method_get_allowed:
            super(HttpUrl, self).get_anchors()

    def content_allows_robots (self):
        """Check if it's allowed to read content before execution."""
        if not self.method_get_allowed:
            return False
        return super(HttpUrl, self).content_allows_robots()

    def check_warningregex (self):
        """Check if it's allowed to read content before execution."""
        if self.method_get_allowed:
            super(HttpUrl, self).check_warningregex()

    def is_html (self):
        """
        See if this URL points to a HTML file by looking at the
        Content-Type header, file extension and file content.

        @return: True if URL points to HTML file
        @rtype: bool
        """
        if not (self.valid and self.headers):
            return False
        mime = headers.get_content_type(self.headers)
        if self.ContentMimetypes.get(mime) != "html":
            return False
        return self.encoding_supported()

    def is_css (self):
        """Return True iff content of this url is CSS stylesheet."""
        if not (self.valid and self.headers):
            return False
        mime = headers.get_content_type(self.headers)
        if self.ContentMimetypes.get(mime) != "css":
            return False
        return self.encoding_supported()

    def is_http (self):
        """
        Tell that this URL object handles the HTTP protocol.

        @return: always True
        @rtype: bool
        """
        return True

    def is_parseable (self):
        """
        Check if content is parseable for recursion.

        @return: True if content is parseable
        @rtype: bool
        """
        if not (self.valid and self.headers):
            return False
        if headers.get_content_type(self.headers) not in self.ContentMimetypes:
            return False
        return self.encoding_supported()

    def parse_url (self):
        """
        Parse file contents for new links to check. The parser is chosen
        by content type: HTML, CSS, Flash or MS Word. Other content is
        not parsed.
        """
        content_type = headers.get_content_type(self.headers)
        if self.is_html():
            parser = self.parse_html
        elif self.is_css():
            parser = self.parse_css
        elif content_type == "application/x-shockwave-flash":
            parser = self.parse_swf
        elif content_type == "application/msword":
            parser = self.parse_word
        else:
            # no parser for this content type
            return
        parser()

    def get_robots_txt_url (self):
        """
        Get the according robots.txt URL for this URL.

        @return: robots.txt URL
        @rtype: string
        """
        return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])

    def close_connection (self):
        """
        If connection is persistent, add it to the connection pool.
        Else close the connection. Errors on closing are ignored.
        """
        if self.url_connection is None:
            # no connection is open
            return
        # add to cached connections
        _user, _password = self.get_user_password()
        key = ("http", self.urlparts[1], _user, _password)
        if self.persistent and self.url_connection.is_idle():
            self.aggregate.connections.add(
                  key, self.url_connection, self.timeout)
        else:
            try:
                self.url_connection.close()
            except Exception:
                # ignore close errors
                pass
        self.url_connection = None