# -*- coding: iso-8859-1 -*- # Copyright (C) 2000-2010 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ Handle http links. """ import urlparse import urllib import re import errno import zlib import socket from cStringIO import StringIO import Cookie from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil, httplib2 as httplib, LinkCheckerError, configuration) from . import (internpaturl, proxysupport, httpheaders as headers, urlbase, get_url_from) # import warnings from .const import WARN_HTTP_ROBOTS_DENIED, \ WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \ WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \ WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \ WARN_HTTP_AUTH_UNKNOWN # helper alias unicode_safe = strformat.unicode_safe supportHttps = hasattr(httplib, "HTTPSConnection") _supported_encodings = ('gzip', 'x-gzip', 'deflate') # Amazon blocks all HEAD requests _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): """ Url link with http scheme. """ def reset (self): """ Initialize HTTP specific variables. """ super(HttpUrl, self).reset() self.max_redirects = 5 self.has301status = False # flag if check had to fallback from HEAD to GET method self.fallback_get = False # flag if connection is persistent self.persistent = False # URLs seen through 301/302 redirections self.aliases = [] # initialize check data self.headers = None self.auth = None self.cookies = [] # temporary data filled when reading redirections self._data = None # flag indicating connection reuse self.reused_connection = False # flag telling if GET method is allowed; determined by robots.txt self.method_get_allowed = True def allows_robots (self, url): """ Fetch and parse the robots.txt of given url. Checks if LinkChecker can get the requested resource content. HEAD requests however are still allowed. @param url: the url to be requested @type url: string @return: True if access is granted, otherwise False @rtype: bool """ roboturl = self.get_robots_txt_url() user, password = self.get_user_password() rb = self.aggregate.robots_txt callback = self.aggregate.connections.host_wait return rb.allows_url(roboturl, url, self.proxy, user, password, callback=callback) def add_size_info (self): """Get size of URL content from HTTP header.""" if self.headers and "Content-Length" in self.headers and \ "Transfer-Encoding" not in self.headers: # Note that content-encoding causes size differences since # the content data is always decoded. try: self.size = int(self.headers["Content-Length"]) if self.dlsize == -1: self.dlsize = self.size except (ValueError, OverflowError): pass else: self.size = -1 def check_connection (self): """ Check a URL with HTTP protocol. Here is an excerpt from RFC 1945 with common response codes: The first digit of the Status-Code defines the class of response. The last two digits do not have any categorization role. There are 5 values for the first digit: - 1xx: Informational - Not used, but reserved for future use - 2xx: Success - The action was successfully received, understood, and accepted. - 3xx: Redirection - Further action must be taken in order to complete the request - 4xx: Client Error - The request contains bad syntax or cannot be fulfilled - 5xx: Server Error - The server failed to fulfill an apparently valid request """ # set the proxy, so a 407 status after this is an error self.set_proxy(self.aggregate.config["proxy"].get(self.scheme)) # check robots.txt if not self.allows_robots(self.url): # remove all previously stored results self.add_warning( _("Access denied by robots.txt, skipping content checks."), tag=WARN_HTTP_ROBOTS_DENIED) self.method_get_allowed = False # first try with HEAD self.method = "HEAD" # check for amazon server quirk if _is_amazon(self.urlparts[1]): self.add_info(_("Amazon servers block HTTP HEAD requests.")) if self.method_get_allowed: self.add_info(_("Using GET method for Amazon server.")) self.method = "GET" # check the http connection response = self.check_http_connection() if self.headers and "Server" in self.headers: server = self.headers['Server'] else: server = _("unknown") if self.fallback_get: self.add_info(_("Server `%(name)s' did not support HEAD request; " "a GET request was used instead.") % {"name": server}) # redirections might have changed the URL self.url = urlparse.urlunsplit(self.urlparts) # check response if response: self.check_response(response) response.close() def check_http_connection (self): """ Check HTTP connection and return get response and a flag if the check algorithm had to fall back to the GET method. @return: response or None if url is already handled @rtype: HttpResponse or None """ response = None while True: if response is not None: response.close() try: response = self._try_http_response() except httplib.BadStatusLine, msg: # some servers send empty HEAD replies if self.method == "HEAD" and self.method_get_allowed: log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg) self.method = "GET" self.aliases = [] self.fallback_get = True continue raise except socket.error, msg: # some servers reset the connection on HEAD requests if self.method == "HEAD" and self.method_get_allowed and \ msg[0] == errno.ECONNRESET: self.method = "GET" self.aliases = [] self.fallback_get = True continue raise if response.reason: response.reason = unicode_safe(response.reason) log.debug(LOG_CHECK, "Response: %s %s", response.status, response.reason) log.debug(LOG_CHECK, "Headers: %s", self.headers) # proxy enforcement (overrides standard proxy) if response.status == 305 and self.headers: oldproxy = (self.proxy, self.proxyauth) newproxy = self.headers.getheader("Location") self.add_info(_("Enforced proxy `%(name)s'.") % {"name": newproxy}) self.set_proxy(newproxy) if not self.proxy: self.set_result( _("Enforced proxy `%(name)s' ignored, aborting.") % {"name": newproxy}, valid=False) return response response.close() response = self._try_http_response() # restore old proxy settings self.proxy, self.proxyauth = oldproxy try: tries, response = self.follow_redirections(response) except httplib.BadStatusLine, msg: # some servers send empty HEAD replies if self.method == "HEAD" and self.method_get_allowed: log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg) self.method = "GET" self.aliases = [] self.fallback_get = True continue raise if tries == -1: log.debug(LOG_CHECK, "already handled") response.close() self.do_check_content = False return None if tries >= self.max_redirects: if self.method == "HEAD" and self.method_get_allowed: # Microsoft servers tend to recurse HEAD requests self.method = "GET" self.aliases = [] self.fallback_get = True continue self.set_result(_("more than %d redirections, aborting") % self.max_redirects, valid=False) return response # user authentication if response.status == 401: authenticate = self.headers.get('WWW-Authenticate') if not authenticate or not authenticate.startswith("Basic"): # LinkChecker only supports Basic authorization args = {"auth": authenticate} self.add_warning( _("Unsupported HTTP authentication `%(auth)s', " \ "only `Basic' authentication is supported.") % args, tag=WARN_HTTP_AUTH_UNKNOWN) return if not self.auth: import base64 _user, _password = self.get_user_password() self.auth = "Basic " + \ base64.encodestring("%s:%s" % (_user, _password)) log.debug(LOG_CHECK, "Authentication %s/%s", _user, _password) continue elif response.status >= 400: # retry with GET (but do not set fallback flag) if self.method == "HEAD" and self.method_get_allowed: self.method = "GET" self.aliases = [] continue elif self.headers and self.method == "HEAD" and self.method_get_allowed: # test for HEAD support mime = headers.get_content_type(self.headers) poweredby = self.headers.get('X-Powered-By', '') server = self.headers.get('Server', '') if mime in ('application/octet-stream', 'text/plain') and \ (poweredby.startswith('Zope') or server.startswith('Zope')): # Zope server could not get Content-Type with HEAD self.method = "GET" self.aliases = [] self.fallback_get = True continue break return response def follow_redirections (self, response, set_result=True): """ Follow all redirections of http response. """ log.debug(LOG_CHECK, "follow all redirections") redirected = self.url tries = 0 while response.status in [301, 302] and self.headers and \ tries < self.max_redirects: newurl = self.headers.getheader("Location", self.headers.getheader("Uri", "")) # make new url absolute and unicode newurl = unicode_safe(newurl) newurl = urlparse.urljoin(redirected, newurl) log.debug(LOG_CHECK, "Redirected to %r", newurl) self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl}) # norm base url - can raise UnicodeError from url.idna_encode() redirected, is_idn = urlbase.url_norm(newurl) if is_idn: pass # XXX warn about idn use log.debug(LOG_CHECK, "Norm redirected to %r", redirected) urlparts = strformat.url_unicode_split(redirected) if urlparts[1] != self.urlparts[1]: # check extern filter again self.set_extern(redirected) if self.extern[0] and self.extern[1]: if set_result: self.check301status(response) self.add_info( _("The redirected URL is outside of the domain " "filter, checked only syntax.")) self.set_result(u"filtered") return -1, response # check robots.txt allowance again if not self.allows_robots(redirected): if set_result: self.add_warning( _("Access to redirected URL denied by robots.txt, " "checked only syntax."), tag=WARN_HTTP_ROBOTS_DENIED) self.set_result(u"syntax OK") return -1, response # see about recursive redirect all_seen = [self.cache_url_key] + self.aliases if redirected in all_seen: if self.method == "HEAD" and self.method_get_allowed: # Microsoft servers tend to recurse HEAD requests # fall back to the original url and use GET return self.max_redirects, response recursion = all_seen + [redirected] if set_result: self.set_result( _("recursive redirection encountered:\n %(urls)s") % {"urls": "\n => ".join(recursion)}, valid=False) return -1, response if urlparts[0] == self.scheme or urlparts[0] in ('http', 'https'): # remember redirected url as alias self.aliases.append(redirected) else: # in case of changed scheme make new URL object newobj = get_url_from( redirected, self.recursion_level, self.aggregate, parent_url=self.parent_url, base_ref=self.base_ref, line=self.line, column=self.column, name=self.name) if set_result: self.add_warning( _("Redirection to URL `%(newurl)s' with different scheme" " found; the original URL was `%(url)s'.") % {"url": self.url, "newurl": newobj.url}, tag=WARN_HTTP_WRONG_REDIRECT) self.set_result(u"syntax OK") # append new object to queue self.aggregate.urlqueue.put(newobj) # pretend to be finished and logged return -1, response # note: urlparts has to be a list self.urlparts = urlparts if set_result: self.check301status(response) # check cache again on the changed URL if self.aggregate.urlqueue.checked_redirect(redirected, self): return -1, response # new response data response.close() response = self._try_http_response() tries += 1 return tries, response def check301status (self, response): """If response page has been permanently moved add a warning.""" if response.status == 301 and not self.has301status: self.add_warning(_("HTTP 301 (moved permanent) encountered: you" " should update this link."), tag=WARN_HTTP_MOVED_PERMANENT) self.has301status = True def get_alias_cache_data (self): """ Return all data values that should be put in the cache, minus redirection warnings. """ data = self.get_cache_data() data["warnings"] = [ x for x in self.warnings if x[0] != "http-moved-permanent"] data["info"] = self.info return data def check_response (self, response): """Check final result and log it.""" if response.status >= 400: self.set_result(u"%r %s" % (response.status, response.reason), valid=False) else: if response.status == 204: # no content self.add_warning(unicode_safe(response.reason), tag=WARN_HTTP_EMPTY_CONTENT) # store cookies for valid links if self.aggregate.config['storecookies']: for c in self.cookies: self.add_info(_("Sent cookie: %(cookie)s.") % {"cookie": c}) try: out = self.aggregate.cookies.add(self.headers, self.urlparts[0], self.urlparts[1], self.urlparts[2]) except Cookie.CookieError, msg: self.add_warning(_("Could not store cookies: %(msg)s.") % {'msg': str(msg)}, tag=WARN_HTTP_COOKIE_STORE_ERROR) if response.status >= 200: self.set_result(u"%r %s" % (response.status, response.reason)) else: self.set_result(u"OK") modified = self.headers.get('Last-Modified', '') if modified: self.add_info(_("Last modified %(date)s.") % {"date": modified}) def _try_http_response (self): """Try to get a HTTP response object. For reused persistent connections that the server closed unexpected, a new connection will be opened. """ try: return self._get_http_response() except socket.error, msg: if msg.args[0] == 32 and self.reused_connection: # server closed persistent connection - retry log.debug(LOG_CHECK, "Server closed connection: retry") self.persistent = False return self._get_http_response() raise except httplib.BadStatusLine, msg: if not msg and self.reused_connection: # server closed connection - retry log.debug(LOG_CHECK, "Empty status line: retry") self.persistent = False return self._get_http_response() raise def _get_http_response (self): """ Send HTTP request and get response object. """ if self.proxy: host = self.proxy scheme = self.proxytype else: host = self.urlparts[1] scheme = self.urlparts[0] log.debug(LOG_CHECK, "Connecting to %r", host) # close/release a previous connection self.close_connection() self.url_connection = self.get_http_object(host, scheme) # the anchor fragment is not part of a HTTP URL, see # http://tools.ietf.org/html/rfc2616#section-3.2.2 anchor = '' if self.proxy: path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1], self.urlparts[2], self.urlparts[3], anchor)) else: path = urlparse.urlunsplit(('', '', self.urlparts[2], self.urlparts[3], anchor)) self.url_connection.putrequest(self.method, path, skip_host=True, skip_accept_encoding=True) # be sure to use the original host as header even for proxies self.url_connection.putheader("Host", self.urlparts[1]) # userinfo is from http://user@pass:host/ if self.userinfo: self.url_connection.putheader("Authorization", self.userinfo) # auth is the -u and -p configuration options elif self.auth: self.url_connection.putheader("Authorization", self.auth) if self.proxyauth: self.url_connection.putheader("Proxy-Authorization", self.proxyauth) if (self.parent_url and self.parent_url.startswith(('http://', 'https://'))): self.url_connection.putheader("Referer", self.parent_url) self.url_connection.putheader("User-Agent", configuration.UserAgent) self.url_connection.putheader("Accept-Encoding", "gzip;q=1.0, deflate;q=0.9, identity;q=0.5") if self.aggregate.config['sendcookies']: scheme = self.urlparts[0] host = self.urlparts[1] port = urlutil.default_ports.get(scheme, 80) host, port = urllib.splitnport(host, port) path = self.urlparts[2] self.cookies = self.aggregate.cookies.get(scheme, host, port, path) for c in self.cookies: name = c.client_header_name() value = c.client_header_value() self.url_connection.putheader(name, value) self.url_connection.endheaders() response = self.url_connection.getresponse(True) self.timeout = headers.http_timeout(response) self.headers = response.msg self.persistent = not response.will_close if self.persistent and self.method == "HEAD": # Some servers send page content after a HEAD request, # but only after making the *next* request. This breaks # protocol synchronisation. Workaround here is to close # the connection after HEAD. # Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk) self.persistent = False if self.persistent and (self.method == "GET" or self.headers.getheader("Content-Length") != "0"): # always read content from persistent connections self._read_content(response) assert not response.will_close # If possible, use official W3C HTTP response name if response.status in httplib.responses: response.reason = httplib.responses[response.status] return response def get_http_object (self, host, scheme): """ Open a HTTP connection. @param host: the host to connect to @type host: string of the form [:] @param scheme: 'http' or 'https' @type scheme: string @return: open HTTP(S) connection @rtype: httplib.HTTP(S)Connection """ _user, _password = self.get_user_password() key = (scheme, self.urlparts[1], _user, _password) conn = self.aggregate.connections.get(key) if conn is not None: log.debug(LOG_CHECK, "reuse cached HTTP(S) connection %s", conn) self.reused_connection = True return conn self.aggregate.connections.wait_for_host(host) if scheme == "http": h = httplib.HTTPConnection(host) elif scheme == "https" and supportHttps: h = httplib.HTTPSConnection(host) else: msg = _("Unsupported HTTP url scheme `%(scheme)s'") % {"scheme": scheme} raise LinkCheckerError(msg) if log.is_debug(LOG_CHECK): h.set_debuglevel(1) h.connect() return h def read_content (self): """Get content of the URL target. The content data is cached after the first call to this method. @return: URL content, decompressed and decoded @rtype: string """ assert self.method_get_allowed, 'unallowed content read' self.method = "GET" response = self._try_http_response() response = self.follow_redirections(response, set_result=False)[1] self.headers = response.msg # Re-read size info, since the GET request result could be different # than a former HEAD request. self.add_size_info() if self._data is None: self._read_content(response) data, size = self._data, self._size self._data = self._size = None return data, size def _read_content (self, response): """Read URL contents and store then in self._data. This way, the method can be called by other functions than read_content()""" data = response.read() self._size = len(data) encoding = headers.get_content_encoding(self.headers) if encoding in _supported_encodings: try: if encoding == 'deflate': f = StringIO(zlib.decompress(data)) else: f = gzip.GzipFile('', 'rb', 9, StringIO(data)) except zlib.error, msg: self.add_warning(_("Decompress error %(err)s") % {"err": str(msg)}, tag=WARN_HTTP_DECOMPRESS_ERROR) f = StringIO(data) try: data = f.read() finally: f.close() # store temporary data self._data = data def encoding_supported (self): """Check if page encoding is supported.""" encoding = headers.get_content_encoding(self.headers) if encoding and encoding not in _supported_encodings and \ encoding != 'identity': self.add_warning(_("Unsupported content encoding `%(encoding)s'.") % {"encoding": encoding}, tag=WARN_HTTP_UNSUPPORTED_ENCODING) return False return True def set_title_from_content (self): """Check if it's allowed to read content before execution.""" if self.method_get_allowed: super(HttpUrl, self).set_title_from_content() def get_anchors (self): """Check if it's allowed to read content before execution.""" if self.method_get_allowed: super(HttpUrl, self).get_anchors() def content_allows_robots (self): """Check if it's allowed to read content before execution.""" if not self.method_get_allowed: return False return super(HttpUrl, self).content_allows_robots() def check_warningregex (self): """Check if it's allowed to read content before execution.""" if self.method_get_allowed: super(HttpUrl, self).check_warningregex() def is_html (self): """ See if this URL points to a HTML file by looking at the Content-Type header, file extension and file content. @return: True if URL points to HTML file @rtype: bool """ if not (self.valid and self.headers): return False mime = headers.get_content_type(self.headers) if self.ContentMimetypes.get(mime) != "html": return False return self.encoding_supported() def is_css (self): """Return True iff content of this url is CSS stylesheet.""" if not (self.valid and self.headers): return False mime = headers.get_content_type(self.headers) if self.ContentMimetypes.get(mime) != "css": return False return self.encoding_supported() def is_http (self): """ This is a HTTP file. @return: True @rtype: bool """ return True def is_parseable (self): """ Check if content is parseable for recursion. @return: True if content is parseable @rtype: bool """ if not (self.valid and self.headers): return False if headers.get_content_type(self.headers) not in self.ContentMimetypes: return False return self.encoding_supported() def parse_url (self): """ Parse file contents for new links to check. """ ctype = headers.get_content_type(self.headers) if self.is_html(): self.parse_html() elif self.is_css(): self.parse_css() elif ctype == "application/x-shockwave-flash": self.parse_swf() elif ctype == "application/msword": self.parse_word() def get_robots_txt_url (self): """ Get the according robots.txt URL for this URL. @return: robots.txt URL @rtype: string """ return "%s://%s/robots.txt" % tuple(self.urlparts[0:2]) def close_connection (self): """ If connection is persistent, add it to the connection pool. Else close the connection. Errors on closing are ignored. """ if self.url_connection is None: # no connection is open return # add to cached connections _user, _password = self.get_user_password() key = ("http", self.urlparts[1], _user, _password) if self.persistent and self.url_connection.is_idle(): self.aggregate.connections.add( key, self.url_connection, self.timeout) else: try: self.url_connection.close() except Exception: # ignore close errors pass self.url_connection = None