linkchecker/linkcheck/checker/httpurl.py
2012-10-26 18:05:00 +02:00

841 lines
34 KiB
Python

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Handle http links.
"""
import urlparse
import os
import errno
import zlib
import socket
import rfc822
import time
from cStringIO import StringIO
from datetime import datetime
from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
httplib2 as httplib, LinkCheckerError, httputil, configuration)
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
get_url_from, pooledconnection)
# import warnings
from .const import WARN_HTTP_ROBOTS_DENIED, \
WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \
WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
WARN_HTTP_AUTH_UNKNOWN, WARN_HTTP_AUTH_UNAUTHORIZED
# Encoding assumed when decoding raw HTTP header values to unicode.
HEADER_ENCODING = "iso-8859-1"
# URL prefixes regarded as HTTP; a Referer header is only sent for
# parent URLs starting with one of these.
HTTP_SCHEMAS = ('http://', 'https://')
# Shorthand alias for strformat.unicode_safe.
unicode_safe = strformat.unicode_safe
# True if the imported httplib module provides an HTTPSConnection class.
supportHttps = hasattr(httplib, "HTTPSConnection")
# Content encodings that are decompressed when reading URL content.
SUPPORTED_ENCODINGS = ('x-gzip', 'gzip', 'deflate')
# Accept-Encoding header value: comma separated supported encodings.
ACCEPT_ENCODING = ",".join(SUPPORTED_ENCODINGS)
# Accept-Charset header value: prefers UTF-8 over ISO-8859-1 over the rest.
ACCEPT_CHARSET = "utf-8,ISO-8859-1;q=0.7,*;q=0.3"
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledconnection.PooledConnection):
    """
    Url link with http scheme.

    Combines intern pattern handling, proxy support and pooled
    connection handling from its three base classes.
    """
def reset (self):
    """
    Reset the HTTP specific check state in addition to the inherited
    state.
    """
    super(HttpUrl, self).reset()
    # redirection bookkeeping: redirect limit, flag whether a 301 warning
    # has been issued and the URLs seen through 301/302 redirections
    self.max_redirects = 5
    self.has301status = False
    self.aliases = []
    # flag if connection is persistent
    self.persistent = False
    # request data
    self.auth = None
    self.cookies = []
    # response data: headers and the HttpResponse object
    self.headers = None
    self.response = None
    # temporary data filled when reading redirections
    self._data = None
    # flag telling if GET method is allowed; determined by robots.txt
    self.method_get_allowed = True
def allows_robots (self, url):
    """
    Ask the shared robots.txt handler whether LinkChecker may request
    the content of the given url. The robots.txt URL is derived from
    this URL object; HEAD requests are still allowed when access is
    denied.
    @param url: the url to be requested
    @type url: string
    @return: True if access is granted, otherwise False
    @rtype: bool
    """
    robots_url = self.get_robots_txt_url()
    user, password = self.get_user_password()
    aggregate = self.aggregate
    return aggregate.robots_txt.allows_url(
        robots_url, url, self.proxy, user, password,
        callback=aggregate.connections.host_wait)
def add_size_info (self):
    """Get size of URL content from HTTP header.

    Sets self.size to the Content-Length value if that header is present
    and no Transfer-Encoding header is set. In that case self.dlsize is
    also initialized from it when it is still unset (-1).
    If the size is unknown, or the Content-Length value is not a valid
    non-negative integer, self.size is set to -1 so that no stale value
    of a previous request (eg. a former HEAD request) is kept.
    """
    if self.headers and "Content-Length" in self.headers and \
       "Transfer-Encoding" not in self.headers:
        # Note that content-encoding causes size differences since
        # the content data is always decoded.
        try:
            size = int(self.getheader("Content-Length"))
        except (TypeError, ValueError, OverflowError):
            # malformed header value: size is unknown
            size = -1
        if size < 0:
            # negative sizes are invalid; treat them as unknown, too
            size = -1
        self.size = size
        if size != -1 and self.dlsize == -1:
            self.dlsize = size
    else:
        self.size = -1
def check_connection (self):
    """
    Check a URL with HTTP protocol.

    Configures proxy and authentication, consults robots.txt, requests
    the URL starting with the HEAD method and finally evaluates the
    response, if one is left to evaluate.

    As described in RFC 1945 the first digit of the status code
    defines the class of the response; the last two digits do not have
    any categorization role:
      - 1xx: Informational - Not used, but reserved for future use
      - 2xx: Success - The action was successfully received,
        understood, and accepted.
      - 3xx: Redirection - Further action must be taken in order to
        complete the request
      - 4xx: Client Error - The request contains bad syntax or cannot
        be fulfilled
      - 5xx: Server Error - The server failed to fulfill an apparently
        valid request
    """
    # set the proxy, so a 407 status after this is an error
    proxy = self.aggregate.config["proxy"].get(self.scheme)
    self.set_proxy(proxy)
    self.construct_auth()
    # a robots.txt denial only forbids reading content (GET)
    robots_allowed = self.allows_robots(self.url)
    if not robots_allowed:
        self.add_warning(
               _("Access denied by robots.txt, skipping content checks."),
               tag=WARN_HTTP_ROBOTS_DENIED)
        self.method_get_allowed = False
    # first try with HEAD
    self.method = "HEAD"
    self.check_http_connection()
    # redirections might have changed the URL
    self.url = urlutil.urlunsplit(self.urlparts)
    if self.response is None:
        # nothing left to evaluate
        return
    self.check_response()
    self.close_response()
def check_http_connection (self):
    """
    Request the URL until a final response is available.

    The request is repeated with the GET method if a HEAD request fails
    in one of the handled ways (bad status line, connection reset,
    too many redirections, or a status for which do_fallback() asks
    for GET) and GET is allowed. Proxy enforcement (status 305),
    redirections and Basic authentication (status 401) are handled
    as well.
    When a redirection is already handled, the redirection limit is
    exceeded or an enforced proxy is ignored, the response is closed
    before returning.
    @return: None
    """
    def may_retry_with_get():
        """Tell if a failed HEAD request may be repeated with GET."""
        return self.method == "HEAD" and self.method_get_allowed

    while True:
        # XXX refactor this
        self.close_response()
        try:
            self._try_http_response()
        except httplib.BadStatusLine as msg:
            # some servers send empty HEAD replies
            if may_retry_with_get():
                log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
                self.fallback_to_get()
                continue
            raise
        except socket.error as msg:
            # some servers reset the connection on HEAD requests
            if may_retry_with_get() and msg.args[0] == errno.ECONNRESET:
                self.fallback_to_get()
                continue
            raise
        uheaders = unicode_safe(self.headers, encoding=HEADER_ENCODING)
        log.debug(LOG_CHECK, "Headers: %s", uheaders)
        # proxy enforcement (overrides standard proxy)
        if self.response.status == 305 and self.headers:
            saved_proxy = (self.proxy, self.proxyauth)
            enforced = self.getheader("Location")
            self.add_info(_("Enforced proxy `%(name)s'.") %
                          {"name": enforced})
            self.set_proxy(enforced)
            self.close_response()
            if not self.proxy:
                self.set_result(
                     _("Enforced proxy `%(name)s' ignored, aborting.") %
                     {"name": enforced},
                     valid=False)
                return
            self._try_http_response()
            # restore old proxy settings
            self.proxy, self.proxyauth = saved_proxy
        try:
            tries = self.follow_redirections()
        except httplib.BadStatusLine as msg:
            # some servers send empty HEAD replies
            if may_retry_with_get():
                log.debug(LOG_CHECK, "Bad status line %r: falling back to GET", msg)
                self.fallback_to_get()
                continue
            raise
        if tries == -1:
            log.debug(LOG_CHECK, "already handled")
            self.close_response()
            self.do_check_content = False
            return
        if tries >= self.max_redirects:
            if may_retry_with_get():
                # Microsoft servers tend to recurse HEAD requests
                self.fallback_to_get()
                continue
            self.set_result(_("more than %d redirections, aborting") %
                            self.max_redirects, valid=False)
            self.close_response()
            self.do_check_content = False
            return
        if self.do_fallback(self.response.status):
            self.fallback_to_get()
            continue
        if self.response.status != 401:
            # final response reached
            break
        # user authentication
        authenticate = self.getheader('WWW-Authenticate')
        if authenticate is None:
            # Either the server intentionally blocked this request,
            # or there is a form on this page which requires
            # manual user/password input.
            # Either way, this is a warning.
            self.add_warning(_("Unauthorized access without HTTP authentication."),
                 tag=WARN_HTTP_AUTH_UNAUTHORIZED)
            return
        if not authenticate.startswith("Basic"):
            # LinkChecker only supports Basic authorization
            args = {"auth": authenticate}
            self.add_warning(
               _("Unsupported HTTP authentication `%(auth)s', " \
                 "only `Basic' authentication is supported.") % args,
               tag=WARN_HTTP_AUTH_UNKNOWN)
            return
        if not self.auth:
            self.construct_auth()
            if self.auth:
                # repeat the request with the new credentials
                continue
        break
def do_fallback(self, status):
    """Decide from the response of a HEAD request if the check has to
    be repeated with the GET method.
    @param status: The HTTP response status
    @ptype status: int
    @return: True if checker should use GET, else False
    @rtype: bool
    """
    if self.method != "HEAD":
        # fallback is only possible for HEAD requests
        return False
    # Some sites do not support HEAD requests, for example
    # youtube sends a 404 with HEAD, 200 with GET. Doh.
    # A 405 "Method not allowed" status should also use GET.
    if status >= 400:
        log.debug(LOG_CHECK, "Method HEAD error %d, falling back to GET", status)
        return True
    # Other sites send 200 with HEAD, but 404 with GET. Bummer.
    powered_by = self.getheader('X-Powered-By', u'')
    server_name = self.getheader('Server', u'')
    # Zope or IIS server could not get Content-Type with HEAD
    # http://intermapper.com.dev4.silvertech.net/bogus.aspx
    is_zope = powered_by.startswith('Zope') or server_name.startswith('Zope')
    is_aspnet_on_iis = ('ASP.NET' in powered_by and
                        'Microsoft-IIS' in server_name)
    return is_zope or is_aspnet_on_iis
def fallback_to_get(self):
    """Switch the request method to GET and forget the URL aliases
    collected so far."""
    self.method, self.aliases = "GET", []
def construct_auth (self):
    """Build the HTTP Basic authentication value from the available
    user/password information and store it in self.auth. Nothing is
    done if credentials have already been constructed, or if user or
    password are missing."""
    if self.auth:
        return
    user, password = self.get_user_password()
    if user is None or password is None:
        return
    encoded = httputil.encode_base64("%s:%s" % (user, password))
    self.auth = "Basic " + encoded
    log.debug(LOG_CHECK, "Using basic authentication")
def get_content_type (self):
    """Return content MIME type or empty string. The value is computed
    from the response headers on first use and stored in
    self.content_type."""
    if self.content_type is not None:
        return self.content_type
    if self.headers:
        self.content_type = headers.get_content_type(self.headers)
    else:
        # no headers, no content type
        self.content_type = u""
    return self.content_type
def follow_redirections (self, set_result=True):
    """Follow 301/302 redirections of the current http response until
    a different response is reached or max_redirects is hit.
    @param set_result: passed on to follow_redirection()
    @return: the summed up return values of follow_redirection(), or
      -1 as soon as one of those calls returned -1
    """
    log.debug(LOG_CHECK, "follow all redirections")
    current_url = self.url
    count = 0
    while True:
        if self.response.status not in [301, 302]:
            break
        if not self.headers:
            break
        if count >= self.max_redirects:
            break
        step = self.follow_redirection(set_result, current_url)
        if step == -1:
            return step
        current_url = urlutil.urlunsplit(self.urlparts)
        count += step
    return count
def follow_redirection (self, set_result, redirected):
    """Follow one redirection of http response.
    @param set_result: passed on to the check_redirection_* methods
    @param redirected: URL the redirection target is resolved against
    @return: 1 if a response for the redirected URL was requested,
      -1 if one of the redirection checks failed or the redirected
      URL is already handled by the URL queue, else the non-zero
      result of check_redirection_recursion()
    """
    location = self.getheader("Location", self.getheader("Uri", u""))
    # make new url absolute and unicode
    newurl = urlparse.urljoin(redirected, unicode_safe(location))
    log.debug(LOG_CHECK, "Redirected to %r", newurl)
    self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
    # norm base url - can raise UnicodeError from url.idna_encode()
    redirected, is_idn = urlbase.url_norm(newurl)
    log.debug(LOG_CHECK, "Norm redirected to %r", redirected)
    urlparts = strformat.url_unicode_split(redirected)
    # the checks run in this order and stop at the first failure
    allowed = (
        self.check_redirection_scheme(redirected, urlparts, set_result) and
        self.check_redirection_newscheme(redirected, urlparts, set_result) and
        self.check_redirection_domain(redirected, urlparts, set_result) and
        self.check_redirection_robots(redirected, set_result))
    if not allowed:
        return -1
    recursion = self.check_redirection_recursion(redirected, set_result)
    if recursion != 0:
        return recursion
    if set_result:
        self.check301status()
    self.close_response()
    self.close_connection()
    # remember redirected url as alias
    self.aliases.append(redirected)
    if self.anchor:
        urlparts[4] = self.anchor
    # note: urlparts has to be a list
    self.urlparts = urlparts
    self.build_url_parts()
    # check cache again on the changed URL
    if self.aggregate.urlqueue.checked_redirect(redirected, self):
        return -1
    # store cookies from redirect response
    self.store_cookies()
    # new response data
    self._try_http_response()
    return 1
def check_redirection_scheme (self, redirected, urlparts, set_result):
    """Return True if redirection scheme is ok, else False. If
    set_result is true a denied redirection is stored as warning and
    result."""
    # For security reasons do not allow redirects to protocols
    # other than HTTP, HTTPS or FTP.
    scheme_allowed = urlparts[0] in ('ftp', 'http', 'https')
    if not scheme_allowed and set_result:
        self.add_warning(
          _("Redirection to url `%(newurl)s' is not allowed.") %
          {'newurl': redirected})
        self.set_result(_("syntax OK"))
    return scheme_allowed
def check_redirection_domain (self, redirected, urlparts, set_result):
    """Return True if redirection domain is ok, else False. If
    set_result is true a filtered redirection is stored as info and
    result."""
    # XXX does not support user:pass@netloc format
    domain_changed = urlparts[1] != self.urlparts[1]
    if domain_changed and self.recursion_level == 0 and \
       urlparts[0] in ('http', 'https'):
        # Add intern patterns for redirection of URLs given by the
        # user for HTTP schemes.
        self.add_intern_pattern(url=redirected)
        return True
    # check extern filter again
    self.extern = None
    self.set_extern(redirected)
    if not (self.extern[0] and self.extern[1]):
        return True
    if set_result:
        self.check301status()
        self.add_info(_("The redirected URL is outside of the domain "
                      "filter, checked only syntax."))
        self.set_result(_("filtered"))
    return False
def check_redirection_robots (self, redirected, set_result):
    """Check robots.txt allowance for redirections. Return True if
    allowed, else False. If set_result is true a denied redirection
    is stored as warning and result."""
    allowed = self.allows_robots(redirected)
    if not allowed:
        if set_result:
            self.add_warning(
              _("Access to redirected URL denied by robots.txt, "
                "checked only syntax."), tag=WARN_HTTP_ROBOTS_DENIED)
            self.set_result(_("syntax OK"))
        return False
    return True
def check_redirection_recursion (self, redirected, set_result):
    """Check if the redirected URL equals the cache key of this URL or
    one of its aliases. Return zero if no recursion detected,
    max_redirects for recursion with HEAD request (when GET is
    allowed), -1 otherwise. If set_result is true a recursion is
    stored as invalid result."""
    seen = [self.cache_url_key]
    seen.extend(self.aliases)
    if redirected not in seen:
        return 0
    if self.method == "HEAD" and self.method_get_allowed:
        # Microsoft servers tend to recurse HEAD requests
        # fall back to the original url and use GET
        return self.max_redirects
    if set_result:
        chain = "\n  => ".join(seen + [redirected])
        self.set_result(_("recursive redirection encountered:\n %(urls)s") %
                        {"urls": chain}, valid=False)
    return -1
def check_redirection_newscheme (self, redirected, urlparts, set_result):
    """Check for HTTP(S)/FTP redirection. Return True for
    redirection with same scheme, else False.
    For a changed scheme with set_result true, a new URL object for
    the redirected URL is put into the URL queue and a warning plus
    result is stored. With set_result false a LinkCheckerError is
    raised instead."""
    if urlparts[0] == self.urlparts[0]:
        # same scheme
        return True
    # changed scheme
    newobj = get_url_from(
          redirected, self.recursion_level, self.aggregate,
          parent_url=self.parent_url, base_ref=self.base_ref,
          line=self.line, column=self.column, name=self.name)
    msg = _("Redirection to URL `%(newurl)s' with different scheme"
            " found; the original URL was `%(url)s'.") % \
            {"url": self.url, "newurl": newobj.url}
    if not set_result:
        raise LinkCheckerError(msg)
    self.add_warning(msg, tag=WARN_HTTP_WRONG_REDIRECT)
    self.set_result(_("syntax OK"))
    # append new object to queue
    self.aggregate.urlqueue.put(newobj)
    return False
def check301status (self):
    """Add a warning if the response page has been permanently moved
    (status 301). The warning is added only once per URL."""
    if self.response.status != 301 or self.has301status:
        return
    self.add_warning(_("HTTP 301 (moved permanent) encountered: you"
                       " should update this link."),
                     tag=WARN_HTTP_MOVED_PERMANENT)
    self.has301status = True
def getheader (self, name, default=None):
    """Get decoded header value.
    The default is also returned if no response headers are available
    yet (self.headers is None), so this method can safely be used
    before a response has been read.
    @param name: the header name
    @param default: value returned for a missing header
    @return: decoded header value or default if not found
    @rtype: unicode or type of default
    """
    if self.headers is None:
        # no response has been read yet
        return default
    value = self.headers.get(name)
    if value is None:
        return default
    return unicode_safe(value, encoding=HEADER_ENCODING)
def get_alias_cache_data (self):
    """
    Return the cache data of this URL for use with its aliases: the
    warnings are stored without the redirection (moved permanent)
    warnings and the info entries are stored as they are.
    """
    cache_data = self.get_cache_data()
    kept_warnings = [warning for warning in self.warnings
                     if warning[0] != WARN_HTTP_MOVED_PERMANENT]
    cache_data["warnings"] = kept_warnings
    cache_data["info"] = self.info
    return cache_data
def check_response (self):
    """Check final result and log it.

    Responses with status >= 400 are stored as invalid result. For all
    other responses the cookies are stored and a valid result is set;
    an additional warning is added for status 204 (no content).
    If the response has a parseable Last-Modified header, its value is
    stored as naive UTC datetime object in self.modified.
    """
    status = self.response.status
    if status >= 400:
        self.set_result(u"%r %s" % (status, self.response.reason),
                        valid=False)
    else:
        if status == 204:
            # no content
            self.add_warning(self.response.reason,
                             tag=WARN_HTTP_EMPTY_CONTENT)
        # store cookies for valid links
        self.store_cookies()
        if status >= 200:
            self.set_result(u"%r %s" % (status, self.response.reason))
        else:
            self.set_result(_("OK"))
    modified = rfc822.parsedate(self.getheader('Last-Modified', u''))
    if modified:
        # HTTP dates are always given in GMT (RFC 2616, section 3.3.1),
        # so the parsed fields already are UTC. Do not convert them with
        # time.mktime(), which would interpret them as local time and
        # shift the result by the local UTC offset.
        try:
            self.modified = datetime(*modified[:6])
        except ValueError:
            # date fields out of range (eg. month 13); ignore the header
            log.debug(LOG_CHECK, "Invalid Last-Modified date %r", modified)
def _try_http_response (self):
"""Try to get a HTTP response object. For persistent
connections that the server closed unexpected, a new connection
will be opened.
"""
try:
self._get_http_response()
except socket.error, msg:
if msg.args[0] == 32 and self.persistent:
# server closed persistent connection - retry
log.debug(LOG_CHECK, "Server closed connection: retry")
self.persistent = False
self._get_http_response()
else:
raise
except httplib.BadStatusLine, msg:
if self.persistent:
# server closed connection - retry
log.debug(LOG_CHECK, "Empty status line: retry")
self.persistent = False
self._get_http_response()
else:
raise
def _get_http_response (self):
    """Send HTTP request and get response object. The response, its
    headers and the persistence flag of the connection are stored
    in this URL object."""
    scheme, host, port = self.get_netloc()
    log.debug(LOG_CHECK, "Connecting to %r", host)
    self.get_http_object(scheme, host, port)
    self.add_connection_request()
    self.add_connection_headers()
    response = self.url_connection.getresponse(buffering=True)
    self.response = response
    self.headers = response.msg
    self.content_type = None
    # Some servers send page content after a HEAD request,
    # but only after making the *next* request. This breaks
    # protocol synchronisation. Workaround here is to close
    # the connection after HEAD, ie. never treat it as persistent.
    # Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk)
    # Note that for POST method the connection should also be closed,
    # but this method is never used.
    self.persistent = not response.will_close and self.method != "HEAD"
    # If possible, use official W3C HTTP response name
    if response.status in httplib.responses:
        response.reason = httplib.responses[response.status]
    if response.reason:
        response.reason = unicode_safe(response.reason)
    log.debug(LOG_CHECK, "Response: %s %s", response.status, response.reason)
def add_connection_request(self):
    """Add connection request. The full URL is requested when a proxy
    is used, else only path and query."""
    # the anchor fragment is not part of a HTTP URL, see
    # http://tools.ietf.org/html/rfc2616#section-3.2.2
    if self.proxy:
        prefix = (self.urlparts[0], self.urlparts[1])
    else:
        prefix = ('', '')
    request_parts = prefix + (self.urlparts[2], self.urlparts[3], '')
    path = urlutil.urlunsplit(request_parts)
    self.url_connection.putrequest(self.method, path, skip_host=True,
                                   skip_accept_encoding=True)
def add_connection_headers(self):
    """Add the request headers to the connection and finish the
    header section."""
    putheader = self.url_connection.putheader
    # be sure to use the original host as header even for proxies
    putheader("Host", self.urlparts[1])
    if self.auth:
        # HTTP authorization
        putheader("Authorization", self.auth)
    if self.proxyauth:
        putheader("Proxy-Authorization", self.proxyauth)
    # only HTTP parent URLs are sent as referrer
    if (self.parent_url and
        self.parent_url.lower().startswith(HTTP_SCHEMAS)):
        putheader("Referer", self.parent_url)
    putheader("User-Agent", self.aggregate.config["useragent"])
    # prefer compressed content
    putheader("Accept-Encoding", ACCEPT_ENCODING)
    # prefer UTF-8 encoding
    putheader("Accept-Charset", ACCEPT_CHARSET)
    # send do-not-track header
    putheader("DNT", "1")
    if self.aggregate.config['sendcookies']:
        self.send_cookies()
    self.url_connection.endheaders()
def store_cookies (self):
    """Save cookies from response headers, if cookie storing is
    configured. Sent cookies are added as info and storage errors
    as warning."""
    if not self.aggregate.config['storecookies']:
        return
    for cookie in self.cookies:
        self.add_info(_("Sent Cookie: %(cookie)s.") %
                      {"cookie": cookie.client_header_value()})
    scheme, host, path = self.urlparts[0], self.urlparts[1], self.urlparts[2]
    errors = self.aggregate.cookies.add(self.headers, scheme, host, path)
    if errors:
        self.add_warning(
          _("Could not store cookies from headers: %(error)s.") %
           {'error': "\n".join(errors)},
           tag=WARN_HTTP_COOKIE_STORE_ERROR)
def send_cookies (self):
    """Add cookie headers to request. The cookies matching this URL
    are stored in self.cookies."""
    scheme, netloc, path = self.urlparts[0], self.urlparts[1], self.urlparts[2]
    default_port = urlutil.default_ports.get(scheme, 80)
    host, port = urlutil.splitport(netloc, default_port)
    self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
    if not self.cookies:
        return
    # add one cookie header with all cookie data
    # this is limited by maximum header length
    headername = "Cookie"
    max_value_len = headers.MAX_HEADER_BYTES - len(headername) - 2
    putheader = self.url_connection.putheader
    collected = ""
    for cookie in self.cookies:
        value = cookie.client_header_value()
        if "version" in cookie.attributes:
            # add separate header for explicit versioned cookie;
            # send the data collected so far first
            if collected:
                putheader(headername, collected)
            putheader(headername, value)
            collected = ""
            continue
        if collected:
            value = "; " + value
        if (len(collected) + len(value)) < max_value_len:
            collected += value
        else:
            log.debug(LOG_CHECK, "Discard too-long cookie %r", value)
    if collected:
        log.debug(LOG_CHECK, "Sending cookie header %s:%s", headername, collected)
        putheader(headername, collected)
def get_http_object (self, scheme, host, port):
    """
    Open a HTTP connection. A previously used connection is released
    first; the new connection is taken from the connection pool, which
    uses a factory function to create connections.
    @param host: the host to connect to
    @ptype host: string of the form <host>[:<port>]
    @param scheme: 'http' or 'https'
    @ptype scheme: string
    @return: None
    """
    self.close_connection()

    def create_connection(scheme, host, port):
        """Create a new http or https connection."""
        options = {
            "port": port,
            "strict": True,
            "timeout": self.aggregate.config["timeout"],
        }
        if scheme == "http":
            connection = httplib.HTTPConnection(host, **options)
        elif scheme == "https" and supportHttps:
            # certificate file shipped in the config directory
            config_dir = os.path.join(configuration.configdata.install_data, "config")
            options["ca_certs"] = configuration.get_share_file(config_dir, 'ca-certificates.crt')
            connection = httplib.HTTPSConnection(host, **options)
        else:
            msg = _("Unsupported HTTP url scheme `%(scheme)s'") % {"scheme": scheme}
            raise LinkCheckerError(msg)
        if log.is_debug(LOG_CHECK):
            connection.set_debuglevel(1)
        return connection

    self.get_pooled_connection(scheme, host, port, create_connection)
    self.url_connection.connect()
def read_content (self):
    """Get content of the URL target. If there is no response of a GET
    request yet, the URL is requested with GET and redirections are
    followed first.
    @return: tuple of URL content (decompressed for supported content
      encodings) and the number of downloaded bytes, as returned by
      _read_content()
    @rtype: tuple (string, int)
    @raise: LinkCheckerError on redirection errors or too large content
    """
    assert self.method_get_allowed, 'unallowed content read'
    needs_get_request = self.method != "GET" or self.response is None
    if needs_get_request:
        self.method = "GET"
        self._try_http_response()
        redirects = self.follow_redirections(set_result=False)
        if not (0 <= redirects <= self.max_redirects):
            raise LinkCheckerError(_("Redirection error"))
        # Re-read size info, since the GET request result could be different
        # than a former HEAD request.
        self.add_size_info()
    if self.size > self.MaxFilesizeBytes:
        raise LinkCheckerError(_("File size too large"))
    self.charset = headers.get_charset(self.headers)
    return self._read_content()
def _read_content (self):
    """Read URL contents from the current response and decompress
    them for supported content encodings.
    @return: tuple of (decompressed) content and number of read bytes
    @raise: LinkCheckerError if more than MaxFilesizeBytes are read
    """
    raw = self.response.read(self.MaxFilesizeBytes+1)
    if len(raw) > self.MaxFilesizeBytes:
        raise LinkCheckerError(_("File size too large"))
    dlsize = len(raw)
    urls = self.aggregate.add_download_data(self.cache_content_key, raw)
    self.warn_duplicate_content(urls)
    encoding = headers.get_content_encoding(self.headers)
    if encoding not in SUPPORTED_ENCODINGS:
        # content is used as it is
        return raw, dlsize
    try:
        if encoding == 'deflate':
            stream = StringIO(zlib.decompress(raw))
        else:
            stream = gzip.GzipFile('', 'rb', 9, StringIO(raw))
    except zlib.error as msg:
        log.debug(LOG_CHECK, "Error %s data of len %d", encoding, len(raw))
        self.add_warning(_("Decompress error %(err)s") %
                         {"err": str(msg)},
                         tag=WARN_HTTP_DECOMPRESS_ERROR)
        # fall back to the data as it was sent
        stream = StringIO(raw)
    try:
        content = stream.read()
    finally:
        stream.close()
    return content, dlsize
def encoding_supported (self):
    """Check if the content encoding of the page is supported. A
    warning is added for unsupported encodings.
    @return: True if the content has no encoding, the identity
      encoding or one of the supported encodings, else False
    @rtype: bool
    """
    encoding = headers.get_content_encoding(self.headers)
    if not encoding or encoding in SUPPORTED_ENCODINGS or \
       encoding == 'identity':
        return True
    self.add_warning(_("Unsupported content encoding `%(encoding)s'.") %
                     {"encoding": encoding},
                     tag=WARN_HTTP_UNSUPPORTED_ENCODING)
    return False
def can_get_content(self):
    """Tell if reading the URL content is allowed, which is the case
    as long as the GET method is allowed."""
    return self.method_get_allowed
def content_allows_robots (self):
    """Return False if reading content is not allowed; otherwise
    return the result of the inherited content check."""
    if self.method_get_allowed:
        return super(HttpUrl, self).content_allows_robots()
    return False
def check_warningregex (self):
    """Run the inherited warning regex check, but only if reading
    the content is allowed."""
    if not self.method_get_allowed:
        return
    super(HttpUrl, self).check_warningregex()
def is_html (self):
    """
    See if this URL points to a HTML file by looking at the content
    type of a valid URL. If response headers are available, the
    content encoding must be supported, too.
    @return: True if URL points to HTML file
    @rtype: bool
    """
    if self.valid and \
       self.ContentMimetypes.get(self.get_content_type()) == "html":
        if self.headers:
            return self.encoding_supported()
        return True
    return False
def is_css (self):
    """Return True iff content of this url is CSS stylesheet, judged
    by the content type of a valid URL. If response headers are
    available, the content encoding must be supported, too."""
    if self.valid and \
       self.ContentMimetypes.get(self.get_content_type()) == "css":
        if self.headers:
            return self.encoding_supported()
        return True
    return False
def is_http (self):
    """
    Tell that this URL is a HTTP URL.
    @return: always True
    @rtype: bool
    """
    return True
def is_parseable (self):
    """
    Check if content is parseable for recursion. This requires a
    valid URL with response headers, a known content type and a
    supported content encoding.
    @return: True if content is parseable
    @rtype: bool
    """
    if not self.valid or not self.headers:
        return False
    ctype = self.get_content_type()
    if ctype in self.ContentMimetypes:
        return self.encoding_supported()
    log.debug(LOG_CHECK, "URL with content type %r is not parseable", ctype)
    return False
def parse_url (self):
    """
    Parse file contents for new links to check. The parser is chosen
    by the HTML and CSS checks first, then by the content type.
    """
    ctype = self.get_content_type()
    if self.is_html():
        self.parse_html()
    elif self.is_css():
        self.parse_css()
    else:
        # parser method names for the remaining content types
        parser_names = {
            "application/x-shockwave-flash": "parse_swf",
            "application/msword": "parse_word",
            "text/vnd.wap.wml": "parse_wml",
        }
        parser_name = parser_names.get(ctype)
        if parser_name is not None:
            getattr(self, parser_name)()
    self.add_num_url_info()
def get_robots_txt_url (self):
    """
    Get the according robots.txt URL for this URL, built from the
    first two URL parts (scheme and host).
    @return: robots.txt URL
    @rtype: string
    """
    scheme_and_host = tuple(self.urlparts[:2])
    return "%s://%s/robots.txt" % scheme_and_host
def close_response(self):
    """Close the HTTP response object, if there is one, and forget
    it."""
    response = self.response
    if response is not None:
        response.close()
        self.response = None
def close_connection (self):
    """Release the connection from the connection pool. Persistent
    connections will not be closed; they are released with an
    expiration time calculated from the keepalive information of the
    response headers.
    """
    log.debug(LOG_CHECK, "Closing %s", self.url_connection)
    connection = self.url_connection
    if connection is None:
        # no connection is open
        return
    # add to cached connections
    scheme, host, port = self.get_netloc()
    expiration = None
    if self.persistent and connection.is_idle():
        expiration = time.time() + headers.http_keepalive(self.headers)
    else:
        self.close_response()
    self.aggregate.connections.release(scheme, host, port, connection, expiration=expiration)
    self.url_connection = None