# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2009 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Handle http links.
"""

import urlparse
import urllib
import time
import re
import zlib
import socket
from cStringIO import StringIO
import Cookie

from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
    httplib2 as httplib, LinkCheckerError, configuration)
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
    get_url_from)
# import warnings
from .const import WARN_HTTP_ROBOTS_DENIED, WARN_HTTP_NO_ANCHOR_SUPPORT, \
    WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \
    WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
    WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
    PARSE_MIMETYPES, HTML_MIMETYPES

# helper alias
unicode_safe = strformat.unicode_safe

supportHttps = hasattr(httplib, "HTTPSConnection") and \
               hasattr(socket, "ssl")

_supported_encodings = ('gzip', 'x-gzip', 'deflate')

# Amazon blocks all HEAD requests
_is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search

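# A note on the alias above: binding the compiled pattern's .search method
# yields a small predicate that returns a match object (truthy) or None.
# Hypothetical example:
#
#     >>> bool(_is_amazon("www.amazon.de"))
#     True
#     >>> bool(_is_amazon("www.example.com"))
#     False
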
# Stolen from Python CVS urllib2.py
# Mapping status codes to official W3C names
httpresponses = {
    100: 'Continue',
    101: 'Switching Protocols',

    200: 'OK',
    201: 'Created',
    202: 'Accepted',
    203: 'Non-Authoritative Information',
    204: 'No Content',
    205: 'Reset Content',
    206: 'Partial Content',

    300: 'Multiple Choices',
    301: 'Moved Permanently',
    302: 'Found',
    303: 'See Other',
    304: 'Not Modified',
    305: 'Use Proxy',
    306: '(Unused)',
    307: 'Temporary Redirect',

    400: 'Bad Request',
    401: 'Unauthorized',
    402: 'Payment Required',
    403: 'Forbidden',
    404: 'Not Found',
    405: 'Method Not Allowed',
    406: 'Not Acceptable',
    407: 'Proxy Authentication Required',
    408: 'Request Timeout',
    409: 'Conflict',
    410: 'Gone',
    411: 'Length Required',
    412: 'Precondition Failed',
    413: 'Request Entity Too Large',
    414: 'Request-URI Too Long',
    415: 'Unsupported Media Type',
    416: 'Requested Range Not Satisfiable',
    417: 'Expectation Failed',

    500: 'Internal Server Error',
    501: 'Not Implemented',
    502: 'Bad Gateway',
    503: 'Service Unavailable',
    504: 'Gateway Timeout',
    505: 'HTTP Version Not Supported',
}

class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
    """
    Url link with http scheme.
    """

    def reset (self):
        """
        Initialize HTTP specific variables.
        """
        super(HttpUrl, self).reset()
        self.max_redirects = 5
        self.has301status = False
        # some servers do not support anchors in requests
        # this flag tells us to remove the anchor in the request url
        self.no_anchor = False
        # flag if check had to fall back from HEAD to GET method
        self.fallback_get = False
        # flag if connection is persistent
        self.persistent = False
        # URLs seen through 301/302 redirections
        self.aliases = []
        # initialize check data
        self.headers = None
        self.auth = None
        self.cookies = []

    def allows_robots (self, url):
        """
        Fetch and parse the robots.txt of the given url and check
        whether LinkChecker may access the requested resource.

        @param url: the url to be requested
        @type url: string
        @return: True if access is granted, otherwise False
        @rtype: bool
        """
        roboturl = self.get_robots_txt_url()
        user, password = self.get_user_password()
        rb = self.aggregate.robots_txt
        callback = self.aggregate.connections.host_wait
        return rb.allows_url(roboturl, url, self.proxy, user, password,
                             callback=callback)

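    # For orientation: the robots.txt handling above mirrors what the
    # stdlib robotparser module does, minus caching, authentication and
    # proxy support. A minimal sketch of the same core check (the URLs
    # are hypothetical):
    #
    #     import robotparser
    #     rp = robotparser.RobotFileParser()
    #     rp.set_url("http://example.com/robots.txt")
    #     rp.read()
    #     allowed = rp.can_fetch("LinkChecker", "http://example.com/page")
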
    def check_connection (self):
        """
        Check a URL with HTTP protocol.
        Here is an excerpt from RFC 1945 with common response codes:
        The first digit of the Status-Code defines the class of response. The
        last two digits do not have any categorization role. There are 5
        values for the first digit:
          - 1xx: Informational - Not used, but reserved for future use
          - 2xx: Success - The action was successfully received,
            understood, and accepted.
          - 3xx: Redirection - Further action must be taken in order to
            complete the request
          - 4xx: Client Error - The request contains bad syntax or cannot
            be fulfilled
          - 5xx: Server Error - The server failed to fulfill an apparently
            valid request
        """
        # set the proxy, so a 407 status after this is an error
        self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
        # check robots.txt
        if not self.allows_robots(self.url):
            # remove all previously stored results
            self.add_warning(
                _("Access denied by robots.txt, checked only syntax."),
                tag=WARN_HTTP_ROBOTS_DENIED)
            self.set_result(u"syntax OK")
            return
        # check for amazon server quirk
        if _is_amazon(self.urlparts[1]):
            self.add_info(_("Amazon servers block HTTP HEAD requests, "
                            "using GET instead."))
            self.method = "GET"
        else:
            # first try with HEAD
            self.method = "HEAD"
        # check the http connection
        response = self.check_http_connection()
        if self.headers and "Server" in self.headers:
            server = self.headers['Server']
        else:
            server = _("unknown")
        if self.fallback_get:
            self.add_info(_("Server `%(name)s' did not support HEAD request; "
                            "a GET request was used instead.") %
                          {"name": server})
        if self.no_anchor:
            self.add_warning(_("Server `%(name)s' had no anchor support; the"
                               " anchor was removed from the request.") %
                             {"name": server},
                             tag=WARN_HTTP_NO_ANCHOR_SUPPORT)
        # redirections might have changed the URL
        newurl = urlparse.urlunsplit(self.urlparts)
        if self.url != newurl:
            if self.warn_redirect:
                log.warn(LOG_CHECK, _("""URL `%(url)s' has been redirected.
Use URL `%(newurl)s' instead for checking.""") % {
                    'url': self.url, 'newurl': newurl})
            self.url = newurl
        # check response
        if response:
            self.check_response(response)
            response.close()

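    # The status class named in the docstring above is just the leading
    # digit, so integer division classifies any response. Hypothetical
    # example:
    #
    #     >>> 404 // 100
    #     4
    #     >>> httpresponses[404]
    #     'Not Found'
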
    def check_http_connection (self):
        """
        Check the HTTP connection and return the response object.
        If the check had to fall back from the HEAD to the GET method,
        self.fallback_get is set.

        @return: response, or None if the url has already been handled
        @rtype: HttpResponse or None
        """
        response = None
        while True:
            if response is not None:
                response.close()
            try:
                response = self._get_http_response()
            except httplib.BadStatusLine:
                # some servers send empty HEAD replies
                if self.method == "HEAD":
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                raise
            if response.reason:
                response.reason = unicode_safe(response.reason)
            log.debug(LOG_CHECK,
                "Response: %s %s", response.status, response.reason)
            log.debug(LOG_CHECK, "Headers: %s", self.headers)
            # proxy enforcement (overrides standard proxy)
            if response.status == 305 and self.headers:
                oldproxy = (self.proxy, self.proxyauth)
                newproxy = self.headers.getheader("Location")
                self.add_info(_("Enforced proxy `%(name)s'.") %
                              {"name": newproxy})
                self.set_proxy(newproxy)
                if not self.proxy:
                    self.set_result(
                        _("Enforced proxy `%(name)s' ignored, aborting.") %
                        {"name": newproxy},
                        valid=False)
                    return response
                response.close()
                response = self._get_http_response()
                # restore old proxy settings
                self.proxy, self.proxyauth = oldproxy
            try:
                tries, response = self.follow_redirections(response)
            except httplib.BadStatusLine:
                # some servers send empty HEAD replies
                if self.method == "HEAD":
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                raise
            if tries == -1:
                log.debug(LOG_CHECK, "already handled")
                response.close()
                return None
            if tries >= self.max_redirects:
                if self.method == "HEAD":
                    # Microsoft servers tend to recurse HEAD requests
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
                self.set_result(_("more than %d redirections, aborting") %
                                self.max_redirects, valid=False)
                return response
            # user authentication
            if response.status == 401:
                if not self.auth:
                    import base64
                    _user, _password = self.get_user_password()
                    # strip the trailing newline that encodestring() appends
                    self.auth = "Basic " + \
                        base64.encodestring("%s:%s" %
                                            (_user, _password)).strip()
                    log.debug(LOG_CHECK,
                        "Authentication %s/%s", _user, _password)
                    continue
            elif response.status >= 400:
                if self.headers and self.urlparts[4] and not self.no_anchor:
                    self.no_anchor = True
                    continue
                # retry with GET (but do not set fallback flag)
                if self.method == "HEAD":
                    self.method = "GET"
                    self.aliases = []
                    continue
            elif self.headers and self.method == "HEAD":
                # test for HEAD support
                mime = headers.get_content_type(self.headers)
                poweredby = self.headers.get('X-Powered-By', '')
                server = self.headers.get('Server', '')
                if mime in ('application/octet-stream', 'text/plain') and \
                   (poweredby.startswith('Zope') or server.startswith('Zope')):
                    # Zope servers cannot determine the Content-Type with HEAD
                    self.method = "GET"
                    self.aliases = []
                    self.fallback_get = True
                    continue
            break
        return response

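    # The HEAD-to-GET fallback above is the standard workaround for servers
    # that answer HEAD with an empty or malformed status line. A minimal,
    # self-contained sketch of the same pattern against the stdlib httplib
    # (host and path are hypothetical):
    #
    #     import httplib
    #     def fetch_status (host, path="/"):
    #         for method in ("HEAD", "GET"):
    #             conn = httplib.HTTPConnection(host)
    #             try:
    #                 conn.request(method, path)
    #                 return conn.getresponse().status
    #             except httplib.BadStatusLine:
    #                 conn.close()  # empty HEAD reply; retry with GET
    #         return None
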
    def follow_redirections (self, response, set_result=True):
        """
        Follow all redirections of http response.
        """
        log.debug(LOG_CHECK, "follow all redirections")
        redirected = self.url
        tries = 0
        while response.status in [301, 302] and self.headers and \
              tries < self.max_redirects:
            newurl = self.headers.getheader("Location",
                         self.headers.getheader("Uri", ""))
            # make new url absolute and unicode
            newurl = urlparse.urljoin(redirected, newurl)
            newurl = unicode_safe(newurl)
            log.debug(LOG_CHECK, "Redirected to %r", newurl)
            self.add_info(_("Redirected to `%(url)s'.") % {'url': newurl})
            # norm base url - can raise UnicodeError from url.idna_encode()
            redirected, is_idn = urlbase.url_norm(newurl)
            if is_idn:
                pass # XXX warn about idn use
            log.debug(LOG_CHECK, "Norm redirected to %r", redirected)
            urlparts = strformat.url_unicode_split(redirected)
            # check extern filter again
            self.set_extern(redirected)
            if self.extern[0] and self.extern[1]:
                if set_result:
                    self.add_info(
                        _("The redirected URL is outside of the domain "
                          "filter, checked only syntax."))
                    self.set_result(u"filtered")
                return -1, response
            # check robots.txt allowance again
            if not self.allows_robots(redirected):
                if set_result:
                    self.add_warning(
                        _("Access to redirected URL denied by robots.txt, "
                          "checked only syntax."),
                        tag=WARN_HTTP_ROBOTS_DENIED)
                    self.set_result(u"syntax OK")
                return -1, response
            # see about recursive redirect
            all_seen = [self.cache_url_key] + self.aliases
            if redirected in all_seen:
                if self.method == "HEAD":
                    # Microsoft servers tend to recurse HEAD requests
                    # fall back to the original url and use GET
                    return self.max_redirects, response
                recursion = all_seen + [redirected]
                if set_result:
                    self.set_result(
                        _("recursive redirection encountered:\n %(urls)s") %
                        {"urls": "\n => ".join(recursion)}, valid=False)
                return -1, response
            if urlparts[0] == self.scheme:
                # remember the redirected url as an alias
                self.aliases.append(redirected)
            # note: urlparts has to be a list
            self.urlparts = urlparts
            if response.status == 301:
                if not self.has301status:
                    if set_result:
                        self.add_warning(
                            _("HTTP 301 (moved permanently) encountered: you"
                              " should update this link."),
                            tag=WARN_HTTP_MOVED_PERMANENT)
                    self.has301status = True
            # check cache again on the changed URL
            if self.aggregate.urlqueue.checked_redirect(redirected, self):
                return -1, response
            # in case of changed scheme make new URL object
            if self.urlparts[0] != self.scheme:
                if set_result:
                    self.add_warning(
                        _("Redirection to different URL type encountered; "
                          "the original URL was `%(url)s'.") %
                        {"url": self.url},
                        tag=WARN_HTTP_WRONG_REDIRECT)
                newobj = get_url_from(
                    redirected, self.recursion_level, self.aggregate,
                    parent_url=self.parent_url, base_ref=self.base_ref,
                    line=self.line, column=self.column, name=self.name)
                # append new object to queue
                self.aggregate.urlqueue.put(newobj)
                # pretend to be finished and logged
                return -1, response
            # get new response data
            response.close()
            response = self._get_http_response()
            tries += 1
        return tries, response

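    # Redirect loops are detected above by keeping every URL seen in the
    # current redirection chain and refusing to visit one twice. The core
    # idea in isolation (a hedged sketch; follow_once() stands in for one
    # HTTP round trip and is hypothetical):
    #
    #     def follow (url, max_redirects=5):
    #         seen = [url]
    #         for _ in range(max_redirects):
    #             target = follow_once(url)   # Location header, or None
    #             if target is None:
    #                 return url              # no more redirections
    #             if target in seen:
    #                 raise ValueError("recursive redirection: %r" % seen)
    #             seen.append(target)
    #             url = target
    #         raise ValueError("too many redirections")
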
    def get_alias_cache_data (self):
        """
        Return all data values that should be put in the cache,
        minus redirection warnings.
        """
        data = self.get_cache_data()
        data["warnings"] = [
            x for x in self.warnings if x[0] != "http-moved-permanent"]
        data["info"] = [x for x in self.info if x[0] != "http-redirect"]
        return data

    def check_response (self, response):
        """
        Check final result and log it.
        """
        if response.status >= 400:
            self.set_result(u"%r %s" % (response.status, response.reason),
                            valid=False)
        else:
            if response.status == 204:
                # no content
                self.add_warning(unicode_safe(response.reason),
                                 tag=WARN_HTTP_EMPTY_CONTENT)
            # store cookies for valid links
            if self.aggregate.config['storecookies']:
                for c in self.cookies:
                    self.add_info(_("Store cookie: %(cookie)s.") %
                                  {"cookie": c})
                try:
                    out = self.aggregate.cookies.add(self.headers,
                                                     self.urlparts[0],
                                                     self.urlparts[1],
                                                     self.urlparts[2])
                    for h in out:
                        self.add_info(unicode_safe(h))
                except Cookie.CookieError, msg:
                    self.add_warning(_("Could not store cookies: %(msg)s.") %
                                     {'msg': str(msg)},
                                     tag=WARN_HTTP_COOKIE_STORE_ERROR)
            if response.status >= 200:
                self.set_result(u"%r %s" % (response.status, response.reason))
            else:
                self.set_result(u"OK")
        modified = self.headers.get('Last-Modified', '')
        if modified:
            self.add_info(_("Last modified %(date)s.") % {"date": modified})

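    # The Cookie.CookieError caught above comes from the stdlib Cookie
    # module, which rejects illegal cookie names when headers are parsed
    # or set. A minimal sketch (cookie values are hypothetical):
    #
    #     import Cookie
    #     c = Cookie.SimpleCookie()
    #     c.load("sessionid=abc123; Path=/")    # parses fine
    #     try:
    #         c["il legal"] = "x"               # space not allowed in a key
    #     except Cookie.CookieError:
    #         pass                              # the module rejects it
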
    def _get_http_response (self):
        """
        Send HTTP request and get response object.
        """
        if self.proxy:
            host = self.proxy
            scheme = "http"
        else:
            host = self.urlparts[1]
            scheme = self.urlparts[0]
        log.debug(LOG_CHECK, "Connecting to %r", host)
        # close/release a previous connection
        self.close_connection()
        self.url_connection = self.get_http_object(host, scheme)
        if self.no_anchor:
            anchor = ''
        else:
            anchor = self.urlparts[4]
        if self.proxy:
            path = urlparse.urlunsplit((self.urlparts[0], self.urlparts[1],
                self.urlparts[2], self.urlparts[3], anchor))
        else:
            path = urlparse.urlunsplit(('', '', self.urlparts[2],
                self.urlparts[3], anchor))
        self.url_connection.putrequest(self.method, path, skip_host=True,
                                       skip_accept_encoding=True)
        self.url_connection.putheader("Host", host)
        # userinfo is from http://user:pass@host/
        if self.userinfo:
            self.url_connection.putheader("Authorization", self.userinfo)
        # auth is from the -u and -p configuration options
        elif self.auth:
            self.url_connection.putheader("Authorization", self.auth)
        if self.proxyauth:
            self.url_connection.putheader("Proxy-Authorization",
                self.proxyauth)
        if (self.parent_url and
            self.parent_url.startswith(('http://', 'https://'))):
            self.url_connection.putheader("Referer", self.parent_url)
        self.url_connection.putheader("User-Agent", configuration.UserAgent)
        self.url_connection.putheader("Accept-Encoding",
                                 "gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
        if self.aggregate.config['sendcookies']:
            scheme = self.urlparts[0]
            host = self.urlparts[1]
            port = urlutil.default_ports.get(scheme, 80)
            host, port = urllib.splitnport(host, port)
            path = self.urlparts[2]
            self.cookies = self.aggregate.cookies.get(scheme, host, port, path)
            for c in self.cookies:
                name = c.client_header_name()
                value = c.client_header_value()
                self.url_connection.putheader(name, value)
        self.url_connection.endheaders()
        response = self.url_connection.getresponse()
        self.timeout = headers.http_timeout(response)
        self.headers = response.msg
        self.persistent = not response.will_close
        if self.persistent and self.method == "HEAD":
            # Some servers send page content after a HEAD request,
            # but only after making the *next* request. This breaks
            # protocol synchronisation. Workaround here is to close
            # the connection after HEAD.
            # Example: http://www.empleo.gob.mx (Apache/1.3.33 (Unix) mod_jk)
            self.persistent = False
        if self.persistent and (self.method == "GET" or
             self.headers.getheader("Content-Length") != "0"):
            # always read content from persistent connections
            self._read_content(response)
        # If possible, use the official W3C HTTP response name
        if response.status in httpresponses:
            response.reason = httpresponses[response.status]
        return response

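    # The request above is assembled header by header with httplib's
    # low-level interface instead of the one-shot request() call, because
    # Host and Accept-Encoding must be controlled explicitly. The same
    # pattern against the stdlib httplib (host and path hypothetical):
    #
    #     import httplib
    #     conn = httplib.HTTPConnection("example.com")
    #     conn.putrequest("HEAD", "/", skip_host=True,
    #                     skip_accept_encoding=True)
    #     conn.putheader("Host", "example.com")
    #     conn.putheader("User-Agent", "LinkChecker")
    #     conn.putheader("Accept-Encoding", "gzip;q=1.0, identity;q=0.5")
    #     conn.endheaders()
    #     response = conn.getresponse()
    #     print response.status, response.reason
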
    def get_http_object (self, host, scheme):
        """
        Open a HTTP connection.

        @param host: the host to connect to
        @type host: string of the form <host>[:<port>]
        @param scheme: 'http' or 'https'
        @type scheme: string
        @return: open HTTP(S) connection
        @rtype: httplib.HTTP(S)Connection
        """
        _user, _password = self.get_user_password()
        key = (scheme, self.urlparts[1], _user, _password)
        conn = self.aggregate.connections.get(key)
        if conn is not None:
            log.debug(LOG_CHECK, "reuse cached HTTP(S) connection %s", conn)
            return conn
        self.aggregate.connections.wait_for_host(host)
        if scheme == "http":
            h = httplib.HTTPConnection(host)
        elif scheme == "https" and supportHttps:
            h = httplib.HTTPSConnection(host)
        else:
            msg = _("Unsupported HTTP url scheme `%(scheme)s'") % \
                {"scheme": scheme}
            raise LinkCheckerError(msg)
        if log.is_debug(LOG_CHECK):
            h.set_debuglevel(1)
        h.connect()
        return h

    def get_content (self):
        """
        Get content of the URL target. The content data is cached after
        the first call to this method.

        @return: URL content, decompressed and decoded
        @rtype: string
        """
        if self.data is None:
            self.method = "GET"
            response = self._get_http_response()
            response = self.follow_redirections(response, set_result=False)[1]
            self.headers = response.msg
            self._read_content(response)
            if self.data is None:
                self.data = ""
        return self.data

    def _read_content (self, response):
        t = time.time()
        data = response.read()
        encoding = headers.get_content_encoding(self.headers)
        if encoding in _supported_encodings:
            try:
                if encoding == 'deflate':
                    f = StringIO(zlib.decompress(data))
                else:
                    f = gzip.GzipFile('', 'rb', 9, StringIO(data))
            except zlib.error, msg:
                self.add_warning(_("Decompress error %(err)s") %
                                 {"err": str(msg)},
                                 tag=WARN_HTTP_DECOMPRESS_ERROR)
                f = StringIO(data)
            data = f.read()
        if self.data is None and self.method == "GET" and \
           response.status not in [301, 302]:
            self.data = data
        self.dltime = time.time() - t

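    # Content-Encoding handling in a nutshell: "deflate" bodies are a raw
    # zlib stream while "gzip"/"x-gzip" bodies carry a gzip wrapper, so
    # the two need different stdlib entry points. A minimal sketch
    # (compressed stands in for a hypothetical response body):
    #
    #     import zlib, gzip
    #     from cStringIO import StringIO
    #     body = zlib.decompress(compressed)                         # deflate
    #     body = gzip.GzipFile(fileobj=StringIO(compressed)).read()  # gzip
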
    def encoding_supported (self):
        """Check if page encoding is supported."""
        encoding = headers.get_content_encoding(self.headers)
        if encoding and encoding not in _supported_encodings and \
           encoding != 'identity':
            self.add_warning(_("Unsupported content encoding `%(encoding)s'.")
                             % {"encoding": encoding},
                             tag=WARN_HTTP_UNSUPPORTED_ENCODING)
            return False
        return True

    def is_html (self):
        """
        See if this URL points to an HTML file by looking at the
        Content-Type header, file extension and file content.

        @return: True if URL points to an HTML file
        @rtype: bool
        """
        if not (self.valid and self.headers):
            return False
        if headers.get_content_type(self.headers) not in HTML_MIMETYPES:
            return False
        return self.encoding_supported()

    def is_css (self):
        """Return True iff the content of this url is a CSS stylesheet."""
        if not (self.valid and self.headers):
            return False
        if headers.get_content_type(self.headers) != "text/css":
            return False
        return self.encoding_supported()

    def is_http (self):
        """
        This is an HTTP URL.

        @return: True
        @rtype: bool
        """
        return True

    def is_parseable (self):
        """
        Check if content is parseable for recursion.

        @return: True if content is parseable
        @rtype: bool
        """
        if not (self.valid and self.headers):
            return False
        if headers.get_content_type(self.headers) not in PARSE_MIMETYPES:
            return False
        return self.encoding_supported()

    def parse_url (self):
        """
        Parse file contents for new links to check.
        """
        ctype = headers.get_content_type(self.headers)
        if self.is_html():
            self.parse_html()
        elif self.is_css():
            self.parse_css()
        elif ctype == "application/x-shockwave-flash":
            self.parse_swf()

    def get_robots_txt_url (self):
        """
        Get the corresponding robots.txt URL for this URL.

        @return: robots.txt URL
        @rtype: string
        """
        return "%s://%s/robots.txt" % tuple(self.urlparts[0:2])

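    # Since urlparts is a urlsplit-style tuple, only the scheme and the
    # network location survive; path, query and fragment are dropped.
    # Hypothetical example:
    #
    #     "http://example.com/some/page?x=1#top"
    #         -> "http://example.com/robots.txt"
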
    def close_connection (self):
        """
        If the connection is persistent, add it to the connection pool.
        Else close the connection. Errors on closing are ignored.
        """
        if self.url_connection is None:
            # no connection is open
            return
        # add to cached connections
        _user, _password = self.get_user_password()
        key = ("http", self.urlparts[1], _user, _password)
        if self.persistent and self.url_connection.is_idle():
            self.aggregate.connections.add(
                key, self.url_connection, self.timeout)
        else:
            try:
                self.url_connection.close()
            except Exception:
                # ignore close errors
                pass
        self.url_connection = None