Remove addrinfo cache.

This commit is contained in:
Bastian Kleineidam 2012-10-10 10:54:58 +02:00
parent 20be0f2519
commit e1e80b7dd5
7 changed files with 20 additions and 155 deletions

View file

@ -1,80 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2006-2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Cache for DNS lookups.
"""
import socket
import sys
from .. import LinkCheckerError
from ..lock import get_lock
from ..containers import LFUCache
from ..decorators import synchronized
# Lock registered under the name "addrinfo"; the module-level getaddrinfo()
# and getstats() functions below are decorated with it.
_lock = get_lock("addrinfo")
class AddrInfo(object):
    """Cache of socket.getaddrinfo() results, keyed by host and port."""

    def __init__(self):
        """Create an empty cache and zero the hit/miss counters."""
        self.addrinfos = LFUCache(size=100)
        self.hits = 0
        self.misses = 0

    def getaddrinfo(self, host, port):
        """Return address information for streaming sockets (SOCK_STREAM).

        The result of the first lookup for a host/port pair is stored and
        reused by later calls. Failed lookups are stored as well: the
        exception object is cached and raised again on every later call
        with the same host and port.

        @param host: host name to resolve (byte string or unicode)
        @param port: port number or service name
        @return: the value returned by socket.getaddrinfo()
        @raise socket.error: if the lookup failed
        @raise LinkCheckerError: if the host name could not be parsed
        """
        key = u"%s:%s" % (unicode(host), unicode(port))
        if key not in self.addrinfos:
            self.misses += 1
            # Hand pure-ASCII unicode names to the resolver as byte strings;
            # names that cannot be encoded as ASCII are passed on unchanged.
            if isinstance(host, unicode):
                try:
                    host = host.encode('ascii')
                except UnicodeEncodeError:
                    pass
            try:
                value = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            except socket.error as err:
                # Keep the error itself as the cached value.
                value = err
            except UnicodeError as err:
                details = {"host": host, "msg": str(err)}
                value = LinkCheckerError(
                    _("could not parse host %(host)r: %(msg)s") % details)
            self.addrinfos[key] = value
        else:
            self.hits += 1
            value = self.addrinfos[key]
        # Cached failures are replayed as exceptions.
        if isinstance(value, Exception):
            raise value
        return value
# Single module-level cache instance used by the module-level
# getaddrinfo() and getstats() functions.
_addrinfo = AddrInfo()
@synchronized(_lock)
def getaddrinfo(host, port):
    """Look up SOCK_STREAM address information for a host/port pair.

    The call is forwarded to the module-level AddrInfo instance, so
    information that is already cached is reused.
    """
    cache = _addrinfo
    return cache.getaddrinfo(host, port)
@synchronized(_lock)
def getstats():
    """Report the hit and miss counters of the module-level cache.

    @return: hits and misses
    @rtype: tuple(int, int)
    """
    hits = _addrinfo.hits
    misses = _addrinfo.misses
    return hits, misses

View file

@ -526,8 +526,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
self.get_http_object(scheme, host, port)
self.add_connection_request()
self.add_connection_headers()
buffering = True
response = self.url_connection.getresponse(buffering)
response = self.url_connection.getresponse(buffering=True)
self.headers = response.msg
self.content_type = None
self.persistent = not response.will_close

View file

@ -21,7 +21,7 @@ import time
import threading
from .. import log, LOG_CHECK
from ..decorators import synchronized
from ..cache import urlqueue, addrinfo, content
from ..cache import urlqueue, content
from . import logger, status, checker, cleanup
@ -145,7 +145,5 @@ class Aggregate (object):
logger.
"""
robots_txt_stats = self.robots_txt.hits, self.robots_txt.misses
addrinfo_stats = addrinfo.getstats()
download_stats = self.downloaded_bytes
self.logger.add_statistics(robots_txt_stats, addrinfo_stats,
download_stats)
self.logger.add_statistics(robots_txt_stats, download_stats)

View file

@ -46,10 +46,10 @@ class Logger (object):
for logger in self.loggers:
logger.end_output()
def add_statistics(self, robots_txt_stats, addrinfo_stats, download_stats):
def add_statistics(self, robots_txt_stats, download_stats):
"""Add statistics to logger."""
for logger in self.loggers:
logger.add_statistics(robots_txt_stats, addrinfo_stats, download_stats)
logger.add_statistics(robots_txt_stats, download_stats)
def do_print (self, url_data):
"""Determine if URL entry should be logged or not."""

View file

@ -81,7 +81,6 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import cache.addrinfo
__all__ = ["HTTP", "HTTPResponse", "HTTPConnection",
"HTTPException", "NotConnected", "UnknownProtocol",
@ -366,7 +365,9 @@ class HTTPResponse:
def _read_status(self):
# Initialize with Simple-Response defaults
line = self.fp.readline()
line = self.fp.readline(_MAXLINE + 1)
if len(line) > _MAXLINE:
raise LineTooLong("header line")
if self.debuglevel > 0:
print "reply:", repr(line)
if not line:
@ -683,8 +684,9 @@ class HTTPConnection:
strict = 0
def __init__(self, host, port=None, strict=None,
timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
self.timeout = timeout
self.source_address = source_address
self.sock = None
self._buffer = []
self.__response = None
@ -763,26 +765,12 @@ class HTTPConnection:
if line == '\r\n':
break
def connect(self):
"""Connect to the host and port specified in __init__."""
msg = "getaddrinfo returns an empty list"
for res in cache.addrinfo.getaddrinfo(self.host, self.port):
af, socktype, proto, canonname, sa = res
try:
self.sock = socket.socket(af, socktype, proto)
if self.debuglevel > 0:
print "connect: (%s, %s)" % (self.host, self.port)
self.sock.connect(sa)
except socket.error, msg:
if self.debuglevel > 0:
print 'connect fail:', (self.host, self.port), msg
if self.sock is not None:
self.sock.close()
self.sock = None
continue
break
if not self.sock:
raise socket.error, msg
self.sock = socket.create_connection((self.host,self.port),
self.timeout, self.source_address)
if self._tunnel_host:
self._tunnel()
@ -1161,7 +1149,7 @@ class HTTP:
### should we keep this behavior? do people use it?
# keep the socket open (as a file), and return it
self.file = self._conn.sock.makefile('rb')
self.file = self._conn.sock.makefile('rb', 0)
# close our socket -- we want to restart after any protocol error
self.close()
@ -1183,7 +1171,6 @@ class HTTP:
### do it
self.file = None
try:
import ssl
except ImportError:
@ -1196,8 +1183,9 @@ else:
def __init__(self, host, port=None, key_file=None, cert_file=None,
strict=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
ca_certs=None):
HTTPConnection.__init__(self, host, port, strict, timeout)
source_address=None, ca_certs=None):
HTTPConnection.__init__(self, host, port, strict, timeout,
source_address)
self.key_file = key_file
self.cert_file = cert_file
self.ca_certs = ca_certs
@ -1209,7 +1197,8 @@ else:
def connect(self):
"Connect to a host on a given (SSL) port."
sock = socket.create_connection((self.host, self.port), self.timeout)
sock = socket.create_connection((self.host, self.port),
self.timeout, self.source_address)
if self._tunnel_host:
self.sock = sock
self._tunnel()

View file

@ -249,8 +249,6 @@ class TextLogger (Logger):
self.writeln(_("Downloaded: %s") % strformat.strsize(self.stats.downloaded_bytes))
hitsmisses = strformat.str_cache_stats(*self.stats.robots_txt_stats)
self.writeln(_("Robots.txt cache: %s") % hitsmisses)
hitsmisses = strformat.str_cache_stats(*self.stats.addrinfo_stats)
self.writeln(_("DNS cache: %s") % hitsmisses)
if len(self.stats.domains) > 1:
self.writeln(_("Number of domains: %d") % len(self.stats.domains))
if self.stats.number > 0:

View file

@ -1,39 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test address info caching.
"""
import unittest
import socket
from linkcheck import LinkCheckerError
from linkcheck.cache.addrinfo import getaddrinfo
class TestAddrinfoCache (unittest.TestCase):
    """Check cached address lookups for host names longer than 63 chars."""

    def test_addrinfo_cache1 (self):
        # A pure ASCII host name with more than 63 characters must be
        # reported as socket.error, not as UnicodeEncodeError.
        long_ascii_host = u"a"*64
        self.assertRaises(socket.error, getaddrinfo, long_ascii_host, 80)

    def test_addrinfo_cache2 (self):
        # A non-ASCII host name with more than 63 characters must be
        # reported as LinkCheckerError.
        long_nonascii_host = u"ä"*64
        self.assertRaises(LinkCheckerError, getaddrinfo,
                          long_nonascii_host, 80)