Remove content cache.

This commit is contained in:
Bastian Kleineidam 2013-12-10 23:41:52 +01:00
parent a7c1cdd6f6
commit 0ca63797bf
5 changed files with 4 additions and 78 deletions

View file

@ -1,71 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Cache for content checksums.
"""
import hashlib
from ..lock import get_lock
from ..decorators import synchronized
_lock = get_lock("checksums")
class ChecksumInfo(object):
"""Cache for content checksums."""
def __init__(self):
"""Initialize checksums and cache statistics."""
# {hash -> [URL]}
self.checksums = {}
self.misses = self.hits = 0
def get_checksum_urls(self, url, checksum):
"""Look for and store checksum for URL.
@param url: the URL for the checksum
@ptype url: unicode
@param checksum: the URL content checksum
@ptype checksum: str
@return: list of URLs matching the given checksum (except the given URL)
@rtype: list of unicode
"""
if checksum in self.checksums:
self.hits += 1
urls = self.checksums[checksum]
if url in urls:
res = [x for x in urls if x != url]
else:
res = urls[:]
urls.append(url)
else:
self.misses += 1
res = []
self.checksums[checksum] = [url]
return res
_checksuminfo = ChecksumInfo()
@synchronized(_lock)
def get_checksum_urls(url, content):
"""See if given URL content is already stored under another URL.
@param url: the URL for which the content is valid
@ptype url: unicode
@param content: the content to hash
@ptype content: str
@return: list of URLs with the same content (except the given URL)
@rtype: list of unicode"""
checksum = hashlib.sha1(content).hexdigest()
return _checksuminfo.get_checksum_urls(url, checksum)

View file

@ -217,7 +217,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledco
buf = StringIO()
def stor_data (s):
"""Helper method storing given data"""
urls = self.aggregate.add_download_data(self.cache_content_key, s)
self.aggregate.add_download_data(self.cache_content_key, s)
# limit the download size
if (buf.tell() + len(s)) > self.MaxFilesizeBytes:
raise LinkCheckerError(_("FTP file size too large"))

View file

@ -678,7 +678,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport, pooledc
if len(data) > self.MaxFilesizeBytes:
raise LinkCheckerError(_("File size too large"))
dlsize = len(data)
urls = self.aggregate.add_download_data(self.cache_content_key, data)
self.aggregate.add_download_data(self.cache_content_key, data)
encoding = headers.get_content_encoding(self.headers)
if encoding in SUPPORTED_ENCODINGS:
try:

View file

@ -761,7 +761,7 @@ class UrlBase (object):
if len(data) > self.MaxFilesizeBytes:
raise LinkCheckerError(_("File size too large"))
if not self.is_local():
urls = self.aggregate.add_download_data(self.cache_content_key, data)
self.aggregate.add_download_data(self.cache_content_key, data)
return data, len(data)
def check_content (self):

View file

@ -21,7 +21,7 @@ import time
import threading
from .. import log, LOG_CHECK, strformat
from ..decorators import synchronized
from ..cache import urlqueue, content
from ..cache import urlqueue
from . import logger, status, checker, cleanup
@ -141,11 +141,8 @@ class Aggregate (object):
@ptype url: unicode
@param data: downloaded data
@ptype data: string
@return: URLs with duplicate contents
@rtype: list of unicode
"""
self.downloaded_bytes += len(data)
return content.get_checksum_urls(url, data)
def gather_statistics(self):
"""Gather download and cache statistics and send them to the